Diffstat (limited to 'llvm/lib')
527 files changed, 14248 insertions, 6936 deletions
diff --git a/llvm/lib/Analysis/CallPrinter.cpp b/llvm/lib/Analysis/CallPrinter.cpp index 672dae1..99d8b11 100644 --- a/llvm/lib/Analysis/CallPrinter.cpp +++ b/llvm/lib/Analysis/CallPrinter.cpp @@ -70,7 +70,7 @@ public: for (Function &F : M->getFunctionList()) { uint64_t localSumFreq = 0; - SmallSet<Function *, 16> Callers; + SmallPtrSet<Function *, 16> Callers; for (User *U : F.users()) if (isa<CallInst>(U)) Callers.insert(cast<Instruction>(U)->getFunction()); @@ -99,7 +99,7 @@ private: bool FoundParallelEdge = true; while (FoundParallelEdge) { - SmallSet<Function *, 16> Visited; + SmallPtrSet<Function *, 16> Visited; FoundParallelEdge = false; for (auto CI = Node->begin(), CE = Node->end(); CI != CE; CI++) { if (!(Visited.insert(CI->second->getFunction())).second) { diff --git a/llvm/lib/Analysis/CaptureTracking.cpp b/llvm/lib/Analysis/CaptureTracking.cpp index 076f417..b6acda3 100644 --- a/llvm/lib/Analysis/CaptureTracking.cpp +++ b/llvm/lib/Analysis/CaptureTracking.cpp @@ -359,6 +359,12 @@ UseCaptureInfo llvm::DetermineUseCaptureKind(const Use &U, const Value *Base) { case Instruction::AddrSpaceCast: // The original value is not captured via this if the new value isn't. return UseCaptureInfo::passthrough(); + case Instruction::PtrToAddr: + // We treat ptrtoaddr as a location-independent capture of the address even + // if it is ultimately not used. Continuing recursive analysis after + // ptrtoaddr would be possible, but we'd need logic to do that correctly, + // which is not the same as the current pointer following logic. + return CaptureComponents::Address; case Instruction::ICmp: { unsigned Idx = U.getOperandNo(); unsigned OtherIdx = 1 - Idx; @@ -399,7 +405,7 @@ void llvm::PointerMayBeCaptured(const Value *V, CaptureTracker *Tracker, SmallVector<const Use *, 20> Worklist; Worklist.reserve(getDefaultMaxUsesToExploreForCaptureTracking()); - SmallSet<const Use *, 20> Visited; + SmallPtrSet<const Use *, 20> Visited; auto AddUses = [&](const Value *V) { for (const Use &U : V->uses()) { diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp index dd98b62..f44937a 100644 --- a/llvm/lib/Analysis/ConstantFolding.cpp +++ b/llvm/lib/Analysis/ConstantFolding.cpp @@ -1485,6 +1485,9 @@ Constant *llvm::ConstantFoldCastOperand(unsigned Opcode, Constant *C, switch (Opcode) { default: llvm_unreachable("Missing case"); + case Instruction::PtrToAddr: + // TODO: Add some of the ptrtoint folds here as well. 
+ break; case Instruction::PtrToInt: if (auto *CE = dyn_cast<ConstantExpr>(C)) { Constant *FoldedValue = nullptr; @@ -1659,6 +1662,7 @@ bool llvm::canConstantFoldCallTo(const CallBase *Call, const Function *F) { case Intrinsic::aarch64_sve_convert_from_svbool: case Intrinsic::wasm_alltrue: case Intrinsic::wasm_anytrue: + case Intrinsic::wasm_dot: // WebAssembly float semantics are always known case Intrinsic::wasm_trunc_signed: case Intrinsic::wasm_trunc_unsigned: @@ -3989,6 +3993,30 @@ static Constant *ConstantFoldFixedVectorCall( } return ConstantVector::get(Result); } + case Intrinsic::wasm_dot: { + unsigned NumElements = + cast<FixedVectorType>(Operands[0]->getType())->getNumElements(); + + assert(NumElements == 8 && Result.size() == 4 && + "wasm dot takes i16x8 and produces i32x4"); + assert(Ty->isIntegerTy()); + int32_t MulVector[8]; + + for (unsigned I = 0; I < NumElements; ++I) { + ConstantInt *Elt0 = + cast<ConstantInt>(Operands[0]->getAggregateElement(I)); + ConstantInt *Elt1 = + cast<ConstantInt>(Operands[1]->getAggregateElement(I)); + + MulVector[I] = Elt0->getSExtValue() * Elt1->getSExtValue(); + } + for (unsigned I = 0; I < Result.size(); I++) { + int64_t IAdd = (int64_t)MulVector[I * 2] + (int64_t)MulVector[I * 2 + 1]; + Result[I] = ConstantInt::get(Ty, IAdd); + } + + return ConstantVector::get(Result); + } default: break; } diff --git a/llvm/lib/Analysis/DXILResource.cpp b/llvm/lib/Analysis/DXILResource.cpp index 629fa7cd..3a70666 100644 --- a/llvm/lib/Analysis/DXILResource.cpp +++ b/llvm/lib/Analysis/DXILResource.cpp @@ -20,6 +20,7 @@ #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/InitializePasses.h" +#include "llvm/Support/DXILABI.h" #include "llvm/Support/FormatVariadic.h" #include <cstdint> #include <optional> @@ -29,20 +30,6 @@ using namespace llvm; using namespace dxil; -static StringRef getResourceClassName(ResourceClass RC) { - switch (RC) { - case ResourceClass::SRV: - return "SRV"; - case ResourceClass::UAV: - return "UAV"; - case ResourceClass::CBuffer: - return "CBuffer"; - case ResourceClass::Sampler: - return "Sampler"; - } - llvm_unreachable("Unhandled ResourceClass"); -} - static StringRef getResourceKindName(ResourceKind RK) { switch (RK) { case ResourceKind::Texture1D: @@ -612,7 +599,12 @@ void ResourceTypeInfo::print(raw_ostream &OS, const DataLayout &DL) const { GlobalVariable *ResourceInfo::createSymbol(Module &M, StructType *Ty) { assert(!Symbol && "Symbol has already been created"); - Symbol = new GlobalVariable(M, Ty, /*isConstant=*/true, + Type *ResTy = Ty; + int64_t Size = Binding.Size; + if (Size != 1) + // unbounded arrays are represented as zero-sized arrays in LLVM IR + ResTy = ArrayType::get(Ty, Size == ~0u ? 
0 : Size); + Symbol = new GlobalVariable(M, ResTy, /*isConstant=*/true, GlobalValue::ExternalLinkage, /*Initializer=*/nullptr, Name); return Symbol; diff --git a/llvm/lib/Analysis/Delinearization.cpp b/llvm/lib/Analysis/Delinearization.cpp index 329bd35..761c566 100644 --- a/llvm/lib/Analysis/Delinearization.cpp +++ b/llvm/lib/Analysis/Delinearization.cpp @@ -24,6 +24,7 @@ #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/PassManager.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -32,6 +33,11 @@ using namespace llvm; #define DL_NAME "delinearize" #define DEBUG_TYPE DL_NAME +static cl::opt<bool> UseFixedSizeArrayHeuristic( + "delinearize-use-fixed-size-array-heuristic", cl::init(false), cl::Hidden, + cl::desc("When printing analysis, use the heuristic for fixed-size arrays " + "if the default delinearizetion fails.")); + // Return true when S contains at least an undef value. static inline bool containsUndefs(const SCEV *S) { return SCEVExprContains(S, [](const SCEV *S) { @@ -480,6 +486,184 @@ void llvm::delinearize(ScalarEvolution &SE, const SCEV *Expr, }); } +static std::optional<APInt> tryIntoAPInt(const SCEV *S) { + if (const auto *Const = dyn_cast<SCEVConstant>(S)) + return Const->getAPInt(); + return std::nullopt; +} + +/// Collects the absolute values of constant steps for all induction variables. +/// Returns true if we can prove that all step recurrences are constants and \p +/// Expr is divisible by \p ElementSize. Each step recurrence is stored in \p +/// Steps after divided by \p ElementSize. +static bool collectConstantAbsSteps(ScalarEvolution &SE, const SCEV *Expr, + SmallVectorImpl<uint64_t> &Steps, + uint64_t ElementSize) { + // End of recursion. The constant value also must be a multiple of + // ElementSize. + if (const auto *Const = dyn_cast<SCEVConstant>(Expr)) { + const uint64_t Mod = Const->getAPInt().urem(ElementSize); + return Mod == 0; + } + + const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Expr); + if (!AR || !AR->isAffine()) + return false; + + const SCEV *Step = AR->getStepRecurrence(SE); + std::optional<APInt> StepAPInt = tryIntoAPInt(Step); + if (!StepAPInt) + return false; + + APInt Q; + uint64_t R; + APInt::udivrem(StepAPInt->abs(), ElementSize, Q, R); + if (R != 0) + return false; + + // Bail out when the step is too large. + std::optional<uint64_t> StepVal = Q.tryZExtValue(); + if (!StepVal) + return false; + + Steps.push_back(*StepVal); + return collectConstantAbsSteps(SE, AR->getStart(), Steps, ElementSize); +} + +bool llvm::findFixedSizeArrayDimensions(ScalarEvolution &SE, const SCEV *Expr, + SmallVectorImpl<uint64_t> &Sizes, + const SCEV *ElementSize) { + if (!ElementSize) + return false; + + std::optional<APInt> ElementSizeAPInt = tryIntoAPInt(ElementSize); + if (!ElementSizeAPInt || *ElementSizeAPInt == 0) + return false; + + std::optional<uint64_t> ElementSizeConst = ElementSizeAPInt->tryZExtValue(); + + // Early exit when ElementSize is not a positive constant. + if (!ElementSizeConst) + return false; + + if (!collectConstantAbsSteps(SE, Expr, Sizes, *ElementSizeConst) || + Sizes.empty()) { + Sizes.clear(); + return false; + } + + // At this point, Sizes contains the absolute step recurrences for all + // induction variables. Each step recurrence must be a multiple of the size of + // the array element. 
Assuming that the each value represents the size of an + // array for each dimension, attempts to restore the length of each dimension + // by dividing the step recurrence by the next smaller value. For example, if + // we have the following AddRec SCEV: + // + // AddRec: {{{0,+,2048}<%for.i>,+,256}<%for.j>,+,8}<%for.k> (ElementSize=8) + // + // Then Sizes will become [256, 32, 1] after sorted. We don't know the size of + // the outermost dimension, the next dimension will be computed as 256 / 32 = + // 8, and the last dimension will be computed as 32 / 1 = 32. Thus it results + // in like Arr[UnknownSize][8][32] with elements of size 8 bytes, where Arr is + // a base pointer. + // + // TODO: Catch more cases, e.g., when a step recurrence is not divisible by + // the next smaller one, like A[i][3*j]. + llvm::sort(Sizes.rbegin(), Sizes.rend()); + Sizes.erase(llvm::unique(Sizes), Sizes.end()); + + // The last element in Sizes should be ElementSize. At this point, all values + // in Sizes are assumed to be divided by ElementSize, so replace it with 1. + assert(Sizes.back() != 0 && "Unexpected zero size in Sizes."); + Sizes.back() = 1; + + for (unsigned I = 0; I + 1 < Sizes.size(); I++) { + uint64_t PrevSize = Sizes[I + 1]; + if (Sizes[I] % PrevSize) { + Sizes.clear(); + return false; + } + Sizes[I] /= PrevSize; + } + + // Finally, the last element in Sizes should be ElementSize. + Sizes.back() = *ElementSizeConst; + return true; +} + +/// Splits the SCEV into two vectors of SCEVs representing the subscripts and +/// sizes of an array access, assuming that the array is a fixed size array. +/// +/// E.g., if we have the code like as follows: +/// +/// double A[42][8][32]; +/// for i +/// for j +/// for k +/// use A[i][j][k] +/// +/// The access function will be represented as an AddRec SCEV like: +/// +/// AddRec: {{{0,+,2048}<%for.i>,+,256}<%for.j>,+,8}<%for.k> (ElementSize=8) +/// +/// Then findFixedSizeArrayDimensions infers the size of each dimension of the +/// array based on the fact that the value of the step recurrence is a multiple +/// of the size of the corresponding array element. In the above example, it +/// results in the following: +/// +/// CHECK: ArrayDecl[UnknownSize][8][32] with elements of 8 bytes. +/// +/// Finally each subscript will be computed as follows: +/// +/// CHECK: ArrayRef[{0,+,1}<%for.i>][{0,+,1}<%for.j>][{0,+,1}<%for.k>] +/// +/// Note that this function doesn't check the range of possible values for each +/// subscript, so the caller should perform additional boundary checks if +/// necessary. +/// +/// Also note that this function doesn't guarantee that the original array size +/// is restored "correctly". For example, in the following case: +/// +/// double A[42][4][64]; +/// double B[42][8][32]; +/// for i +/// for j +/// for k +/// use A[i][j][k] +/// use B[i][2*j][k] +/// +/// The access function for both accesses will be the same: +/// +/// AddRec: {{{0,+,2048}<%for.i>,+,512}<%for.j>,+,8}<%for.k> (ElementSize=8) +/// +/// The array sizes for both A and B will be computed as +/// ArrayDecl[UnknownSize][4][64], which matches for A, but not for B. +/// +/// TODO: At the moment, this function can handle only simple cases. For +/// example, we cannot handle a case where a step recurrence is not divisible +/// by the next smaller step recurrence, e.g., A[i][3*j]. 
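[Editor's aside, not part of the diff: a minimal standalone sketch of the dimension-restoration arithmetic described in the comments above, assuming the absolute step recurrences have already been divided by ElementSize. Plain uint64_t stands in for the SCEV/APInt machinery, and restoreDimensions is a hypothetical name, not an LLVM API.]

// Standalone re-derivation of the dimension arithmetic described above (not
// the LLVM implementation). Input: the absolute step recurrences, already in
// units of ElementSize and in any order; output: the inner dimension lengths.
// For the AddRec {{{0,+,2048},+,256},+,8} with ElementSize=8 the steps are
// {1, 32, 256}, and the recovered shape is [UnknownSize][8][32].
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

static bool restoreDimensions(std::vector<uint64_t> Steps,
                              std::vector<uint64_t> &Dims) {
  std::sort(Steps.rbegin(), Steps.rend()); // descending
  Steps.erase(std::unique(Steps.begin(), Steps.end()), Steps.end());
  Steps.back() = 1; // treat the smallest step as one element, as above
  for (size_t I = 0; I + 1 < Steps.size(); ++I) {
    if (Steps[I] % Steps[I + 1]) // e.g. A[i][3*j] is not handled
      return false;
    Dims.push_back(Steps[I] / Steps[I + 1]);
  }
  return true;
}

int main() {
  std::vector<uint64_t> Dims;
  if (restoreDimensions({1, 32, 256}, Dims))
    for (uint64_t D : Dims)
      std::cout << D << ' '; // prints: 8 32
}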
+bool llvm::delinearizeFixedSizeArray(ScalarEvolution &SE, const SCEV *Expr, + SmallVectorImpl<const SCEV *> &Subscripts, + SmallVectorImpl<const SCEV *> &Sizes, + const SCEV *ElementSize) { + + // First step: find the fixed array size. + SmallVector<uint64_t, 4> ConstSizes; + if (!findFixedSizeArrayDimensions(SE, Expr, ConstSizes, ElementSize)) { + Sizes.clear(); + return false; + } + + // Convert the constant size to SCEV. + for (uint64_t Size : ConstSizes) + Sizes.push_back(SE.getConstant(Expr->getType(), Size)); + + // Second step: compute the access functions for each subscript. + computeAccessFunctions(SE, Expr, Subscripts, Sizes); + + return !Subscripts.empty(); +} + bool llvm::getIndexExpressionsFromGEP(ScalarEvolution &SE, const GetElementPtrInst *GEP, SmallVectorImpl<const SCEV *> &Subscripts, @@ -586,9 +770,21 @@ void printDelinearization(raw_ostream &O, Function *F, LoopInfo *LI, O << "AccessFunction: " << *AccessFn << "\n"; SmallVector<const SCEV *, 3> Subscripts, Sizes; + + auto IsDelinearizationFailed = [&]() { + return Subscripts.size() == 0 || Sizes.size() == 0 || + Subscripts.size() != Sizes.size(); + }; + delinearize(*SE, AccessFn, Subscripts, Sizes, SE->getElementSize(&Inst)); - if (Subscripts.size() == 0 || Sizes.size() == 0 || - Subscripts.size() != Sizes.size()) { + if (UseFixedSizeArrayHeuristic && IsDelinearizationFailed()) { + Subscripts.clear(); + Sizes.clear(); + delinearizeFixedSizeArray(*SE, AccessFn, Subscripts, Sizes, + SE->getElementSize(&Inst)); + } + + if (IsDelinearizationFailed()) { O << "failed to delinearize\n"; continue; } diff --git a/llvm/lib/Analysis/DemandedBits.cpp b/llvm/lib/Analysis/DemandedBits.cpp index 6694d5c..e088175 100644 --- a/llvm/lib/Analysis/DemandedBits.cpp +++ b/llvm/lib/Analysis/DemandedBits.cpp @@ -76,6 +76,26 @@ void DemandedBits::determineLiveOperandBits( computeKnownBits(V2, Known2, DL, &AC, UserI, &DT); } }; + auto GetShiftedRange = [&](uint64_t Min, uint64_t Max, bool ShiftLeft) { + auto ShiftF = [ShiftLeft](const APInt &Mask, unsigned ShiftAmnt) { + return ShiftLeft ? Mask.shl(ShiftAmnt) : Mask.lshr(ShiftAmnt); + }; + AB = APInt::getZero(BitWidth); + uint64_t LoopRange = Max - Min; + APInt Mask = AOut; + APInt Shifted = AOut; // AOut | (AOut << 1) | ... | (AOut << (ShiftAmnt - 1) + for (unsigned ShiftAmnt = 1; ShiftAmnt <= LoopRange; ShiftAmnt <<= 1) { + if (LoopRange & ShiftAmnt) { + // Account for (LoopRange - ShiftAmnt, LoopRange] + Mask |= ShiftF(Shifted, LoopRange - ShiftAmnt + 1); + // Clears the low bit. + LoopRange -= ShiftAmnt; + } + // [0, ShiftAmnt) -> [0, ShiftAmnt * 2) + Shifted |= ShiftF(Shifted, ShiftAmnt); + } + AB = ShiftF(Mask, Min); + }; switch (UserI->getOpcode()) { default: break; @@ -183,6 +203,17 @@ void DemandedBits::determineLiveOperandBits( AB |= APInt::getHighBitsSet(BitWidth, ShiftAmt+1); else if (S->hasNoUnsignedWrap()) AB |= APInt::getHighBitsSet(BitWidth, ShiftAmt); + } else { + ComputeKnownBits(BitWidth, UserI->getOperand(1), nullptr); + uint64_t Min = Known.getMinValue().getLimitedValue(BitWidth - 1); + uint64_t Max = Known.getMaxValue().getLimitedValue(BitWidth - 1); + // similar to Lshr case + GetShiftedRange(Min, Max, /*ShiftLeft=*/false); + const auto *S = cast<ShlOperator>(UserI); + if (S->hasNoSignedWrap()) + AB |= APInt::getHighBitsSet(BitWidth, Max + 1); + else if (S->hasNoUnsignedWrap()) + AB |= APInt::getHighBitsSet(BitWidth, Max); } } break; @@ -197,6 +228,24 @@ void DemandedBits::determineLiveOperandBits( // (they must be zero). 
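[Editor's aside, not part of the diff: a brute-force check of the GetShiftedRange doubling loop introduced earlier in the DemandedBits hunk. When the shift amount is only known to lie in [Min, Max], the live bits of the operand are the union of AOut shifted by every amount in that range; the sketch below recomputes that union the slow way and asserts it matches the logarithmic construction. Plain uint32_t stands in for APInt, and smearRange is a hypothetical name.]

// Verify the doubling construction against a direct union over [Min, Max].
#include <cassert>
#include <cstdint>
#include <initializer_list>

static uint32_t smearRange(uint32_t AOut, unsigned Min, unsigned Max,
                           bool ShiftLeft) {
  auto ShiftF = [ShiftLeft](uint32_t M, unsigned S) {
    return ShiftLeft ? M << S : M >> S;
  };
  uint32_t LoopRange = Max - Min, Mask = AOut, Shifted = AOut;
  for (unsigned S = 1; S <= LoopRange; S <<= 1) {
    if (LoopRange & S) {
      Mask |= ShiftF(Shifted, LoopRange - S + 1);
      LoopRange -= S;
    }
    Shifted |= ShiftF(Shifted, S);
  }
  return ShiftF(Mask, Min);
}

int main() {
  for (unsigned Min = 0; Min < 8; ++Min)
    for (unsigned Max = Min; Max < 8; ++Max)
      for (uint32_t AOut : {0x1u, 0x81u, 0xF0u, 0xABCDu}) {
        uint32_t BruteL = 0, BruteR = 0;
        for (unsigned S = Min; S <= Max; ++S) {
          BruteL |= AOut << S; // union of all left shifts in [Min, Max]
          BruteR |= AOut >> S; // union of all right shifts in [Min, Max]
        }
        assert(smearRange(AOut, Min, Max, true) == BruteL);
        assert(smearRange(AOut, Min, Max, false) == BruteR);
      }
}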
if (cast<LShrOperator>(UserI)->isExact()) AB |= APInt::getLowBitsSet(BitWidth, ShiftAmt); + } else { + ComputeKnownBits(BitWidth, UserI->getOperand(1), nullptr); + uint64_t Min = Known.getMinValue().getLimitedValue(BitWidth - 1); + uint64_t Max = Known.getMaxValue().getLimitedValue(BitWidth - 1); + // Suppose AOut == 0b0000 0001 + // [min, max] = [1, 3] + // iteration 1 shift by 1 mask is 0b0000 0011 + // iteration 2 shift by 2 mask is 0b0000 1111 + // iteration 3, shiftAmnt = 4 > max - min, we stop. + // + // After the iterations we need one more shift by min, + // to move from 0b0000 1111 to --> 0b0001 1110. + // The loop populates the mask relative to (0,...,max-min), + // but we need coverage from (min, max). + // This is why the shift by min is needed. + GetShiftedRange(Min, Max, /*ShiftLeft=*/true); + if (cast<LShrOperator>(UserI)->isExact()) + AB |= APInt::getLowBitsSet(BitWidth, Max); } } break; @@ -217,6 +266,26 @@ void DemandedBits::determineLiveOperandBits( // (they must be zero). if (cast<AShrOperator>(UserI)->isExact()) AB |= APInt::getLowBitsSet(BitWidth, ShiftAmt); + } else { + ComputeKnownBits(BitWidth, UserI->getOperand(1), nullptr); + uint64_t Min = Known.getMinValue().getLimitedValue(BitWidth - 1); + uint64_t Max = Known.getMaxValue().getLimitedValue(BitWidth - 1); + GetShiftedRange(Min, Max, /*ShiftLeft=*/true); + if (Max && + (AOut & APInt::getHighBitsSet(BitWidth, Max)).getBoolValue()) { + // Suppose AOut = 0011 1100 + // [min, max] = [1, 3] + // ShiftAmount = 1 : Mask is 1000 0000 + // ShiftAmount = 2 : Mask is 1100 0000 + // ShiftAmount = 3 : Mask is 1110 0000 + // The Mask with Max covers every case in [min, max], + // so we are done + AB.setSignBit(); + } + // If the shift is exact, then the low bits are not dead + // (they must be zero). + if (cast<AShrOperator>(UserI)->isExact()) + AB |= APInt::getLowBitsSet(BitWidth, Max); } } break; diff --git a/llvm/lib/Analysis/DependenceAnalysis.cpp b/llvm/lib/Analysis/DependenceAnalysis.cpp index 835e270..f33e04e 100644 --- a/llvm/lib/Analysis/DependenceAnalysis.cpp +++ b/llvm/lib/Analysis/DependenceAnalysis.cpp @@ -1531,6 +1531,62 @@ static APInt ceilingOfQuotient(const APInt &A, const APInt &B) { return Q; } +/// Given an affine expression of the form A*k + B, where k is an arbitrary +/// integer, infer the possible range of k based on the known range of the +/// affine expression. If we know A*k + B is non-negative, i.e., +/// +/// A*k + B >= 0 +/// +/// we can derive the following inequalities for k when A is positive: +/// +/// k >= -B / A +/// +/// Since k is an integer, it means k is greater than or equal to the +/// ceil(-B / A). +/// +/// If the upper bound of the affine expression \p UB is passed, the following +/// inequality can be derived as well: +/// +/// A*k + B <= UB +/// +/// which leads to: +/// +/// k <= (UB - B) / A +/// +/// Again, as k is an integer, it means k is less than or equal to the +/// floor((UB - B) / A). +/// +/// The similar logic applies when A is negative, but the inequalities sign flip +/// while working with them. +/// +/// Preconditions: \p A is non-zero, and we know A*k + B is non-negative. 
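[Editor's aside, not part of the diff: a numeric sanity check of the bounds on k derived in the comment above, using int64_t in place of APInt and always supplying an upper bound UB for brevity. inferDomain, floorDiv and ceilDiv are hypothetical helpers, not LLVM APIs.]

// For A*k + B constrained to [0, UB], the admissible k are exactly
// [ceil(-B/A), floor((UB-B)/A)] when A > 0, with the bounds swapped when A < 0.
#include <cassert>
#include <cstdint>
#include <utility>

static int64_t floorDiv(int64_t N, int64_t D) {
  int64_t Q = N / D;
  return (N % D != 0 && ((N < 0) != (D < 0))) ? Q - 1 : Q;
}
static int64_t ceilDiv(int64_t N, int64_t D) { return -floorDiv(-N, D); }

// Mirrors the shape of inferDomainOfAffine for plain integers.
static std::pair<int64_t, int64_t> inferDomain(int64_t A, int64_t B,
                                               int64_t UB) {
  assert(A != 0);
  if (A > 0)
    return {ceilDiv(-B, A), floorDiv(UB - B, A)};
  return {ceilDiv(UB - B, A), floorDiv(-B, A)};
}

int main() {
  for (int64_t A = -5; A <= 5; ++A) {
    if (A == 0)
      continue;
    for (int64_t B = -20; B <= 20; ++B)
      for (int64_t UB = 0; UB <= 30; ++UB) {
        auto [TL, TU] = inferDomain(A, B, UB);
        for (int64_t K = -100; K <= 100; ++K) {
          int64_t V = A * K + B;
          bool InRange = V >= 0 && V <= UB;   // constraint on the affine value
          bool InDomain = K >= TL && K <= TU; // derived constraint on k
          assert(InRange == InDomain);
        }
      }
  }
}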
+static std::pair<std::optional<APInt>, std::optional<APInt>> +inferDomainOfAffine(const APInt &A, const APInt &B, + const std::optional<APInt> &UB) { + assert(A != 0 && "A must be non-zero"); + std::optional<APInt> TL, TU; + if (A.sgt(0)) { + TL = ceilingOfQuotient(-B, A); + LLVM_DEBUG(dbgs() << "\t Possible TL = " << *TL << "\n"); + // New bound check - modification to Banerjee's e3 check + if (UB) { + // TODO?: Overflow check for UB - B + TU = floorOfQuotient(*UB - B, A); + LLVM_DEBUG(dbgs() << "\t Possible TU = " << *TU << "\n"); + } + } else { + TU = floorOfQuotient(-B, A); + LLVM_DEBUG(dbgs() << "\t Possible TU = " << *TU << "\n"); + // New bound check - modification to Banerjee's e3 check + if (UB) { + // TODO?: Overflow check for UB - B + TL = ceilingOfQuotient(*UB - B, A); + LLVM_DEBUG(dbgs() << "\t Possible TL = " << *TL << "\n"); + } + } + return std::make_pair(TL, TU); +} + // exactSIVtest - // When we have a pair of subscripts of the form [c1 + a1*i] and [c2 + a2*i], // where i is an induction variable, c1 and c2 are loop invariant, and a1 @@ -1590,14 +1646,12 @@ bool DependenceInfo::exactSIVtest(const SCEV *SrcCoeff, const SCEV *DstCoeff, LLVM_DEBUG(dbgs() << "\t X = " << X << ", Y = " << Y << "\n"); // since SCEV construction normalizes, LM = 0 - APInt UM(Bits, 1, true); - bool UMValid = false; + std::optional<APInt> UM; // UM is perhaps unavailable, let's check if (const SCEVConstant *CUB = collectConstantUpperBound(CurLoop, Delta->getType())) { UM = CUB->getAPInt(); - LLVM_DEBUG(dbgs() << "\t UM = " << UM << "\n"); - UMValid = true; + LLVM_DEBUG(dbgs() << "\t UM = " << *UM << "\n"); } APInt TU(APInt::getSignedMaxValue(Bits)); @@ -1609,44 +1663,33 @@ bool DependenceInfo::exactSIVtest(const SCEV *SrcCoeff, const SCEV *DstCoeff, LLVM_DEBUG(dbgs() << "\t TX = " << TX << "\n"); LLVM_DEBUG(dbgs() << "\t TY = " << TY << "\n"); - SmallVector<APInt, 2> TLVec, TUVec; APInt TB = BM.sdiv(G); - if (TB.sgt(0)) { - TLVec.push_back(ceilingOfQuotient(-TX, TB)); - LLVM_DEBUG(dbgs() << "\t Possible TL = " << TLVec.back() << "\n"); - // New bound check - modification to Banerjee's e3 check - if (UMValid) { - TUVec.push_back(floorOfQuotient(UM - TX, TB)); - LLVM_DEBUG(dbgs() << "\t Possible TU = " << TUVec.back() << "\n"); - } - } else { - TUVec.push_back(floorOfQuotient(-TX, TB)); - LLVM_DEBUG(dbgs() << "\t Possible TU = " << TUVec.back() << "\n"); - // New bound check - modification to Banerjee's e3 check - if (UMValid) { - TLVec.push_back(ceilingOfQuotient(UM - TX, TB)); - LLVM_DEBUG(dbgs() << "\t Possible TL = " << TLVec.back() << "\n"); - } - } - APInt TA = AM.sdiv(G); - if (TA.sgt(0)) { - if (UMValid) { - TUVec.push_back(floorOfQuotient(UM - TY, TA)); - LLVM_DEBUG(dbgs() << "\t Possible TU = " << TUVec.back() << "\n"); - } - // New bound check - modification to Banerjee's e3 check - TLVec.push_back(ceilingOfQuotient(-TY, TA)); - LLVM_DEBUG(dbgs() << "\t Possible TL = " << TLVec.back() << "\n"); - } else { - if (UMValid) { - TLVec.push_back(ceilingOfQuotient(UM - TY, TA)); - LLVM_DEBUG(dbgs() << "\t Possible TL = " << TLVec.back() << "\n"); - } - // New bound check - modification to Banerjee's e3 check - TUVec.push_back(floorOfQuotient(-TY, TA)); - LLVM_DEBUG(dbgs() << "\t Possible TU = " << TUVec.back() << "\n"); - } + + // At this point, we have the following equations: + // + // TA*i0 - TB*i1 = TC + // + // Also, we know that the all pairs of (i0, i1) can be expressed as: + // + // (TX + k*TB, TY + k*TA) + // + // where k is an arbitrary integer. 
+ auto [TL0, TU0] = inferDomainOfAffine(TB, TX, UM); + auto [TL1, TU1] = inferDomainOfAffine(TA, TY, UM); + + auto CreateVec = [](const std::optional<APInt> &V0, + const std::optional<APInt> &V1) { + SmallVector<APInt, 2> Vec; + if (V0) + Vec.push_back(*V0); + if (V1) + Vec.push_back(*V1); + return Vec; + }; + + SmallVector<APInt, 2> TLVec = CreateVec(TL0, TL1); + SmallVector<APInt, 2> TUVec = CreateVec(TU0, TU1); LLVM_DEBUG(dbgs() << "\t TA = " << TA << "\n"); LLVM_DEBUG(dbgs() << "\t TB = " << TB << "\n"); @@ -1967,24 +2010,20 @@ bool DependenceInfo::exactRDIVtest(const SCEV *SrcCoeff, const SCEV *DstCoeff, LLVM_DEBUG(dbgs() << "\t X = " << X << ", Y = " << Y << "\n"); // since SCEV construction seems to normalize, LM = 0 - APInt SrcUM(Bits, 1, true); - bool SrcUMvalid = false; + std::optional<APInt> SrcUM; // SrcUM is perhaps unavailable, let's check if (const SCEVConstant *UpperBound = collectConstantUpperBound(SrcLoop, Delta->getType())) { SrcUM = UpperBound->getAPInt(); - LLVM_DEBUG(dbgs() << "\t SrcUM = " << SrcUM << "\n"); - SrcUMvalid = true; + LLVM_DEBUG(dbgs() << "\t SrcUM = " << *SrcUM << "\n"); } - APInt DstUM(Bits, 1, true); - bool DstUMvalid = false; + std::optional<APInt> DstUM; // UM is perhaps unavailable, let's check if (const SCEVConstant *UpperBound = collectConstantUpperBound(DstLoop, Delta->getType())) { DstUM = UpperBound->getAPInt(); - LLVM_DEBUG(dbgs() << "\t DstUM = " << DstUM << "\n"); - DstUMvalid = true; + LLVM_DEBUG(dbgs() << "\t DstUM = " << *DstUM << "\n"); } APInt TU(APInt::getSignedMaxValue(Bits)); @@ -1996,47 +2035,39 @@ bool DependenceInfo::exactRDIVtest(const SCEV *SrcCoeff, const SCEV *DstCoeff, LLVM_DEBUG(dbgs() << "\t TX = " << TX << "\n"); LLVM_DEBUG(dbgs() << "\t TY = " << TY << "\n"); - SmallVector<APInt, 2> TLVec, TUVec; APInt TB = BM.sdiv(G); - if (TB.sgt(0)) { - TLVec.push_back(ceilingOfQuotient(-TX, TB)); - LLVM_DEBUG(dbgs() << "\t Possible TL = " << TLVec.back() << "\n"); - if (SrcUMvalid) { - TUVec.push_back(floorOfQuotient(SrcUM - TX, TB)); - LLVM_DEBUG(dbgs() << "\t Possible TU = " << TUVec.back() << "\n"); - } - } else { - TUVec.push_back(floorOfQuotient(-TX, TB)); - LLVM_DEBUG(dbgs() << "\t Possible TU = " << TUVec.back() << "\n"); - if (SrcUMvalid) { - TLVec.push_back(ceilingOfQuotient(SrcUM - TX, TB)); - LLVM_DEBUG(dbgs() << "\t Possible TL = " << TLVec.back() << "\n"); - } - } - APInt TA = AM.sdiv(G); - if (TA.sgt(0)) { - TLVec.push_back(ceilingOfQuotient(-TY, TA)); - LLVM_DEBUG(dbgs() << "\t Possible TL = " << TLVec.back() << "\n"); - if (DstUMvalid) { - TUVec.push_back(floorOfQuotient(DstUM - TY, TA)); - LLVM_DEBUG(dbgs() << "\t Possible TU = " << TUVec.back() << "\n"); - } - } else { - TUVec.push_back(floorOfQuotient(-TY, TA)); - LLVM_DEBUG(dbgs() << "\t Possible TU = " << TUVec.back() << "\n"); - if (DstUMvalid) { - TLVec.push_back(ceilingOfQuotient(DstUM - TY, TA)); - LLVM_DEBUG(dbgs() << "\t Possible TL = " << TLVec.back() << "\n"); - } - } - if (TLVec.empty() || TUVec.empty()) - return false; + // At this point, we have the following equations: + // + // TA*i - TB*j = TC + // + // Also, we know that the all pairs of (i, j) can be expressed as: + // + // (TX + k*TB, TY + k*TA) + // + // where k is an arbitrary integer. 
+ auto [TL0, TU0] = inferDomainOfAffine(TB, TX, SrcUM); + auto [TL1, TU1] = inferDomainOfAffine(TA, TY, DstUM); LLVM_DEBUG(dbgs() << "\t TA = " << TA << "\n"); LLVM_DEBUG(dbgs() << "\t TB = " << TB << "\n"); + auto CreateVec = [](const std::optional<APInt> &V0, + const std::optional<APInt> &V1) { + SmallVector<APInt, 2> Vec; + if (V0) + Vec.push_back(*V0); + if (V1) + Vec.push_back(*V1); + return Vec; + }; + + SmallVector<APInt, 2> TLVec = CreateVec(TL0, TL1); + SmallVector<APInt, 2> TUVec = CreateVec(TU0, TU1); + if (TLVec.empty() || TUVec.empty()) + return false; + TL = APIntOps::smax(TLVec.front(), TLVec.back()); TU = APIntOps::smin(TUVec.front(), TUVec.back()); LLVM_DEBUG(dbgs() << "\t TL = " << TL << "\n"); @@ -2345,6 +2376,43 @@ static std::optional<APInt> getConstantPart(const SCEV *Expr) { return std::nullopt; } +bool DependenceInfo::accumulateCoefficientsGCD(const SCEV *Expr, + const Loop *CurLoop, + const SCEV *&CurLoopCoeff, + APInt &RunningGCD) const { + // If RunningGCD is already 1, exit early. + // TODO: It might be better to continue the recursion to find CurLoopCoeff. + if (RunningGCD == 1) + return true; + + const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(Expr); + if (!AddRec) { + assert(isLoopInvariant(Expr, CurLoop) && + "Expected loop invariant expression"); + return true; + } + + assert(AddRec->isAffine() && "Unexpected Expr"); + const SCEV *Start = AddRec->getStart(); + const SCEV *Step = AddRec->getStepRecurrence(*SE); + if (AddRec->getLoop() == CurLoop) { + CurLoopCoeff = Step; + } else { + std::optional<APInt> ConstCoeff = getConstantPart(Step); + + // If the coefficient is the product of a constant and other stuff, we can + // use the constant in the GCD computation. + if (!ConstCoeff) + return false; + + // TODO: What happens if ConstCoeff is the "most negative" signed number + // (e.g. -128 for 8 bit wide APInt)? + RunningGCD = APIntOps::GreatestCommonDivisor(RunningGCD, ConstCoeff->abs()); + } + + return accumulateCoefficientsGCD(Start, CurLoop, CurLoopCoeff, RunningGCD); +} + //===----------------------------------------------------------------------===// // gcdMIVtest - // Tests an MIV subscript pair for dependence. @@ -2464,40 +2532,11 @@ bool DependenceInfo::gcdMIVtest(const SCEV *Src, const SCEV *Dst, RunningGCD = ExtraGCD; const SCEV *SrcCoeff = AddRec->getStepRecurrence(*SE); const SCEV *DstCoeff = SE->getMinusSCEV(SrcCoeff, SrcCoeff); - const SCEV *Inner = Src; - while (RunningGCD != 1 && isa<SCEVAddRecExpr>(Inner)) { - AddRec = cast<SCEVAddRecExpr>(Inner); - const SCEV *Coeff = AddRec->getStepRecurrence(*SE); - if (CurLoop == AddRec->getLoop()) - ; // SrcCoeff == Coeff - else { - // If the coefficient is the product of a constant and other stuff, - // we can use the constant in the GCD computation. - std::optional<APInt> ConstCoeff = getConstantPart(Coeff); - if (!ConstCoeff) - return false; - RunningGCD = - APIntOps::GreatestCommonDivisor(RunningGCD, ConstCoeff->abs()); - } - Inner = AddRec->getStart(); - } - Inner = Dst; - while (RunningGCD != 1 && isa<SCEVAddRecExpr>(Inner)) { - AddRec = cast<SCEVAddRecExpr>(Inner); - const SCEV *Coeff = AddRec->getStepRecurrence(*SE); - if (CurLoop == AddRec->getLoop()) - DstCoeff = Coeff; - else { - // If the coefficient is the product of a constant and other stuff, - // we can use the constant in the GCD computation. 
- std::optional<APInt> ConstCoeff = getConstantPart(Coeff); - if (!ConstCoeff) - return false; - RunningGCD = - APIntOps::GreatestCommonDivisor(RunningGCD, ConstCoeff->abs()); - } - Inner = AddRec->getStart(); - } + + if (!accumulateCoefficientsGCD(Src, CurLoop, SrcCoeff, RunningGCD) || + !accumulateCoefficientsGCD(Dst, CurLoop, DstCoeff, RunningGCD)) + return false; + Delta = SE->getMinusSCEV(SrcCoeff, DstCoeff); // If the coefficient is the product of a constant and other stuff, // we can use the constant in the GCD computation. diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp index 8be5de3..b8c540c 100644 --- a/llvm/lib/Analysis/IVDescriptors.cpp +++ b/llvm/lib/Analysis/IVDescriptors.cpp @@ -40,6 +40,8 @@ bool RecurrenceDescriptor::isIntegerRecurrenceKind(RecurKind Kind) { switch (Kind) { default: break; + case RecurKind::AddChainWithSubs: + case RecurKind::Sub: case RecurKind::Add: case RecurKind::Mul: case RecurKind::Or: @@ -897,8 +899,11 @@ RecurrenceDescriptor::InstDesc RecurrenceDescriptor::isRecurrenceInstr( case Instruction::PHI: return InstDesc(I, Prev.getRecKind(), Prev.getExactFPMathInst()); case Instruction::Sub: + return InstDesc( + Kind == RecurKind::Sub || Kind == RecurKind::AddChainWithSubs, I); case Instruction::Add: - return InstDesc(Kind == RecurKind::Add, I); + return InstDesc( + Kind == RecurKind::Add || Kind == RecurKind::AddChainWithSubs, I); case Instruction::Mul: return InstDesc(Kind == RecurKind::Mul, I); case Instruction::And: @@ -917,7 +922,8 @@ RecurrenceDescriptor::InstDesc RecurrenceDescriptor::isRecurrenceInstr( I->hasAllowReassoc() ? nullptr : I); case Instruction::Select: if (Kind == RecurKind::FAdd || Kind == RecurKind::FMul || - Kind == RecurKind::Add || Kind == RecurKind::Mul) + Kind == RecurKind::Add || Kind == RecurKind::Mul || + Kind == RecurKind::Sub || Kind == RecurKind::AddChainWithSubs) return isConditionalRdxPattern(I); if (isFindIVRecurrenceKind(Kind) && SE) return isFindIVPattern(Kind, L, OrigPhi, I, *SE); @@ -1003,6 +1009,17 @@ bool RecurrenceDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop, LLVM_DEBUG(dbgs() << "Found an ADD reduction PHI." << *Phi << "\n"); return true; } + if (AddReductionVar(Phi, RecurKind::Sub, TheLoop, FMF, RedDes, DB, AC, DT, + SE)) { + LLVM_DEBUG(dbgs() << "Found a SUB reduction PHI." << *Phi << "\n"); + return true; + } + if (AddReductionVar(Phi, RecurKind::AddChainWithSubs, TheLoop, FMF, RedDes, + DB, AC, DT, SE)) { + LLVM_DEBUG(dbgs() << "Found a chained ADD-SUB reduction PHI." << *Phi + << "\n"); + return true; + } if (AddReductionVar(Phi, RecurKind::Mul, TheLoop, FMF, RedDes, DB, AC, DT, SE)) { LLVM_DEBUG(dbgs() << "Found a MUL reduction PHI." 
<< *Phi << "\n"); @@ -1201,6 +1218,9 @@ bool RecurrenceDescriptor::isFixedOrderRecurrence(PHINode *Phi, Loop *TheLoop, unsigned RecurrenceDescriptor::getOpcode(RecurKind Kind) { switch (Kind) { + case RecurKind::Sub: + return Instruction::Sub; + case RecurKind::AddChainWithSubs: case RecurKind::Add: return Instruction::Add; case RecurKind::Mul: @@ -1288,6 +1308,10 @@ RecurrenceDescriptor::getReductionOpChain(PHINode *Phi, Loop *L) const { if (isFMulAddIntrinsic(Cur)) return true; + if (Cur->getOpcode() == Instruction::Sub && + Kind == RecurKind::AddChainWithSubs) + return true; + return Cur->getOpcode() == getOpcode(); }; diff --git a/llvm/lib/Analysis/InlineCost.cpp b/llvm/lib/Analysis/InlineCost.cpp index 22f4d08..757f689 100644 --- a/llvm/lib/Analysis/InlineCost.cpp +++ b/llvm/lib/Analysis/InlineCost.cpp @@ -180,6 +180,10 @@ static cl::opt<bool> DisableGEPConstOperand( "disable-gep-const-evaluation", cl::Hidden, cl::init(false), cl::desc("Disables evaluation of GetElementPtr with constant operands")); +static cl::opt<bool> InlineAllViableCalls( + "inline-all-viable-calls", cl::Hidden, cl::init(false), + cl::desc("Inline all viable calls, even if they exceed the inlining " + "threshold")); namespace llvm { std::optional<int> getStringFnAttrAsInt(const Attribute &Attr) { if (Attr.isValid()) { @@ -3272,6 +3276,10 @@ InlineCost llvm::getInlineCost( return llvm::InlineCost::getNever(UserDecision->getFailureReason()); } + if (InlineAllViableCalls && isInlineViable(*Callee).isSuccess()) + return llvm::InlineCost::getAlways( + "Inlining forced by -inline-all-viable-calls"); + LLVM_DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName() << "... (caller:" << Call.getCaller()->getName() << ")\n"); diff --git a/llvm/lib/Analysis/LazyValueInfo.cpp b/llvm/lib/Analysis/LazyValueInfo.cpp index 922f25d..c7b0ca9 100644 --- a/llvm/lib/Analysis/LazyValueInfo.cpp +++ b/llvm/lib/Analysis/LazyValueInfo.cpp @@ -927,8 +927,13 @@ LazyValueInfoImpl::solveBlockValueCast(CastInst *CI, BasicBlock *BB) { // NOTE: We're currently limited by the set of operations that ConstantRange // can evaluate symbolically. Enhancing that set will allows us to analyze // more definitions. - return ValueLatticeElement::getRange(LHSRange.castOp(CI->getOpcode(), - ResultBitWidth)); + ConstantRange Res = ConstantRange::getEmpty(ResultBitWidth); + if (auto *Trunc = dyn_cast<TruncInst>(CI)) + Res = LHSRange.truncate(ResultBitWidth, Trunc->getNoWrapKind()); + else + Res = LHSRange.castOp(CI->getOpcode(), ResultBitWidth); + + return ValueLatticeElement::getRange(Res); } std::optional<ValueLatticeElement> diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp index 78d0887..9a2c9ba 100644 --- a/llvm/lib/Analysis/Loads.cpp +++ b/llvm/lib/Analysis/Loads.cpp @@ -276,8 +276,7 @@ static bool AreEquivalentAddressValues(const Value *A, const Value *B) { // this function is only used when one address use dominates the // other, which means that they'll always either have the same // value or one of them will have an undefined value. 
- if (isa<BinaryOperator>(A) || isa<CastInst>(A) || isa<PHINode>(A) || - isa<GetElementPtrInst>(A)) + if (isa<CastInst>(A) || isa<PHINode>(A) || isa<GetElementPtrInst>(A)) if (const Instruction *BI = dyn_cast<Instruction>(B)) if (cast<Instruction>(A)->isIdenticalToWhenDefined(BI)) return true; diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index a553533..bceddd0 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -936,6 +936,12 @@ private: static std::optional<int64_t> getStrideFromAddRec(const SCEVAddRecExpr *AR, const Loop *Lp, Type *AccessTy, Value *Ptr, PredicatedScalarEvolution &PSE) { + if (isa<ScalableVectorType>(AccessTy)) { + LLVM_DEBUG(dbgs() << "LAA: Bad stride - Scalable object: " << *AccessTy + << "\n"); + return std::nullopt; + } + // The access function must stride over the innermost loop. if (Lp != AR->getLoop()) { LLVM_DEBUG({ @@ -1590,11 +1596,6 @@ llvm::getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, return 0; assert(Ptr->getType()->isPointerTy() && "Unexpected non-ptr"); - if (isa<ScalableVectorType>(AccessTy)) { - LLVM_DEBUG(dbgs() << "LAA: Bad stride - Scalable object: " << *AccessTy - << "\n"); - return std::nullopt; - } const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev); if (Assume && !AR) @@ -2404,12 +2405,13 @@ bool MemoryDepChecker::areDepsSafe(const DepCandidates &DepCands, SmallVector<Instruction *, 4> MemoryDepChecker::getInstructionsForAccess(Value *Ptr, bool IsWrite) const { MemAccessInfo Access(Ptr, IsWrite); - auto &IndexVector = Accesses.find(Access)->second; - + auto I = Accesses.find(Access); SmallVector<Instruction *, 4> Insts; - transform(IndexVector, - std::back_inserter(Insts), - [&](unsigned Idx) { return this->InstMap[Idx]; }); + if (I != Accesses.end()) { + transform(I->second, std::back_inserter(Insts), + [&](unsigned Idx) { return this->InstMap[Idx]; }); + } + return Insts; } diff --git a/llvm/lib/Analysis/LoopInfo.cpp b/llvm/lib/Analysis/LoopInfo.cpp index 518a634..6ba6073 100644 --- a/llvm/lib/Analysis/LoopInfo.cpp +++ b/llvm/lib/Analysis/LoopInfo.cpp @@ -58,14 +58,26 @@ static cl::opt<bool, true> // Loop implementation // -bool Loop::isLoopInvariant(const Value *V) const { - if (const Instruction *I = dyn_cast<Instruction>(V)) - return !contains(I); +bool Loop::isLoopInvariant(const Value *V, bool HasCoroSuspendInst) const { + if (const Instruction *I = dyn_cast<Instruction>(V)) { + // FIXME: this is semantically inconsistent. We're tracking a proper fix in + // issue #149604. + // If V is a pointer to stack object and L contains a coro.suspend function + // call, then V may not be loop invariant because the ramp function and + // resume function have different stack frames. 
+ if (HasCoroSuspendInst && isa<AllocaInst>(I)) + return false; + else + return !contains(I); + } return true; // All non-instructions are loop invariant } -bool Loop::hasLoopInvariantOperands(const Instruction *I) const { - return all_of(I->operands(), [this](Value *V) { return isLoopInvariant(V); }); +bool Loop::hasLoopInvariantOperands(const Instruction *I, + bool HasCoroSuspendInst) const { + return all_of(I->operands(), [&](Value *V) { + return isLoopInvariant(V, HasCoroSuspendInst); + }); } bool Loop::makeLoopInvariant(Value *V, bool &Changed, Instruction *InsertPt, diff --git a/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp b/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp index 2b0f212..67c2cfa 100644 --- a/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp +++ b/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp @@ -150,6 +150,10 @@ static ModRefInfo GetLocation(const Instruction *Inst, MemoryLocation &Loc, switch (II->getIntrinsicID()) { case Intrinsic::lifetime_start: case Intrinsic::lifetime_end: + Loc = MemoryLocation::getForArgument(II, 0, TLI); + // These intrinsics don't really modify the memory, but returning Mod + // will allow them to be handled conservatively. + return ModRefInfo::Mod; case Intrinsic::invariant_start: Loc = MemoryLocation::getForArgument(II, 1, TLI); // These intrinsics don't really modify the memory, but returning Mod @@ -441,11 +445,7 @@ MemDepResult MemoryDependenceResults::getSimplePointerDependencyFrom( Intrinsic::ID ID = II->getIntrinsicID(); switch (ID) { case Intrinsic::lifetime_start: { - // FIXME: This only considers queries directly on the invariant-tagged - // pointer, not on query pointers that are indexed off of them. It'd - // be nice to handle that at some point (the right approach is to use - // GetPointerBaseWithConstantOffset). - MemoryLocation ArgLoc = MemoryLocation::getAfter(II->getArgOperand(1)); + MemoryLocation ArgLoc = MemoryLocation::getAfter(II->getArgOperand(0)); if (BatchAA.isMustAlias(ArgLoc, MemLoc)) return MemDepResult::getDef(II); continue; diff --git a/llvm/lib/Analysis/MemoryLocation.cpp b/llvm/lib/Analysis/MemoryLocation.cpp index 28a2640..72b643c 100644 --- a/llvm/lib/Analysis/MemoryLocation.cpp +++ b/llvm/lib/Analysis/MemoryLocation.cpp @@ -191,7 +191,7 @@ MemoryLocation MemoryLocation::getForArgument(const CallBase *Call, case Intrinsic::lifetime_start: case Intrinsic::lifetime_end: { - assert(ArgIdx == 1 && "Invalid argument index"); + assert(ArgIdx == 0 && "Invalid argument index"); auto *AI = dyn_cast<AllocaInst>(Arg); if (!AI) // lifetime of poison value. diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index 477e477..d2c445f 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -7284,7 +7284,7 @@ ScalarEvolution::getDefiningScopeBound(ArrayRef<const SCEV *> Ops, bool &Precise) { Precise = true; // Do a bounded search of the def relation of the requested SCEVs. - SmallSet<const SCEV *, 16> Visited; + SmallPtrSet<const SCEV *, 16> Visited; SmallVector<const SCEV *> Worklist; auto pushOp = [&](const SCEV *S) { if (!Visited.insert(S).second) @@ -7435,7 +7435,15 @@ ScalarEvolution::getLoopProperties(const Loop *L) { if (auto *SI = dyn_cast<StoreInst>(I)) return !SI->isSimple(); - return I->mayThrow() || I->mayWriteToMemory(); + if (I->mayThrow()) + return true; + + // Non-volatile memset / memcpy do not count as side-effect for forward + // progress. 
+ if (isa<MemIntrinsic>(I) && !I->isVolatile()) + return false; + + return I->mayWriteToMemory(); }; LoopProperties LP = {/* HasNoAbnormalExits */ true, @@ -14944,6 +14952,29 @@ const SCEVAddRecExpr *ScalarEvolution::convertSCEVToAddRecWithPredicates( if (!AddRec) return nullptr; + // Check if any of the transformed predicates is known to be false. In that + // case, it doesn't make sense to convert to a predicated AddRec, as the + // versioned loop will never execute. + for (const SCEVPredicate *Pred : TransformPreds) { + auto *WrapPred = dyn_cast<SCEVWrapPredicate>(Pred); + if (!WrapPred || WrapPred->getFlags() != SCEVWrapPredicate::IncrementNSSW) + continue; + + const SCEVAddRecExpr *AddRecToCheck = WrapPred->getExpr(); + const SCEV *ExitCount = getBackedgeTakenCount(AddRecToCheck->getLoop()); + if (isa<SCEVCouldNotCompute>(ExitCount)) + continue; + + const SCEV *Step = AddRecToCheck->getStepRecurrence(*this); + if (!Step->isOne()) + continue; + + ExitCount = getTruncateOrSignExtend(ExitCount, Step->getType()); + const SCEV *Add = getAddExpr(AddRecToCheck->getStart(), ExitCount); + if (isKnownPredicate(CmpInst::ICMP_SLT, Add, AddRecToCheck->getStart())) + return nullptr; + } + // Since the transformation was successful, we can now transfer the SCEV // predicates. Preds.append(TransformPreds.begin(), TransformPreds.end()); diff --git a/llvm/lib/Analysis/StackLifetime.cpp b/llvm/lib/Analysis/StackLifetime.cpp index abe4985..1e20fca 100644 --- a/llvm/lib/Analysis/StackLifetime.cpp +++ b/llvm/lib/Analysis/StackLifetime.cpp @@ -70,7 +70,7 @@ void StackLifetime::collectMarkers() { const IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I); if (!II || !II->isLifetimeStartOrEnd()) continue; - const AllocaInst *AI = dyn_cast<AllocaInst>(II->getArgOperand(1)); + const AllocaInst *AI = dyn_cast<AllocaInst>(II->getArgOperand(0)); if (!AI) continue; auto It = AllocaNumbering.find(AI); diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index c7eb2ec..323ab8b 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1130,6 +1130,15 @@ TargetTransformInfo::getVectorInstrCost(const Instruction &I, Type *Val, return Cost; } +InstructionCost TargetTransformInfo::getIndexedVectorInstrCostFromEnd( + unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, + unsigned Index) const { + InstructionCost Cost = + TTIImpl->getIndexedVectorInstrCostFromEnd(Opcode, Val, CostKind, Index); + assert(Cost >= 0 && "TTI should not produce negative costs!"); + return Cost; +} + InstructionCost TargetTransformInfo::getInsertExtractValueCost( unsigned Opcode, TTI::TargetCostKind CostKind) const { assert((Opcode == Instruction::InsertValue || @@ -1230,10 +1239,11 @@ unsigned TargetTransformInfo::getNumberOfParts(Type *Tp) const { return TTIImpl->getNumberOfParts(Tp); } -InstructionCost -TargetTransformInfo::getAddressComputationCost(Type *Tp, ScalarEvolution *SE, - const SCEV *Ptr) const { - InstructionCost Cost = TTIImpl->getAddressComputationCost(Tp, SE, Ptr); +InstructionCost TargetTransformInfo::getAddressComputationCost( + Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr, + TTI::TargetCostKind CostKind) const { + InstructionCost Cost = + TTIImpl->getAddressComputationCost(PtrTy, SE, Ptr, CostKind); assert(Cost >= 0 && "TTI should not produce negative costs!"); return Cost; } diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 1e70228..21bdb2f 100644 --- 
a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -6356,27 +6356,6 @@ llvm::FindInsertedValue(Value *V, ArrayRef<unsigned> idx_range, return nullptr; } -bool llvm::isGEPBasedOnPointerToString(const GEPOperator *GEP, - unsigned CharSize) { - // Make sure the GEP has exactly three arguments. - if (GEP->getNumOperands() != 3) - return false; - - // Make sure the index-ee is a pointer to array of \p CharSize integers. - // CharSize. - ArrayType *AT = dyn_cast<ArrayType>(GEP->getSourceElementType()); - if (!AT || !AT->getElementType()->isIntegerTy(CharSize)) - return false; - - // Check to make sure that the first operand of the GEP is an integer and - // has value 0 so that we are sure we're indexing into the initializer. - const ConstantInt *FirstIdx = dyn_cast<ConstantInt>(GEP->getOperand(1)); - if (!FirstIdx || !FirstIdx->isZero()) - return false; - - return true; -} - // If V refers to an initialized global constant, set Slice either to // its initializer if the size of its elements equals ElementSize, or, // for ElementSize == 8, to its representation as an array of unsiged @@ -7415,8 +7394,10 @@ static bool canCreateUndefOrPoison(const Operator *Op, UndefPoisonKind Kind, case Intrinsic::fshr: case Intrinsic::smax: case Intrinsic::smin: + case Intrinsic::scmp: case Intrinsic::umax: case Intrinsic::umin: + case Intrinsic::ucmp: case Intrinsic::ptrmask: case Intrinsic::fptoui_sat: case Intrinsic::fptosi_sat: @@ -7785,7 +7766,7 @@ bool llvm::mustExecuteUBIfPoisonOnPathTo(Instruction *Root, // The set of all recursive users we've visited (which are assumed to all be // poison because of said visit) - SmallSet<const Value *, 16> KnownPoison; + SmallPtrSet<const Value *, 16> KnownPoison; SmallVector<const Instruction*, 16> Worklist; Worklist.push_back(Root); while (!Worklist.empty()) { @@ -8140,8 +8121,8 @@ static bool programUndefinedIfUndefOrPoison(const Value *V, // Set of instructions that we have proved will yield poison if Inst // does. 
- SmallSet<const Value *, 16> YieldsPoison; - SmallSet<const BasicBlock *, 4> Visited; + SmallPtrSet<const Value *, 16> YieldsPoison; + SmallPtrSet<const BasicBlock *, 4> Visited; YieldsPoison.insert(V); Visited.insert(BB); @@ -9147,7 +9128,8 @@ static bool matchTwoInputRecurrence(const PHINode *PN, InstTy *&Inst, return false; for (unsigned I = 0; I != 2; ++I) { - if (auto *Operation = dyn_cast<InstTy>(PN->getIncomingValue(I))) { + if (auto *Operation = dyn_cast<InstTy>(PN->getIncomingValue(I)); + Operation && Operation->getNumOperands() >= 2) { Value *LHS = Operation->getOperand(0); Value *RHS = Operation->getOperand(1); if (LHS != PN && RHS != PN) diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp index 520c6a0..3d5bd61 100644 --- a/llvm/lib/AsmParser/LLLexer.cpp +++ b/llvm/lib/AsmParser/LLLexer.cpp @@ -928,6 +928,7 @@ lltok::Kind LLLexer::LexIdentifier() { INSTKEYWORD(fptoui, FPToUI); INSTKEYWORD(fptosi, FPToSI); INSTKEYWORD(inttoptr, IntToPtr); + INSTKEYWORD(ptrtoaddr, PtrToAddr); INSTKEYWORD(ptrtoint, PtrToInt); INSTKEYWORD(bitcast, BitCast); INSTKEYWORD(addrspacecast, AddrSpaceCast); diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp index 13bef1f..1bc2906 100644 --- a/llvm/lib/AsmParser/LLParser.cpp +++ b/llvm/lib/AsmParser/LLParser.cpp @@ -4273,6 +4273,7 @@ bool LLParser::parseValID(ValID &ID, PerFunctionState *PFS, Type *ExpectedTy) { case lltok::kw_bitcast: case lltok::kw_addrspacecast: case lltok::kw_inttoptr: + case lltok::kw_ptrtoaddr: case lltok::kw_ptrtoint: { unsigned Opc = Lex.getUIntVal(); Type *DestTy = nullptr; @@ -7310,6 +7311,7 @@ int LLParser::parseInstruction(Instruction *&Inst, BasicBlock *BB, case lltok::kw_fptoui: case lltok::kw_fptosi: case lltok::kw_inttoptr: + case lltok::kw_ptrtoaddr: case lltok::kw_ptrtoint: return parseCast(Inst, PFS, KeywordVal); case lltok::kw_fptrunc: diff --git a/llvm/lib/BinaryFormat/DXContainer.cpp b/llvm/lib/BinaryFormat/DXContainer.cpp index eb83945..36d10d0 100644 --- a/llvm/lib/BinaryFormat/DXContainer.cpp +++ b/llvm/lib/BinaryFormat/DXContainer.cpp @@ -60,17 +60,6 @@ ArrayRef<EnumEntry<SigComponentType>> dxbc::getSigComponentTypes() { return ArrayRef(SigComponentTypes); } -static const EnumEntry<dxil::ResourceClass> ResourceClassNames[] = { - {"SRV", llvm::dxil::ResourceClass::SRV}, - {"UAV", llvm::dxil::ResourceClass::UAV}, - {"CBV", llvm::dxil::ResourceClass::CBuffer}, - {"Sampler", llvm::dxil::ResourceClass::Sampler}, -}; - -ArrayRef<EnumEntry<llvm::dxil::ResourceClass>> dxbc::getResourceClasses() { - return ArrayRef(ResourceClassNames); -} - static const EnumEntry<RootFlags> RootFlagNames[] = { #define ROOT_SIGNATURE_FLAG(Val, Enum) {#Enum, RootFlags::Enum}, #include "llvm/BinaryFormat/DXContainerConstants.def" diff --git a/llvm/lib/BinaryFormat/MsgPackDocument.cpp b/llvm/lib/BinaryFormat/MsgPackDocument.cpp index 11598ee..b52f029 100644 --- a/llvm/lib/BinaryFormat/MsgPackDocument.cpp +++ b/llvm/lib/BinaryFormat/MsgPackDocument.cpp @@ -104,6 +104,10 @@ DocNode &DocNode::operator=(uint64_t Val) { *this = getDocument()->getNode(Val); return *this; } +DocNode &DocNode::operator=(double Val) { + *this = getDocument()->getNode(Val); + return *this; +} // A level in the document reading stack. 
struct StackLevel { @@ -293,6 +297,9 @@ void Document::writeToBlob(std::string &Blob) { case Type::Binary: MPWriter.write(Node.getBinary()); break; + case Type::Float: + MPWriter.write(Node.getFloat()); + break; case Type::Empty: llvm_unreachable("unhandled empty msgpack node"); default: diff --git a/llvm/lib/BinaryFormat/SFrame.cpp b/llvm/lib/BinaryFormat/SFrame.cpp index f1765d7..8076a26 100644 --- a/llvm/lib/BinaryFormat/SFrame.cpp +++ b/llvm/lib/BinaryFormat/SFrame.cpp @@ -68,3 +68,11 @@ ArrayRef<EnumEntry<sframe::FREOffset>> sframe::getFREOffsets() { }; return ArrayRef(FREOffsets); } + +ArrayRef<EnumEntry<sframe::BaseReg>> sframe::getBaseRegisters() { + static constexpr EnumEntry<sframe::BaseReg> BaseRegs[] = { + {"FP", sframe::BaseReg::FP}, + {"SP", sframe::BaseReg::SP}, + }; + return ArrayRef(BaseRegs); +} diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index 290d873..22a0d0f 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -1283,6 +1283,7 @@ static int getDecodedCastOpcode(unsigned Val) { case bitc::CAST_SITOFP : return Instruction::SIToFP; case bitc::CAST_FPTRUNC : return Instruction::FPTrunc; case bitc::CAST_FPEXT : return Instruction::FPExt; + case bitc::CAST_PTRTOADDR: return Instruction::PtrToAddr; case bitc::CAST_PTRTOINT: return Instruction::PtrToInt; case bitc::CAST_INTTOPTR: return Instruction::IntToPtr; case bitc::CAST_BITCAST : return Instruction::BitCast; diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index 05680fa..a3f8254 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -647,6 +647,7 @@ static unsigned getEncodedCastOpcode(unsigned Opcode) { case Instruction::SIToFP : return bitc::CAST_SITOFP; case Instruction::FPTrunc : return bitc::CAST_FPTRUNC; case Instruction::FPExt : return bitc::CAST_FPEXT; + case Instruction::PtrToAddr: return bitc::CAST_PTRTOADDR; case Instruction::PtrToInt: return bitc::CAST_PTRTOINT; case Instruction::IntToPtr: return bitc::CAST_INTTOPTR; case Instruction::BitCast : return bitc::CAST_BITCAST; diff --git a/llvm/lib/CAS/BuiltinCAS.cpp b/llvm/lib/CAS/BuiltinCAS.cpp new file mode 100644 index 0000000..73646ad --- /dev/null +++ b/llvm/lib/CAS/BuiltinCAS.cpp @@ -0,0 +1,94 @@ +//===- BuiltinCAS.cpp -------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "BuiltinCAS.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/CAS/BuiltinObjectHasher.h" +#include "llvm/Support/Process.h" + +using namespace llvm; +using namespace llvm::cas; +using namespace llvm::cas::builtin; + +static StringRef getCASIDPrefix() { return "llvmcas://"; } +void BuiltinCASContext::anchor() {} + +Expected<HashType> BuiltinCASContext::parseID(StringRef Reference) { + if (!Reference.consume_front(getCASIDPrefix())) + return createStringError(std::make_error_code(std::errc::invalid_argument), + "invalid cas-id '" + Reference + "'"); + + // FIXME: Allow shortened references? 
+ if (Reference.size() != 2 * sizeof(HashType)) + return createStringError(std::make_error_code(std::errc::invalid_argument), + "wrong size for cas-id hash '" + Reference + "'"); + + std::string Binary; + if (!tryGetFromHex(Reference, Binary)) + return createStringError(std::make_error_code(std::errc::invalid_argument), + "invalid hash in cas-id '" + Reference + "'"); + + assert(Binary.size() == sizeof(HashType)); + HashType Digest; + llvm::copy(Binary, Digest.data()); + return Digest; +} + +Expected<CASID> BuiltinCAS::parseID(StringRef Reference) { + Expected<HashType> Digest = BuiltinCASContext::parseID(Reference); + if (!Digest) + return Digest.takeError(); + + return CASID::create(&getContext(), toStringRef(*Digest)); +} + +void BuiltinCASContext::printID(ArrayRef<uint8_t> Digest, raw_ostream &OS) { + SmallString<64> Hash; + toHex(Digest, /*LowerCase=*/true, Hash); + OS << getCASIDPrefix() << Hash; +} + +void BuiltinCASContext::printIDImpl(raw_ostream &OS, const CASID &ID) const { + BuiltinCASContext::printID(ID.getHash(), OS); +} + +const BuiltinCASContext &BuiltinCASContext::getDefaultContext() { + static BuiltinCASContext DefaultContext; + return DefaultContext; +} + +Expected<ObjectRef> BuiltinCAS::store(ArrayRef<ObjectRef> Refs, + ArrayRef<char> Data) { + return storeImpl(BuiltinObjectHasher<HasherT>::hashObject(*this, Refs, Data), + Refs, Data); +} + +Error BuiltinCAS::validate(const CASID &ID) { + auto Ref = getReference(ID); + if (!Ref) + return createUnknownObjectError(ID); + + auto Handle = load(*Ref); + if (!Handle) + return Handle.takeError(); + + auto Proxy = ObjectProxy::load(*this, *Ref, *Handle); + SmallVector<ObjectRef> Refs; + if (auto E = Proxy.forEachReference([&](ObjectRef Ref) -> Error { + Refs.push_back(Ref); + return Error::success(); + })) + return E; + + ArrayRef<char> Data(Proxy.getData().data(), Proxy.getData().size()); + auto Hash = BuiltinObjectHasher<HasherT>::hashObject(*this, Refs, Data); + if (!ID.getHash().equals(Hash)) + return createCorruptObjectError(ID); + + return Error::success(); +} diff --git a/llvm/lib/CAS/BuiltinCAS.h b/llvm/lib/CAS/BuiltinCAS.h new file mode 100644 index 0000000..3b5374d --- /dev/null +++ b/llvm/lib/CAS/BuiltinCAS.h @@ -0,0 +1,74 @@ +//===- BuiltinCAS.h ---------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_CAS_BUILTINCAS_H +#define LLVM_LIB_CAS_BUILTINCAS_H + +#include "llvm/ADT/StringRef.h" +#include "llvm/CAS/BuiltinCASContext.h" +#include "llvm/CAS/ObjectStore.h" + +namespace llvm::cas { +class ActionCache; +namespace builtin { + +/// Common base class for builtin CAS implementations using the same CASContext. 
+class BuiltinCAS : public ObjectStore { +public: + BuiltinCAS() : ObjectStore(BuiltinCASContext::getDefaultContext()) {} + + Expected<CASID> parseID(StringRef Reference) final; + + Expected<ObjectRef> store(ArrayRef<ObjectRef> Refs, + ArrayRef<char> Data) final; + virtual Expected<ObjectRef> storeImpl(ArrayRef<uint8_t> ComputedHash, + ArrayRef<ObjectRef> Refs, + ArrayRef<char> Data) = 0; + + virtual Expected<ObjectRef> + storeFromNullTerminatedRegion(ArrayRef<uint8_t> ComputedHash, + sys::fs::mapped_file_region Map) { + return storeImpl(ComputedHash, {}, ArrayRef(Map.data(), Map.size())); + } + + /// Both builtin CAS implementations provide lifetime for free, so this can + /// be const, and readData() and getDataSize() can be implemented on top of + /// it. + virtual ArrayRef<char> getDataConst(ObjectHandle Node) const = 0; + + ArrayRef<char> getData(ObjectHandle Node, + bool RequiresNullTerminator) const final { + // BuiltinCAS Objects are always null terminated. + return getDataConst(Node); + } + uint64_t getDataSize(ObjectHandle Node) const final { + return getDataConst(Node).size(); + } + + Error createUnknownObjectError(const CASID &ID) const { + return createStringError(std::make_error_code(std::errc::invalid_argument), + "unknown object '" + ID.toString() + "'"); + } + + Error createCorruptObjectError(const CASID &ID) const { + return createStringError(std::make_error_code(std::errc::invalid_argument), + "corrupt object '" + ID.toString() + "'"); + } + + Error createCorruptStorageError() const { + return createStringError(std::make_error_code(std::errc::invalid_argument), + "corrupt storage"); + } + + Error validate(const CASID &ID) final; +}; + +} // end namespace builtin +} // end namespace llvm::cas + +#endif // LLVM_LIB_CAS_BUILTINCAS_H diff --git a/llvm/lib/CAS/CMakeLists.txt b/llvm/lib/CAS/CMakeLists.txt new file mode 100644 index 0000000..b2825a1 --- /dev/null +++ b/llvm/lib/CAS/CMakeLists.txt @@ -0,0 +1,11 @@ +add_llvm_component_library(LLVMCAS + BuiltinCAS.cpp + InMemoryCAS.cpp + ObjectStore.cpp + + ADDITIONAL_HEADER_DIRS + ${LLVM_MAIN_INCLUDE_DIR}/llvm/CAS + + LINK_COMPONENTS + Support +) diff --git a/llvm/lib/CAS/InMemoryCAS.cpp b/llvm/lib/CAS/InMemoryCAS.cpp new file mode 100644 index 0000000..255b89c --- /dev/null +++ b/llvm/lib/CAS/InMemoryCAS.cpp @@ -0,0 +1,326 @@ +//===- InMemoryCAS.cpp ------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "BuiltinCAS.h" +#include "llvm/ADT/LazyAtomicPointer.h" +#include "llvm/ADT/PointerIntPair.h" +#include "llvm/ADT/TrieRawHashMap.h" +#include "llvm/Support/Allocator.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/ThreadSafeAllocator.h" +#include "llvm/Support/TrailingObjects.h" + +using namespace llvm; +using namespace llvm::cas; +using namespace llvm::cas::builtin; + +namespace { + +class InMemoryObject; + +/// Index of referenced IDs (map: Hash -> InMemoryObject*). Uses +/// LazyAtomicPointer to coordinate creation of objects. +using InMemoryIndexT = + ThreadSafeTrieRawHashMap<LazyAtomicPointer<const InMemoryObject>, + sizeof(HashType)>; + +/// Values in \a InMemoryIndexT. \a InMemoryObject's point at this to access +/// their hash. 
+using InMemoryIndexValueT = InMemoryIndexT::value_type; + +/// Builtin InMemory CAS that stores CAS object in the memory. +class InMemoryObject { +public: + enum class Kind { + /// Node with refs and data. + RefNode, + + /// Node with refs and data co-allocated. + InlineNode, + + Max = InlineNode, + }; + + Kind getKind() const { return IndexAndKind.getInt(); } + const InMemoryIndexValueT &getIndex() const { + assert(IndexAndKind.getPointer()); + return *IndexAndKind.getPointer(); + } + + ArrayRef<uint8_t> getHash() const { return getIndex().Hash; } + + InMemoryObject() = delete; + InMemoryObject(InMemoryObject &&) = delete; + InMemoryObject(const InMemoryObject &) = delete; + +protected: + InMemoryObject(Kind K, const InMemoryIndexValueT &I) : IndexAndKind(&I, K) {} + +private: + enum Counts : int { + NumKindBits = 2, + }; + PointerIntPair<const InMemoryIndexValueT *, NumKindBits, Kind> IndexAndKind; + static_assert((1U << NumKindBits) <= alignof(InMemoryIndexValueT), + "Kind will clobber pointer"); + static_assert(((int)Kind::Max >> NumKindBits) == 0, "Kind will be truncated"); + +public: + ArrayRef<char> getData() const; + + ArrayRef<const InMemoryObject *> getRefs() const; +}; + +class InMemoryRefObject final : public InMemoryObject { +public: + static constexpr Kind KindValue = Kind::RefNode; + static bool classof(const InMemoryObject *O) { + return O->getKind() == KindValue; + } + + ArrayRef<const InMemoryObject *> getRefsImpl() const { return Refs; } + ArrayRef<const InMemoryObject *> getRefs() const { return Refs; } + ArrayRef<char> getDataImpl() const { return Data; } + ArrayRef<char> getData() const { return Data; } + + static InMemoryRefObject &create(function_ref<void *(size_t Size)> Allocate, + const InMemoryIndexValueT &I, + ArrayRef<const InMemoryObject *> Refs, + ArrayRef<char> Data) { + void *Mem = Allocate(sizeof(InMemoryRefObject)); + return *new (Mem) InMemoryRefObject(I, Refs, Data); + } + +private: + InMemoryRefObject(const InMemoryIndexValueT &I, + ArrayRef<const InMemoryObject *> Refs, ArrayRef<char> Data) + : InMemoryObject(KindValue, I), Refs(Refs), Data(Data) { + assert(isAddrAligned(Align(8), this) && "Expected 8-byte alignment"); + assert(isAddrAligned(Align(8), Data.data()) && "Expected 8-byte alignment"); + assert(*Data.end() == 0 && "Expected null-termination"); + } + + ArrayRef<const InMemoryObject *> Refs; + ArrayRef<char> Data; +}; + +class InMemoryInlineObject final + : public InMemoryObject, + public TrailingObjects<InMemoryInlineObject, const InMemoryObject *, + char> { +public: + static constexpr Kind KindValue = Kind::InlineNode; + static bool classof(const InMemoryObject *O) { + return O->getKind() == KindValue; + } + + ArrayRef<const InMemoryObject *> getRefs() const { return getRefsImpl(); } + ArrayRef<const InMemoryObject *> getRefsImpl() const { + return ArrayRef(getTrailingObjects<const InMemoryObject *>(), NumRefs); + } + + ArrayRef<char> getData() const { return getDataImpl(); } + ArrayRef<char> getDataImpl() const { + return ArrayRef(getTrailingObjects<char>(), DataSize); + } + + static InMemoryInlineObject & + create(function_ref<void *(size_t Size)> Allocate, + const InMemoryIndexValueT &I, ArrayRef<const InMemoryObject *> Refs, + ArrayRef<char> Data) { + void *Mem = Allocate(sizeof(InMemoryInlineObject) + + sizeof(uintptr_t) * Refs.size() + Data.size() + 1); + return *new (Mem) InMemoryInlineObject(I, Refs, Data); + } + + size_t numTrailingObjects(OverloadToken<const InMemoryObject *>) const { + return NumRefs; + } + +private: + 
InMemoryInlineObject(const InMemoryIndexValueT &I, + ArrayRef<const InMemoryObject *> Refs, + ArrayRef<char> Data) + : InMemoryObject(KindValue, I), NumRefs(Refs.size()), + DataSize(Data.size()) { + auto *BeginRefs = reinterpret_cast<const InMemoryObject **>(this + 1); + llvm::copy(Refs, BeginRefs); + auto *BeginData = reinterpret_cast<char *>(BeginRefs + NumRefs); + llvm::copy(Data, BeginData); + BeginData[Data.size()] = 0; + } + uint32_t NumRefs; + uint32_t DataSize; +}; + +/// In-memory CAS database and action cache (the latter should be separated). +class InMemoryCAS : public BuiltinCAS { +public: + Expected<ObjectRef> storeImpl(ArrayRef<uint8_t> ComputedHash, + ArrayRef<ObjectRef> Refs, + ArrayRef<char> Data) final; + + Expected<ObjectRef> + storeFromNullTerminatedRegion(ArrayRef<uint8_t> ComputedHash, + sys::fs::mapped_file_region Map) override; + + CASID getID(const InMemoryIndexValueT &I) const { + StringRef Hash = toStringRef(I.Hash); + return CASID::create(&getContext(), Hash); + } + CASID getID(const InMemoryObject &O) const { return getID(O.getIndex()); } + + ObjectHandle getObjectHandle(const InMemoryObject &Node) const { + assert(!(reinterpret_cast<uintptr_t>(&Node) & 0x1ULL)); + return makeObjectHandle(reinterpret_cast<uintptr_t>(&Node)); + } + + Expected<std::optional<ObjectHandle>> loadIfExists(ObjectRef Ref) override { + return getObjectHandle(asInMemoryObject(Ref)); + } + + InMemoryIndexValueT &indexHash(ArrayRef<uint8_t> Hash) { + return *Index.insertLazy( + Hash, [](auto ValueConstructor) { ValueConstructor.emplace(nullptr); }); + } + + /// TODO: Consider callers to actually do an insert and to return a handle to + /// the slot in the trie. + const InMemoryObject *getInMemoryObject(CASID ID) const { + assert(ID.getContext().getHashSchemaIdentifier() == + getContext().getHashSchemaIdentifier() && + "Expected ID from same hash schema"); + if (InMemoryIndexT::const_pointer P = Index.find(ID.getHash())) + return P->Data; + return nullptr; + } + + const InMemoryObject &getInMemoryObject(ObjectHandle OH) const { + return *reinterpret_cast<const InMemoryObject *>( + (uintptr_t)OH.getInternalRef(*this)); + } + + const InMemoryObject &asInMemoryObject(ReferenceBase Ref) const { + uintptr_t P = Ref.getInternalRef(*this); + return *reinterpret_cast<const InMemoryObject *>(P); + } + ObjectRef toReference(const InMemoryObject &O) const { + return makeObjectRef(reinterpret_cast<uintptr_t>(&O)); + } + + CASID getID(ObjectRef Ref) const final { return getIDImpl(Ref); } + CASID getIDImpl(ReferenceBase Ref) const { + return getID(asInMemoryObject(Ref)); + } + + std::optional<ObjectRef> getReference(const CASID &ID) const final { + if (const InMemoryObject *Object = getInMemoryObject(ID)) + return toReference(*Object); + return std::nullopt; + } + + Expected<bool> isMaterialized(ObjectRef Ref) const final { return true; } + + ArrayRef<char> getDataConst(ObjectHandle Node) const final { + return cast<InMemoryObject>(asInMemoryObject(Node)).getData(); + } + + InMemoryCAS() = default; + +private: + size_t getNumRefs(ObjectHandle Node) const final { + return getInMemoryObject(Node).getRefs().size(); + } + ObjectRef readRef(ObjectHandle Node, size_t I) const final { + return toReference(*getInMemoryObject(Node).getRefs()[I]); + } + Error forEachRef(ObjectHandle Node, + function_ref<Error(ObjectRef)> Callback) const final; + + /// Index of referenced IDs (map: Hash -> InMemoryObject*). Mapped to nullptr + /// as a convenient way to store hashes. + /// + /// - Insert nullptr on lookups. 
+ /// - InMemoryObject points back to here. + InMemoryIndexT Index; + + ThreadSafeAllocator<BumpPtrAllocator> Objects; + ThreadSafeAllocator<SpecificBumpPtrAllocator<sys::fs::mapped_file_region>> + MemoryMaps; +}; + +} // end anonymous namespace + +ArrayRef<char> InMemoryObject::getData() const { + if (auto *Derived = dyn_cast<InMemoryRefObject>(this)) + return Derived->getDataImpl(); + return cast<InMemoryInlineObject>(this)->getDataImpl(); +} + +ArrayRef<const InMemoryObject *> InMemoryObject::getRefs() const { + if (auto *Derived = dyn_cast<InMemoryRefObject>(this)) + return Derived->getRefsImpl(); + return cast<InMemoryInlineObject>(this)->getRefsImpl(); +} + +Expected<ObjectRef> +InMemoryCAS::storeFromNullTerminatedRegion(ArrayRef<uint8_t> ComputedHash, + sys::fs::mapped_file_region Map) { + // Look up the hash in the index, initializing to nullptr if it's new. + ArrayRef<char> Data(Map.data(), Map.size()); + auto &I = indexHash(ComputedHash); + + // Load or generate. + auto Allocator = [&](size_t Size) -> void * { + return Objects.Allocate(Size, alignof(InMemoryObject)); + }; + auto Generator = [&]() -> const InMemoryObject * { + return &InMemoryRefObject::create(Allocator, I, {}, Data); + }; + const InMemoryObject &Node = + cast<InMemoryObject>(I.Data.loadOrGenerate(Generator)); + + // Save Map if the winning node uses it. + if (auto *RefNode = dyn_cast<InMemoryRefObject>(&Node)) + if (RefNode->getData().data() == Map.data()) + new (MemoryMaps.Allocate(1)) sys::fs::mapped_file_region(std::move(Map)); + + return toReference(Node); +} + +Expected<ObjectRef> InMemoryCAS::storeImpl(ArrayRef<uint8_t> ComputedHash, + ArrayRef<ObjectRef> Refs, + ArrayRef<char> Data) { + // Look up the hash in the index, initializing to nullptr if it's new. + auto &I = indexHash(ComputedHash); + + // Create the node. + SmallVector<const InMemoryObject *> InternalRefs; + for (ObjectRef Ref : Refs) + InternalRefs.push_back(&asInMemoryObject(Ref)); + auto Allocator = [&](size_t Size) -> void * { + return Objects.Allocate(Size, alignof(InMemoryObject)); + }; + auto Generator = [&]() -> const InMemoryObject * { + return &InMemoryInlineObject::create(Allocator, I, InternalRefs, Data); + }; + return toReference(cast<InMemoryObject>(I.Data.loadOrGenerate(Generator))); +} + +Error InMemoryCAS::forEachRef(ObjectHandle Handle, + function_ref<Error(ObjectRef)> Callback) const { + auto &Node = getInMemoryObject(Handle); + for (const InMemoryObject *Ref : Node.getRefs()) + if (Error E = Callback(toReference(*Ref))) + return E; + return Error::success(); +} + +std::unique_ptr<ObjectStore> cas::createInMemoryCAS() { + return std::make_unique<InMemoryCAS>(); +} diff --git a/llvm/lib/CAS/ObjectStore.cpp b/llvm/lib/CAS/ObjectStore.cpp new file mode 100644 index 0000000..e0be50b --- /dev/null +++ b/llvm/lib/CAS/ObjectStore.cpp @@ -0,0 +1,162 @@ +//===- ObjectStore.cpp ------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/CAS/ObjectStore.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/Errc.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/MemoryBuffer.h" +#include <optional> + +using namespace llvm; +using namespace llvm::cas; + +void CASContext::anchor() {} +void ObjectStore::anchor() {} + +LLVM_DUMP_METHOD void CASID::dump() const { print(dbgs()); } +LLVM_DUMP_METHOD void ObjectRef::dump() const { print(dbgs()); } +LLVM_DUMP_METHOD void ObjectHandle::dump() const { print(dbgs()); } + +std::string CASID::toString() const { + std::string S; + raw_string_ostream(S) << *this; + return S; +} + +static void printReferenceBase(raw_ostream &OS, StringRef Kind, + uint64_t InternalRef, std::optional<CASID> ID) { + OS << Kind << "=" << InternalRef; + if (ID) + OS << "[" << *ID << "]"; +} + +void ReferenceBase::print(raw_ostream &OS, const ObjectHandle &This) const { + assert(this == &This); + printReferenceBase(OS, "object-handle", InternalRef, std::nullopt); +} + +void ReferenceBase::print(raw_ostream &OS, const ObjectRef &This) const { + assert(this == &This); + + std::optional<CASID> ID; +#if LLVM_ENABLE_ABI_BREAKING_CHECKS + if (CAS) + ID = CAS->getID(This); +#endif + printReferenceBase(OS, "object-ref", InternalRef, ID); +} + +Expected<ObjectHandle> ObjectStore::load(ObjectRef Ref) { + std::optional<ObjectHandle> Handle; + if (Error E = loadIfExists(Ref).moveInto(Handle)) + return std::move(E); + if (!Handle) + return createStringError(errc::invalid_argument, + "missing object '" + getID(Ref).toString() + "'"); + return *Handle; +} + +std::unique_ptr<MemoryBuffer> +ObjectStore::getMemoryBuffer(ObjectHandle Node, StringRef Name, + bool RequiresNullTerminator) { + return MemoryBuffer::getMemBuffer( + toStringRef(getData(Node, RequiresNullTerminator)), Name, + RequiresNullTerminator); +} + +void ObjectStore::readRefs(ObjectHandle Node, + SmallVectorImpl<ObjectRef> &Refs) const { + consumeError(forEachRef(Node, [&Refs](ObjectRef Ref) -> Error { + Refs.push_back(Ref); + return Error::success(); + })); +} + +Expected<ObjectProxy> ObjectStore::getProxy(const CASID &ID) { + std::optional<ObjectRef> Ref = getReference(ID); + if (!Ref) + return createUnknownObjectError(ID); + + return getProxy(*Ref); +} + +Expected<ObjectProxy> ObjectStore::getProxy(ObjectRef Ref) { + std::optional<ObjectHandle> H; + if (Error E = load(Ref).moveInto(H)) + return std::move(E); + + return ObjectProxy::load(*this, Ref, *H); +} + +Expected<std::optional<ObjectProxy>> +ObjectStore::getProxyIfExists(ObjectRef Ref) { + std::optional<ObjectHandle> H; + if (Error E = loadIfExists(Ref).moveInto(H)) + return std::move(E); + if (!H) + return std::nullopt; + return ObjectProxy::load(*this, Ref, *H); +} + +Error ObjectStore::createUnknownObjectError(const CASID &ID) { + return createStringError(std::make_error_code(std::errc::invalid_argument), + "unknown object '" + ID.toString() + "'"); +} + +Expected<ObjectProxy> ObjectStore::createProxy(ArrayRef<ObjectRef> Refs, + StringRef Data) { + Expected<ObjectRef> Ref = store(Refs, arrayRefFromStringRef<char>(Data)); + if (!Ref) + return Ref.takeError(); + return getProxy(*Ref); +} + +Expected<ObjectRef> +ObjectStore::storeFromOpenFileImpl(sys::fs::file_t FD, + std::optional<sys::fs::file_status> Status) { + // TODO: For the on-disk CAS implementation use cloning to store it as a + // 
standalone file if the file-system supports it and the file is large. + uint64_t Size = Status ? Status->getSize() : -1; + auto Buffer = MemoryBuffer::getOpenFile(FD, /*Filename=*/"", Size); + if (!Buffer) + return errorCodeToError(Buffer.getError()); + + return store({}, arrayRefFromStringRef<char>((*Buffer)->getBuffer())); +} + +Error ObjectStore::validateTree(ObjectRef Root) { + SmallDenseSet<ObjectRef> ValidatedRefs; + SmallVector<ObjectRef, 16> RefsToValidate; + RefsToValidate.push_back(Root); + + while (!RefsToValidate.empty()) { + ObjectRef Ref = RefsToValidate.pop_back_val(); + auto [I, Inserted] = ValidatedRefs.insert(Ref); + if (!Inserted) + continue; // already validated. + if (Error E = validate(getID(Ref))) + return E; + Expected<ObjectHandle> Obj = load(Ref); + if (!Obj) + return Obj.takeError(); + if (Error E = forEachRef(*Obj, [&RefsToValidate](ObjectRef R) -> Error { + RefsToValidate.push_back(R); + return Error::success(); + })) + return E; + } + return Error::success(); +} + +std::unique_ptr<MemoryBuffer> +ObjectProxy::getMemoryBuffer(StringRef Name, + bool RequiresNullTerminator) const { + return CAS->getMemoryBuffer(H, Name, RequiresNullTerminator); +} diff --git a/llvm/lib/CMakeLists.txt b/llvm/lib/CMakeLists.txt index a561830..a943297 100644 --- a/llvm/lib/CMakeLists.txt +++ b/llvm/lib/CMakeLists.txt @@ -9,6 +9,7 @@ add_subdirectory(FileCheck) add_subdirectory(InterfaceStub) add_subdirectory(IRPrinter) add_subdirectory(IRReader) +add_subdirectory(CAS) add_subdirectory(CGData) add_subdirectory(CodeGen) add_subdirectory(CodeGenTypes) diff --git a/llvm/lib/CodeGen/Analysis.cpp b/llvm/lib/CodeGen/Analysis.cpp index e7b9417..2ef96cc 100644 --- a/llvm/lib/CodeGen/Analysis.cpp +++ b/llvm/lib/CodeGen/Analysis.cpp @@ -69,18 +69,10 @@ unsigned llvm::ComputeLinearIndex(Type *Ty, return CurIndex + 1; } -/// ComputeValueVTs - Given an LLVM IR type, compute a sequence of -/// EVTs that represent all the individual underlying -/// non-aggregate types that comprise it. -/// -/// If Offsets is non-null, it points to a vector to be filled in -/// with the in-memory offsets of each of the individual values. -/// -void llvm::ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, - Type *Ty, SmallVectorImpl<EVT> &ValueVTs, - SmallVectorImpl<EVT> *MemVTs, - SmallVectorImpl<TypeSize> *Offsets, - TypeSize StartingOffset) { +void llvm::ComputeValueTypes(const DataLayout &DL, Type *Ty, + SmallVectorImpl<Type *> &Types, + SmallVectorImpl<TypeSize> *Offsets, + TypeSize StartingOffset) { assert((Ty->isScalableTy() == StartingOffset.isScalable() || StartingOffset.isZero()) && "Offset/TypeSize mismatch!"); @@ -90,15 +82,13 @@ void llvm::ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, // us to support structs with scalable vectors for operations that don't // need offsets. const StructLayout *SL = Offsets ? DL.getStructLayout(STy) : nullptr; - for (StructType::element_iterator EB = STy->element_begin(), - EI = EB, + for (StructType::element_iterator EB = STy->element_begin(), EI = EB, EE = STy->element_end(); EI != EE; ++EI) { // Don't compute the element offset if we didn't get a StructLayout above. TypeSize EltOffset = SL ? 
SL->getElementOffset(EI - EB) : TypeSize::getZero(); - ComputeValueVTs(TLI, DL, *EI, ValueVTs, MemVTs, Offsets, - StartingOffset + EltOffset); + ComputeValueTypes(DL, *EI, Types, Offsets, StartingOffset + EltOffset); } return; } @@ -107,21 +97,39 @@ void llvm::ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *EltTy = ATy->getElementType(); TypeSize EltSize = DL.getTypeAllocSize(EltTy); for (unsigned i = 0, e = ATy->getNumElements(); i != e; ++i) - ComputeValueVTs(TLI, DL, EltTy, ValueVTs, MemVTs, Offsets, - StartingOffset + i * EltSize); + ComputeValueTypes(DL, EltTy, Types, Offsets, + StartingOffset + i * EltSize); return; } // Interpret void as zero return values. if (Ty->isVoidTy()) return; - // Base case: we can get an EVT for this LLVM IR type. - ValueVTs.push_back(TLI.getValueType(DL, Ty)); - if (MemVTs) - MemVTs->push_back(TLI.getMemValueType(DL, Ty)); + Types.push_back(Ty); if (Offsets) Offsets->push_back(StartingOffset); } +/// ComputeValueVTs - Given an LLVM IR type, compute a sequence of +/// EVTs that represent all the individual underlying +/// non-aggregate types that comprise it. +/// +/// If Offsets is non-null, it points to a vector to be filled in +/// with the in-memory offsets of each of the individual values. +/// +void llvm::ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, + Type *Ty, SmallVectorImpl<EVT> &ValueVTs, + SmallVectorImpl<EVT> *MemVTs, + SmallVectorImpl<TypeSize> *Offsets, + TypeSize StartingOffset) { + SmallVector<Type *> Types; + ComputeValueTypes(DL, Ty, Types, Offsets, StartingOffset); + for (Type *Ty : Types) { + ValueVTs.push_back(TLI.getValueType(DL, Ty)); + if (MemVTs) + MemVTs->push_back(TLI.getMemValueType(DL, Ty)); + } +} + void llvm::ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl<EVT> &ValueVTs, SmallVectorImpl<EVT> *MemVTs, diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index c72b6e8..23a3543 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -3657,6 +3657,7 @@ const MCExpr *AsmPrinter::lowerConstant(const Constant *CV, break; // Error } + case Instruction::PtrToAddr: case Instruction::PtrToInt: { const DataLayout &DL = getDataLayout(); diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp index 3f3d5dc9..278dd65 100644 --- a/llvm/lib/CodeGen/AtomicExpandPass.cpp +++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp @@ -1915,7 +1915,6 @@ bool AtomicExpandImpl::expandAtomicOpToLibcall( // TODO: the "order" argument type is "int", not int32. So // getInt32Ty may be wrong if the arch uses e.g. 16-bit ints. 
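(The AtomicExpand hunks that follow drop the explicit size operand from the lifetime markers around the libcall temporaries; the covered region is now implied by the alloca itself. A minimal sketch of the resulting pattern, with placeholder names:)

#include "llvm/IR/IRBuilder.h"

// Sketch only: mark a temporary's lifetime around its use as a libcall
// argument. CreateLifetimeStart/End now take just the alloca.
static void markTempLifetime(llvm::IRBuilder<> &Builder, llvm::AllocaInst *Tmp,
                             llvm::Value *Init, llvm::Align A) {
  Builder.CreateLifetimeStart(Tmp); // no size operand; covers the whole alloca
  Builder.CreateAlignedStore(Init, Tmp, A);
  // ... Tmp is passed to the __atomic_* libcall here ...
  Builder.CreateLifetimeEnd(Tmp);
}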
- ConstantInt *SizeVal64 = ConstantInt::get(Type::getInt64Ty(Ctx), Size); assert(Ordering != AtomicOrdering::NotAtomic && "expect atomic MO"); Constant *OrderingVal = ConstantInt::get(Type::getInt32Ty(Ctx), (int)toCABI(Ordering)); @@ -2012,7 +2011,7 @@ bool AtomicExpandImpl::expandAtomicOpToLibcall( if (CASExpected) { AllocaCASExpected = AllocaBuilder.CreateAlloca(CASExpected->getType()); AllocaCASExpected->setAlignment(AllocaAlignment); - Builder.CreateLifetimeStart(AllocaCASExpected, SizeVal64); + Builder.CreateLifetimeStart(AllocaCASExpected); Builder.CreateAlignedStore(CASExpected, AllocaCASExpected, AllocaAlignment); Args.push_back(AllocaCASExpected); } @@ -2026,7 +2025,7 @@ bool AtomicExpandImpl::expandAtomicOpToLibcall( } else { AllocaValue = AllocaBuilder.CreateAlloca(ValueOperand->getType()); AllocaValue->setAlignment(AllocaAlignment); - Builder.CreateLifetimeStart(AllocaValue, SizeVal64); + Builder.CreateLifetimeStart(AllocaValue); Builder.CreateAlignedStore(ValueOperand, AllocaValue, AllocaAlignment); Args.push_back(AllocaValue); } @@ -2036,7 +2035,7 @@ bool AtomicExpandImpl::expandAtomicOpToLibcall( if (!CASExpected && HasResult && !UseSizedLibcall) { AllocaResult = AllocaBuilder.CreateAlloca(I->getType()); AllocaResult->setAlignment(AllocaAlignment); - Builder.CreateLifetimeStart(AllocaResult, SizeVal64); + Builder.CreateLifetimeStart(AllocaResult); Args.push_back(AllocaResult); } @@ -2069,7 +2068,7 @@ bool AtomicExpandImpl::expandAtomicOpToLibcall( // And then, extract the results... if (ValueOperand && !UseSizedLibcall) - Builder.CreateLifetimeEnd(AllocaValue, SizeVal64); + Builder.CreateLifetimeEnd(AllocaValue); if (CASExpected) { // The final result from the CAS is {load of 'expected' alloca, bool result @@ -2078,7 +2077,7 @@ bool AtomicExpandImpl::expandAtomicOpToLibcall( Value *V = PoisonValue::get(FinalResultTy); Value *ExpectedOut = Builder.CreateAlignedLoad( CASExpected->getType(), AllocaCASExpected, AllocaAlignment); - Builder.CreateLifetimeEnd(AllocaCASExpected, SizeVal64); + Builder.CreateLifetimeEnd(AllocaCASExpected); V = Builder.CreateInsertValue(V, ExpectedOut, 0); V = Builder.CreateInsertValue(V, Result, 1); I->replaceAllUsesWith(V); @@ -2089,7 +2088,7 @@ bool AtomicExpandImpl::expandAtomicOpToLibcall( else { V = Builder.CreateAlignedLoad(I->getType(), AllocaResult, AllocaAlignment); - Builder.CreateLifetimeEnd(AllocaResult, SizeVal64); + Builder.CreateLifetimeEnd(AllocaResult); } I->replaceAllUsesWith(V); } diff --git a/llvm/lib/CodeGen/BranchFolding.cpp b/llvm/lib/CodeGen/BranchFolding.cpp index dcfd9aa..7292bc2 100644 --- a/llvm/lib/CodeGen/BranchFolding.cpp +++ b/llvm/lib/CodeGen/BranchFolding.cpp @@ -1787,10 +1787,18 @@ ReoptimizeBlock: // below were performed for EH "FallThrough" blocks. Therefore, even if // that appears not to be happening anymore, we should assume that it is // possible and not remove the "!FallThrough()->isEHPad" condition below. + // + // Similarly, the analyzeBranch call does not consider callbr, which also + // introduces the possibility of infinite rotation, as there may be + // multiple successors of PrevBB. Thus we check such case by + // FallThrough->isInlineAsmBrIndirectTarget(). + // NOTE: Checking if PrevBB contains callbr is more precise, but much + // more expensive. 
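(The "more precise, but much more expensive" check mentioned in the NOTE above would amount to scanning PrevBB's terminators for a lowered callbr, i.e. an INLINEASM_BR. A sketch of what that alternative might look like; it is not what the patch does:)

#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/TargetOpcodes.h"

// True if MBB ends in a callbr lowered to INLINEASM_BR, which can add
// indirect successors that analyzeBranch does not model.
static bool endsInCallBr(const llvm::MachineBasicBlock &MBB) {
  return llvm::any_of(MBB.terminators(), [](const llvm::MachineInstr &MI) {
    return MI.getOpcode() == llvm::TargetOpcode::INLINEASM_BR;
  });
}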
MachineBasicBlock *PrevTBB = nullptr, *PrevFBB = nullptr; SmallVector<MachineOperand, 4> PrevCond; - if (FallThrough != MF.end() && - !FallThrough->isEHPad() && + + if (FallThrough != MF.end() && !FallThrough->isEHPad() && + !FallThrough->isInlineAsmBrIndirectTarget() && !TII->analyzeBranch(PrevBB, PrevTBB, PrevFBB, PrevCond, true) && PrevBB.isSuccessor(&*FallThrough)) { MBB->moveAfter(&MF.back()); diff --git a/llvm/lib/CodeGen/CallingConvLower.cpp b/llvm/lib/CodeGen/CallingConvLower.cpp index b71e781..df34331 100644 --- a/llvm/lib/CodeGen/CallingConvLower.cpp +++ b/llvm/lib/CodeGen/CallingConvLower.cpp @@ -89,7 +89,7 @@ CCState::AnalyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Ins, for (unsigned i = 0; i != NumArgs; ++i) { MVT ArgVT = Ins[i].VT; ISD::ArgFlagsTy ArgFlags = Ins[i].Flags; - if (Fn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, *this)) + if (Fn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, Ins[i].OrigTy, *this)) report_fatal_error("unable to allocate function argument #" + Twine(i)); } } @@ -102,7 +102,7 @@ bool CCState::CheckReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, for (unsigned i = 0, e = Outs.size(); i != e; ++i) { MVT VT = Outs[i].VT; ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; - if (Fn(i, VT, VT, CCValAssign::Full, ArgFlags, *this)) + if (Fn(i, VT, VT, CCValAssign::Full, ArgFlags, Outs[i].OrigTy, *this)) return false; } return true; @@ -116,7 +116,7 @@ void CCState::AnalyzeReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, for (unsigned i = 0, e = Outs.size(); i != e; ++i) { MVT VT = Outs[i].VT; ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; - if (Fn(i, VT, VT, CCValAssign::Full, ArgFlags, *this)) + if (Fn(i, VT, VT, CCValAssign::Full, ArgFlags, Outs[i].OrigTy, *this)) report_fatal_error("unable to allocate function return #" + Twine(i)); } } @@ -129,7 +129,8 @@ void CCState::AnalyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs, for (unsigned i = 0; i != NumOps; ++i) { MVT ArgVT = Outs[i].VT; ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; - if (Fn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, *this)) { + if (Fn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, Outs[i].OrigTy, + *this)) { #ifndef NDEBUG dbgs() << "Call operand #" << i << " has unhandled type " << ArgVT << '\n'; @@ -142,12 +143,13 @@ void CCState::AnalyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs, /// Same as above except it takes vectors of types and argument flags. void CCState::AnalyzeCallOperands(SmallVectorImpl<MVT> &ArgVTs, SmallVectorImpl<ISD::ArgFlagsTy> &Flags, + SmallVectorImpl<Type *> &OrigTys, CCAssignFn Fn) { unsigned NumOps = ArgVTs.size(); for (unsigned i = 0; i != NumOps; ++i) { MVT ArgVT = ArgVTs[i]; ISD::ArgFlagsTy ArgFlags = Flags[i]; - if (Fn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, *this)) { + if (Fn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, OrigTys[i], *this)) { #ifndef NDEBUG dbgs() << "Call operand #" << i << " has unhandled type " << ArgVT << '\n'; @@ -164,7 +166,7 @@ void CCState::AnalyzeCallResult(const SmallVectorImpl<ISD::InputArg> &Ins, for (unsigned i = 0, e = Ins.size(); i != e; ++i) { MVT VT = Ins[i].VT; ISD::ArgFlagsTy Flags = Ins[i].Flags; - if (Fn(i, VT, VT, CCValAssign::Full, Flags, *this)) { + if (Fn(i, VT, VT, CCValAssign::Full, Flags, Ins[i].OrigTy, *this)) { #ifndef NDEBUG dbgs() << "Call result #" << i << " has unhandled type " << VT << '\n'; @@ -175,8 +177,8 @@ void CCState::AnalyzeCallResult(const SmallVectorImpl<ISD::InputArg> &Ins, } /// Same as above except it's specialized for calls that produce a single value. 
-void CCState::AnalyzeCallResult(MVT VT, CCAssignFn Fn) { - if (Fn(0, VT, VT, CCValAssign::Full, ISD::ArgFlagsTy(), *this)) { +void CCState::AnalyzeCallResult(MVT VT, Type *OrigTy, CCAssignFn Fn) { + if (Fn(0, VT, VT, CCValAssign::Full, ISD::ArgFlagsTy(), OrigTy, *this)) { #ifndef NDEBUG dbgs() << "Call result has unhandled type " << VT << '\n'; @@ -213,7 +215,8 @@ void CCState::getRemainingRegParmsForType(SmallVectorImpl<MCRegister> &Regs, // location in memory. bool HaveRegParm; do { - if (Fn(0, VT, VT, CCValAssign::Full, Flags, *this)) { + Type *OrigTy = EVT(VT).getTypeForEVT(Context); + if (Fn(0, VT, VT, CCValAssign::Full, Flags, OrigTy, *this)) { #ifndef NDEBUG dbgs() << "Call has unhandled type " << VT << " while computing remaining regparms\n"; diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index 9223739..0e40a92 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -377,7 +377,7 @@ public: /// to be optimized again. /// Note: Consider building time in this pass, when a BB updated, we need /// to insert such BB into FreshBBs for huge function. - SmallSet<BasicBlock *, 32> FreshBBs; + SmallPtrSet<BasicBlock *, 32> FreshBBs; void releaseMemory() { // Clear per function information. @@ -1105,7 +1105,7 @@ bool CodeGenPrepare::canMergeBlocks(const BasicBlock *BB, /// Replace all old uses with new ones, and push the updated BBs into FreshBBs. static void replaceAllUsesWith(Value *Old, Value *New, - SmallSet<BasicBlock *, 32> &FreshBBs, + SmallPtrSet<BasicBlock *, 32> &FreshBBs, bool IsHuge) { auto *OldI = dyn_cast<Instruction>(Old); if (OldI) { @@ -2135,7 +2135,7 @@ static bool isRemOfLoopIncrementWithLoopInvariant( // Rem = rem == RemAmtLoopInvariant ? 0 : Rem; static bool foldURemOfLoopIncrement(Instruction *Rem, const DataLayout *DL, const LoopInfo *LI, - SmallSet<BasicBlock *, 32> &FreshBBs, + SmallPtrSet<BasicBlock *, 32> &FreshBBs, bool IsHuge) { Value *AddOffset, *RemAmt, *AddInst; PHINode *LoopIncrPN; @@ -2534,11 +2534,10 @@ static bool OptimizeExtractBits(BinaryOperator *ShiftI, ConstantInt *CI, /// %ctz = phi i64 [ 64, %entry ], [ %z, %cond.false ] /// /// If the transform is performed, return true and set ModifiedDT to true. -static bool despeculateCountZeros(IntrinsicInst *CountZeros, - LoopInfo &LI, +static bool despeculateCountZeros(IntrinsicInst *CountZeros, LoopInfo &LI, const TargetLowering *TLI, const DataLayout *DL, ModifyDT &ModifiedDT, - SmallSet<BasicBlock *, 32> &FreshBBs, + SmallPtrSet<BasicBlock *, 32> &FreshBBs, bool IsHugeFunc) { // If a zero input is undefined, it doesn't make sense to despeculate that. 
if (match(CountZeros->getOperand(1), m_One())) @@ -4351,7 +4350,7 @@ private: PhiNodeSet &PhiNodesToMatch) { SmallVector<PHIPair, 8> WorkList; Matcher.insert({PHI, Candidate}); - SmallSet<PHINode *, 8> MatchedPHIs; + SmallPtrSet<PHINode *, 8> MatchedPHIs; MatchedPHIs.insert(PHI); WorkList.push_back({PHI, Candidate}); SmallSet<PHIPair, 8> Visited; @@ -8635,7 +8634,7 @@ static bool tryUnmergingGEPsAcrossIndirectBr(GetElementPtrInst *GEPI, } static bool optimizeBranch(BranchInst *Branch, const TargetLowering &TLI, - SmallSet<BasicBlock *, 32> &FreshBBs, + SmallPtrSet<BasicBlock *, 32> &FreshBBs, bool IsHugeFunc) { // Try and convert // %c = icmp ult %x, 8 diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp index 9b2851e..cd21e25 100644 --- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp +++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp @@ -67,6 +67,7 @@ #include "llvm/CodeGen/TargetLowering.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/IR/PatternMatch.h" #include "llvm/InitializePasses.h" #include "llvm/Target/TargetMachine.h" @@ -108,6 +109,42 @@ static bool isNeg(Value *V); static Value *getNegOperand(Value *V); namespace { +struct ComplexValue { + Value *Real = nullptr; + Value *Imag = nullptr; + + bool operator==(const ComplexValue &Other) const { + return Real == Other.Real && Imag == Other.Imag; + } +}; +hash_code hash_value(const ComplexValue &Arg) { + return hash_combine(DenseMapInfo<Value *>::getHashValue(Arg.Real), + DenseMapInfo<Value *>::getHashValue(Arg.Imag)); +} +} // end namespace +typedef SmallVector<struct ComplexValue, 2> ComplexValues; + +namespace llvm { +template <> struct DenseMapInfo<ComplexValue> { + static inline ComplexValue getEmptyKey() { + return {DenseMapInfo<Value *>::getEmptyKey(), + DenseMapInfo<Value *>::getEmptyKey()}; + } + static inline ComplexValue getTombstoneKey() { + return {DenseMapInfo<Value *>::getTombstoneKey(), + DenseMapInfo<Value *>::getTombstoneKey()}; + } + static unsigned getHashValue(const ComplexValue &Val) { + return hash_combine(DenseMapInfo<Value *>::getHashValue(Val.Real), + DenseMapInfo<Value *>::getHashValue(Val.Imag)); + } + static bool isEqual(const ComplexValue &LHS, const ComplexValue &RHS) { + return LHS.Real == RHS.Real && LHS.Imag == RHS.Imag; + } +}; +} // end namespace llvm + +namespace { template <typename T, typename IterT> std::optional<T> findCommonBetweenCollections(IterT A, IterT B) { auto Common = llvm::find_if(A, [B](T I) { return llvm::is_contained(B, I); }); @@ -145,7 +182,13 @@ struct ComplexDeinterleavingCompositeNode { ComplexDeinterleavingCompositeNode(ComplexDeinterleavingOperation Op, Value *R, Value *I) - : Operation(Op), Real(R), Imag(I) {} + : Operation(Op) { + Vals.push_back({R, I}); + } + + ComplexDeinterleavingCompositeNode(ComplexDeinterleavingOperation Op, + ComplexValues &Other) + : Operation(Op), Vals(Other) {} private: friend class ComplexDeinterleavingGraph; @@ -155,8 +198,7 @@ private: public: ComplexDeinterleavingOperation Operation; - Value *Real; - Value *Imag; + ComplexValues Vals; // This two members are required exclusively for generating // ComplexDeinterleavingOperation::Symmetric operations. 
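(The hash_value() overload and the DenseMapInfo specialization introduced above are what allow a (Real, Imag) pair, and vectors of such pairs, to serve as cache keys in this pass. A tiny sketch, with R and I standing in for arbitrary Value pointers; it assumes ComplexValue and its DenseMapInfo specialization are visible as defined above:)

#include "llvm/ADT/DenseMap.h"
#include "llvm/IR/Value.h"

static unsigned lookupOrInsert(llvm::Value *R, llvm::Value *I) {
  llvm::DenseMap<ComplexValue, unsigned> Cache;
  Cache.try_emplace(ComplexValue{R, I}, 1u);
  // Hashing and equality come from DenseMapInfo<ComplexValue>, so the same
  // {Real, Imag} pair finds the entry again.
  auto It = Cache.find(ComplexValue{R, I});
  return It == Cache.end() ? 0u : It->second;
}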
@@ -192,10 +234,12 @@ public: }; OS << "- CompositeNode: " << this << "\n"; - OS << " Real: "; - PrintValue(Real); - OS << " Imag: "; - PrintValue(Imag); + for (unsigned I = 0; I < Vals.size(); I++) { + OS << " Real(" << I << ") : "; + PrintValue(Vals[I].Real); + OS << " Imag(" << I << ") : "; + PrintValue(Vals[I].Imag); + } OS << " ReplacementNode: "; PrintValue(ReplacementNode); OS << " Operation: " << (int)Operation << "\n"; @@ -233,14 +277,16 @@ public: }; explicit ComplexDeinterleavingGraph(const TargetLowering *TL, - const TargetLibraryInfo *TLI) - : TL(TL), TLI(TLI) {} + const TargetLibraryInfo *TLI, + unsigned Factor) + : TL(TL), TLI(TLI), Factor(Factor) {} private: const TargetLowering *TL = nullptr; const TargetLibraryInfo *TLI = nullptr; + unsigned Factor; SmallVector<NodePtr> CompositeNodes; - DenseMap<std::pair<Value *, Value *>, NodePtr> CachedResult; + DenseMap<ComplexValues, NodePtr> CachedResult; SmallPtrSet<Instruction *, 16> FinalInstructions; @@ -305,10 +351,25 @@ private: I); } + NodePtr prepareCompositeNode(ComplexDeinterleavingOperation Operation, + ComplexValues &Vals) { +#ifndef NDEBUG + for (auto &V : Vals) { + assert( + ((Operation != ComplexDeinterleavingOperation::ReductionPHI && + Operation != ComplexDeinterleavingOperation::ReductionOperation) || + (V.Real && V.Imag)) && + "Reduction related nodes must have Real and Imaginary parts"); + } +#endif + return std::make_shared<ComplexDeinterleavingCompositeNode>(Operation, + Vals); + } + NodePtr submitCompositeNode(NodePtr Node) { CompositeNodes.push_back(Node); - if (Node->Real) - CachedResult[{Node->Real, Node->Imag}] = Node; + if (Node->Vals[0].Real) + CachedResult[Node->Vals] = Node; return Node; } @@ -340,11 +401,17 @@ private: /// 270: r: ar + bi /// i: ai - br NodePtr identifyAdd(Instruction *Real, Instruction *Imag); - NodePtr identifySymmetricOperation(Instruction *Real, Instruction *Imag); + NodePtr identifySymmetricOperation(ComplexValues &Vals); NodePtr identifyPartialReduction(Value *R, Value *I); NodePtr identifyDotProduct(Value *Inst); - NodePtr identifyNode(Value *R, Value *I); + NodePtr identifyNode(ComplexValues &Vals); + + NodePtr identifyNode(Value *R, Value *I) { + ComplexValues Vals; + Vals.push_back({R, I}); + return identifyNode(Vals); + } /// Determine if a sum of complex numbers can be formed from \p RealAddends /// and \p ImagAddens. If \p Accumulator is not null, add the result to it. @@ -388,15 +455,16 @@ private: /// operation: /// * Using two shufflevectors with even indices for /pReal instruction and /// odd indices for /pImag instructions (only for fixed-width vectors) - /// * Using two extractvalue instructions applied to `vector.deinterleave2` - /// intrinsic (for both fixed and scalable vectors) - NodePtr identifyDeinterleave(Instruction *Real, Instruction *Imag); + /// * Using N extractvalue instructions applied to `vector.deinterleaveN` + /// intrinsics (for both fixed and scalable vectors) where N is a multiple of + /// 2. + NodePtr identifyDeinterleave(ComplexValues &Vals); /// identifying the operation that represents a complex number repeated in a /// Splat vector. There are two possible types of splats: ConstantExpr with /// the opcode ShuffleVector and ShuffleVectorInstr. Both should have an /// initialization mask with all values set to zero. 
- NodePtr identifySplat(Value *Real, Value *Imag); + NodePtr identifySplat(ComplexValues &Vals); NodePtr identifyPHINode(Instruction *Real, Instruction *Imag); @@ -447,7 +515,7 @@ public: bool runOnFunction(Function &F); private: - bool evaluateBasicBlock(BasicBlock *B); + bool evaluateBasicBlock(BasicBlock *B, unsigned Factor); const TargetLowering *TL = nullptr; const TargetLibraryInfo *TLI = nullptr; @@ -500,7 +568,15 @@ bool ComplexDeinterleaving::runOnFunction(Function &F) { bool Changed = false; for (auto &B : F) - Changed |= evaluateBasicBlock(&B); + Changed |= evaluateBasicBlock(&B, 2); + + // TODO: Permit changes for both interleave factors in the same function. + if (!Changed) { + for (auto &B : F) + Changed |= evaluateBasicBlock(&B, 4); + } + + // TODO: We can also support interleave factors of 6 and 8 if needed. return Changed; } @@ -545,8 +621,8 @@ Value *getNegOperand(Value *V) { return I->getOperand(1); } -bool ComplexDeinterleaving::evaluateBasicBlock(BasicBlock *B) { - ComplexDeinterleavingGraph Graph(TL, TLI); +bool ComplexDeinterleaving::evaluateBasicBlock(BasicBlock *B, unsigned Factor) { + ComplexDeinterleavingGraph Graph(TL, TLI, Factor); if (Graph.collectPotentialReductions(B)) Graph.identifyReductionNodes(); @@ -669,6 +745,7 @@ ComplexDeinterleavingGraph::identifyPartialMul(Instruction *Real, Instruction *Imag) { LLVM_DEBUG(dbgs() << "identifyPartialMul " << *Real << " / " << *Imag << "\n"); + // Determine rotation auto IsAdd = [](unsigned Op) { return Op == Instruction::FAdd || Op == Instruction::Add; @@ -865,43 +942,57 @@ static bool isInstructionPotentiallySymmetric(Instruction *I) { } ComplexDeinterleavingGraph::NodePtr -ComplexDeinterleavingGraph::identifySymmetricOperation(Instruction *Real, - Instruction *Imag) { - if (Real->getOpcode() != Imag->getOpcode()) - return nullptr; +ComplexDeinterleavingGraph::identifySymmetricOperation(ComplexValues &Vals) { + auto *FirstReal = cast<Instruction>(Vals[0].Real); + unsigned FirstOpc = FirstReal->getOpcode(); + for (auto &V : Vals) { + auto *Real = cast<Instruction>(V.Real); + auto *Imag = cast<Instruction>(V.Imag); + if (Real->getOpcode() != FirstOpc || Imag->getOpcode() != FirstOpc) + return nullptr; - if (!isInstructionPotentiallySymmetric(Real) || - !isInstructionPotentiallySymmetric(Imag)) - return nullptr; + if (!isInstructionPotentiallySymmetric(Real) || + !isInstructionPotentiallySymmetric(Imag)) + return nullptr; - auto *R0 = Real->getOperand(0); - auto *I0 = Imag->getOperand(0); + if (isa<FPMathOperator>(FirstReal)) + if (Real->getFastMathFlags() != FirstReal->getFastMathFlags() || + Imag->getFastMathFlags() != FirstReal->getFastMathFlags()) + return nullptr; + } - NodePtr Op0 = identifyNode(R0, I0); + ComplexValues OpVals; + for (auto &V : Vals) { + auto *R0 = cast<Instruction>(V.Real)->getOperand(0); + auto *I0 = cast<Instruction>(V.Imag)->getOperand(0); + OpVals.push_back({R0, I0}); + } + + NodePtr Op0 = identifyNode(OpVals); NodePtr Op1 = nullptr; if (Op0 == nullptr) return nullptr; - if (Real->isBinaryOp()) { - auto *R1 = Real->getOperand(1); - auto *I1 = Imag->getOperand(1); - Op1 = identifyNode(R1, I1); + if (FirstReal->isBinaryOp()) { + OpVals.clear(); + for (auto &V : Vals) { + auto *R1 = cast<Instruction>(V.Real)->getOperand(1); + auto *I1 = cast<Instruction>(V.Imag)->getOperand(1); + OpVals.push_back({R1, I1}); + } + Op1 = identifyNode(OpVals); if (Op1 == nullptr) return nullptr; } - if (isa<FPMathOperator>(Real) && - Real->getFastMathFlags() != Imag->getFastMathFlags()) - return nullptr; - - auto 
Node = prepareCompositeNode(ComplexDeinterleavingOperation::Symmetric, - Real, Imag); - Node->Opcode = Real->getOpcode(); - if (isa<FPMathOperator>(Real)) - Node->Flags = Real->getFastMathFlags(); + auto Node = + prepareCompositeNode(ComplexDeinterleavingOperation::Symmetric, Vals); + Node->Opcode = FirstReal->getOpcode(); + if (isa<FPMathOperator>(FirstReal)) + Node->Flags = FirstReal->getFastMathFlags(); Node->addOperand(Op0); - if (Real->isBinaryOp()) + if (FirstReal->isBinaryOp()) Node->addOperand(Op1); return submitCompositeNode(Node); @@ -909,7 +1000,6 @@ ComplexDeinterleavingGraph::identifySymmetricOperation(Instruction *Real, ComplexDeinterleavingGraph::NodePtr ComplexDeinterleavingGraph::identifyDotProduct(Value *V) { - if (!TL->isComplexDeinterleavingOperationSupported( ComplexDeinterleavingOperation::CDot, V->getType())) { LLVM_DEBUG(dbgs() << "Target doesn't support complex deinterleaving " @@ -1054,65 +1144,77 @@ ComplexDeinterleavingGraph::identifyPartialReduction(Value *R, Value *I) { } ComplexDeinterleavingGraph::NodePtr -ComplexDeinterleavingGraph::identifyNode(Value *R, Value *I) { - auto It = CachedResult.find({R, I}); +ComplexDeinterleavingGraph::identifyNode(ComplexValues &Vals) { + auto It = CachedResult.find(Vals); if (It != CachedResult.end()) { LLVM_DEBUG(dbgs() << " - Folding to existing node\n"); return It->second; } - if (NodePtr CN = identifyPartialReduction(R, I)) - return CN; - - bool IsReduction = RealPHI == R && (!ImagPHI || ImagPHI == I); - if (!IsReduction && R->getType() != I->getType()) - return nullptr; + if (Vals.size() == 1) { + assert(Factor == 2 && "Can only handle interleave factors of 2"); + Value *R = Vals[0].Real; + Value *I = Vals[0].Imag; + if (NodePtr CN = identifyPartialReduction(R, I)) + return CN; + bool IsReduction = RealPHI == R && (!ImagPHI || ImagPHI == I); + if (!IsReduction && R->getType() != I->getType()) + return nullptr; + } - if (NodePtr CN = identifySplat(R, I)) + if (NodePtr CN = identifySplat(Vals)) return CN; - auto *Real = dyn_cast<Instruction>(R); - auto *Imag = dyn_cast<Instruction>(I); - if (!Real || !Imag) - return nullptr; + for (auto &V : Vals) { + auto *Real = dyn_cast<Instruction>(V.Real); + auto *Imag = dyn_cast<Instruction>(V.Imag); + if (!Real || !Imag) + return nullptr; + } - if (NodePtr CN = identifyDeinterleave(Real, Imag)) + if (NodePtr CN = identifyDeinterleave(Vals)) return CN; - if (NodePtr CN = identifyPHINode(Real, Imag)) - return CN; + if (Vals.size() == 1) { + assert(Factor == 2 && "Can only handle interleave factors of 2"); + auto *Real = dyn_cast<Instruction>(Vals[0].Real); + auto *Imag = dyn_cast<Instruction>(Vals[0].Imag); + if (NodePtr CN = identifyPHINode(Real, Imag)) + return CN; - if (NodePtr CN = identifySelectNode(Real, Imag)) - return CN; + if (NodePtr CN = identifySelectNode(Real, Imag)) + return CN; - auto *VTy = cast<VectorType>(Real->getType()); - auto *NewVTy = VectorType::getDoubleElementsVectorType(VTy); + auto *VTy = cast<VectorType>(Real->getType()); + auto *NewVTy = VectorType::getDoubleElementsVectorType(VTy); - bool HasCMulSupport = TL->isComplexDeinterleavingOperationSupported( - ComplexDeinterleavingOperation::CMulPartial, NewVTy); - bool HasCAddSupport = TL->isComplexDeinterleavingOperationSupported( - ComplexDeinterleavingOperation::CAdd, NewVTy); + bool HasCMulSupport = TL->isComplexDeinterleavingOperationSupported( + ComplexDeinterleavingOperation::CMulPartial, NewVTy); + bool HasCAddSupport = TL->isComplexDeinterleavingOperationSupported( + 
ComplexDeinterleavingOperation::CAdd, NewVTy); - if (HasCMulSupport && isInstructionPairMul(Real, Imag)) { - if (NodePtr CN = identifyPartialMul(Real, Imag)) - return CN; - } + if (HasCMulSupport && isInstructionPairMul(Real, Imag)) { + if (NodePtr CN = identifyPartialMul(Real, Imag)) + return CN; + } - if (HasCAddSupport && isInstructionPairAdd(Real, Imag)) { - if (NodePtr CN = identifyAdd(Real, Imag)) - return CN; - } + if (HasCAddSupport && isInstructionPairAdd(Real, Imag)) { + if (NodePtr CN = identifyAdd(Real, Imag)) + return CN; + } - if (HasCMulSupport && HasCAddSupport) { - if (NodePtr CN = identifyReassocNodes(Real, Imag)) - return CN; + if (HasCMulSupport && HasCAddSupport) { + if (NodePtr CN = identifyReassocNodes(Real, Imag)) { + return CN; + } + } } - if (NodePtr CN = identifySymmetricOperation(Real, Imag)) + if (NodePtr CN = identifySymmetricOperation(Vals)) return CN; LLVM_DEBUG(dbgs() << " - Not recognised as a valid pattern.\n"); - CachedResult[{R, I}] = nullptr; + CachedResult[Vals] = nullptr; return nullptr; } @@ -1256,9 +1358,10 @@ ComplexDeinterleavingGraph::identifyReassocNodes(Instruction *Real, return nullptr; } assert(FinalNode && "FinalNode can not be nullptr here"); + assert(FinalNode->Vals.size() == 1); // Set the Real and Imag fields of the final node and submit it - FinalNode->Real = Real; - FinalNode->Imag = Imag; + FinalNode->Vals[0].Real = Real; + FinalNode->Vals[0].Imag = Imag; submitCompositeNode(FinalNode); return FinalNode; } @@ -1381,7 +1484,7 @@ ComplexDeinterleavingGraph::identifyMultiplications( auto NodeA = It->second; auto NodeB = PMI.Node; - auto IsMultiplicandReal = PMI.Common == NodeA->Real; + auto IsMultiplicandReal = PMI.Common == NodeA->Vals[0].Real; // The following table illustrates the relationship between multiplications // and rotations. If we consider the multiplication (X + iY) * (U + iV), we // can see: @@ -1423,10 +1526,10 @@ ComplexDeinterleavingGraph::identifyMultiplications( LLVM_DEBUG({ dbgs() << "Identified partial multiplication (X, Y) * (U, V):\n"; - dbgs().indent(4) << "X: " << *NodeA->Real << "\n"; - dbgs().indent(4) << "Y: " << *NodeA->Imag << "\n"; - dbgs().indent(4) << "U: " << *NodeB->Real << "\n"; - dbgs().indent(4) << "V: " << *NodeB->Imag << "\n"; + dbgs().indent(4) << "X: " << *NodeA->Vals[0].Real << "\n"; + dbgs().indent(4) << "Y: " << *NodeA->Vals[0].Imag << "\n"; + dbgs().indent(4) << "U: " << *NodeB->Vals[0].Real << "\n"; + dbgs().indent(4) << "V: " << *NodeB->Vals[0].Imag << "\n"; dbgs().indent(4) << "Rotation - " << (int)Rotation * 90 << "\n"; }); @@ -1595,10 +1698,13 @@ bool ComplexDeinterleavingGraph::identifyNodes(Instruction *RootI) { ComplexDeinterleavingOperation::ReductionOperation || RootNode->Operation == ComplexDeinterleavingOperation::ReductionSingle); + assert(RootNode->Vals.size() == 1 && + "Cannot handle reductions involving multiple complex values"); // Find out which part, Real or Imag, comes later, and only if we come to // the latest part, add it to OrderedRoots. - auto *R = cast<Instruction>(RootNode->Real); - auto *I = RootNode->Imag ? cast<Instruction>(RootNode->Imag) : nullptr; + auto *R = cast<Instruction>(RootNode->Vals[0].Real); + auto *I = RootNode->Vals[0].Imag ? 
cast<Instruction>(RootNode->Vals[0].Imag) + : nullptr; Instruction *ReplacementAnchor; if (I) @@ -1631,6 +1737,8 @@ bool ComplexDeinterleavingGraph::identifyNodes(Instruction *RootI) { bool ComplexDeinterleavingGraph::collectPotentialReductions(BasicBlock *B) { bool FoundPotentialReduction = false; + if (Factor != 2) + return false; auto *Br = dyn_cast<BranchInst>(B->getTerminator()); if (!Br || Br->getNumSuccessors() != 2) @@ -1682,6 +1790,8 @@ bool ComplexDeinterleavingGraph::collectPotentialReductions(BasicBlock *B) { } void ComplexDeinterleavingGraph::identifyReductionNodes() { + assert(Factor == 2 && "Cannot handle multiple complex values"); + SmallVector<bool> Processed(ReductionInfo.size(), false); SmallVector<Instruction *> OperationInstruction; for (auto &P : ReductionInfo) @@ -1771,11 +1881,11 @@ void ComplexDeinterleavingGraph::identifyReductionNodes() { } bool ComplexDeinterleavingGraph::checkNodes() { - bool FoundDeinterleaveNode = false; for (NodePtr N : CompositeNodes) { if (!N->areOperandsValid()) return false; + if (N->Operation == ComplexDeinterleavingOperation::Deinterleave) FoundDeinterleaveNode = true; } @@ -1861,17 +1971,33 @@ bool ComplexDeinterleavingGraph::checkNodes() { ComplexDeinterleavingGraph::NodePtr ComplexDeinterleavingGraph::identifyRoot(Instruction *RootI) { if (auto *Intrinsic = dyn_cast<IntrinsicInst>(RootI)) { - if (Intrinsic->getIntrinsicID() != Intrinsic::vector_interleave2) + if (Intrinsic::getInterleaveIntrinsicID(Factor) != + Intrinsic->getIntrinsicID()) return nullptr; - auto *Real = dyn_cast<Instruction>(Intrinsic->getOperand(0)); - auto *Imag = dyn_cast<Instruction>(Intrinsic->getOperand(1)); - if (!Real || !Imag) - return nullptr; + ComplexValues Vals; + for (unsigned I = 0; I < Factor; I += 2) { + auto *Real = dyn_cast<Instruction>(Intrinsic->getOperand(I)); + auto *Imag = dyn_cast<Instruction>(Intrinsic->getOperand(I + 1)); + if (!Real || !Imag) + return nullptr; + Vals.push_back({Real, Imag}); + } - return identifyNode(Real, Imag); + ComplexDeinterleavingGraph::NodePtr Node1 = identifyNode(Vals); + if (!Node1) + return nullptr; + return Node1; } + // TODO: We could also add support for fixed-width interleave factors of 4 + // and above, but currently for symmetric operations the interleaves and + // deinterleaves are already removed by VectorCombine. If we extend this to + // permit complex multiplications, reductions, etc. then we should also add + // support for fixed-width here. + if (Factor != 2) + return nullptr; + auto *SVI = dyn_cast<ShuffleVectorInst>(RootI); if (!SVI) return nullptr; @@ -1890,22 +2016,52 @@ ComplexDeinterleavingGraph::identifyRoot(Instruction *RootI) { } ComplexDeinterleavingGraph::NodePtr -ComplexDeinterleavingGraph::identifyDeinterleave(Instruction *Real, - Instruction *Imag) { - Instruction *I = nullptr; - Value *FinalValue = nullptr; - if (match(Real, m_ExtractValue<0>(m_Instruction(I))) && - match(Imag, m_ExtractValue<1>(m_Specific(I))) && - match(I, m_Intrinsic<Intrinsic::vector_deinterleave2>( - m_Value(FinalValue)))) { +ComplexDeinterleavingGraph::identifyDeinterleave(ComplexValues &Vals) { + Instruction *II = nullptr; + + // Must be at least one complex value. 
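(To make the multi-value root concrete: identifyRoot() above pairs the interleave intrinsic's operands two at a time, so an llvm.vector.interleave4 root yields two ComplexValue entries. A sketch of that pairing, assuming the ComplexValues typedef from earlier in this file:)

#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/Value.h"

// Operands of an interleaveN root alternate real/imag, e.g.
// {Real0, Imag0, Real1, Imag1} for Factor == 4.
static ComplexValues pairInterleaveOperands(llvm::ArrayRef<llvm::Value *> Ops) {
  ComplexValues Vals;
  for (unsigned Idx = 0; Idx + 1 < Ops.size(); Idx += 2)
    Vals.push_back({Ops[Idx], Ops[Idx + 1]});
  return Vals;
}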
+ auto CheckExtract = [&](Value *V, unsigned ExpectedIdx, + Instruction *ExpectedInsn) -> ExtractValueInst * { + auto *EVI = dyn_cast<ExtractValueInst>(V); + if (!EVI || EVI->getNumIndices() != 1 || + EVI->getIndices()[0] != ExpectedIdx || + !isa<Instruction>(EVI->getAggregateOperand()) || + (ExpectedInsn && ExpectedInsn != EVI->getAggregateOperand())) + return nullptr; + return EVI; + }; + + for (unsigned Idx = 0; Idx < Vals.size(); Idx++) { + ExtractValueInst *RealEVI = CheckExtract(Vals[Idx].Real, Idx * 2, II); + if (RealEVI && Idx == 0) + II = cast<Instruction>(RealEVI->getAggregateOperand()); + if (!RealEVI || !CheckExtract(Vals[Idx].Imag, (Idx * 2) + 1, II)) { + II = nullptr; + break; + } + } + + if (auto *IntrinsicII = dyn_cast_or_null<IntrinsicInst>(II)) { + if (IntrinsicII->getIntrinsicID() != + Intrinsic::getDeinterleaveIntrinsicID(2 * Vals.size())) + return nullptr; + + // The remaining should match too. NodePtr PlaceholderNode = prepareCompositeNode( - llvm::ComplexDeinterleavingOperation::Deinterleave, Real, Imag); - PlaceholderNode->ReplacementNode = FinalValue; - FinalInstructions.insert(Real); - FinalInstructions.insert(Imag); + llvm::ComplexDeinterleavingOperation::Deinterleave, Vals); + PlaceholderNode->ReplacementNode = II->getOperand(0); + for (auto &V : Vals) { + FinalInstructions.insert(cast<Instruction>(V.Real)); + FinalInstructions.insert(cast<Instruction>(V.Imag)); + } return submitCompositeNode(PlaceholderNode); } + if (Vals.size() != 1) + return nullptr; + + Value *Real = Vals[0].Real; + Value *Imag = Vals[0].Imag; auto *RealShuffle = dyn_cast<ShuffleVectorInst>(Real); auto *ImagShuffle = dyn_cast<ShuffleVectorInst>(Imag); if (!RealShuffle || !ImagShuffle) { @@ -1999,7 +2155,7 @@ ComplexDeinterleavingGraph::identifyDeinterleave(Instruction *Real, } ComplexDeinterleavingGraph::NodePtr -ComplexDeinterleavingGraph::identifySplat(Value *R, Value *I) { +ComplexDeinterleavingGraph::identifySplat(ComplexValues &Vals) { auto IsSplat = [](Value *V) -> bool { // Fixed-width vector with constants if (isa<ConstantDataVector>(V)) @@ -2033,24 +2189,39 @@ ComplexDeinterleavingGraph::identifySplat(Value *R, Value *I) { return all_equal(Mask) && Mask[0] == 0; }; - if (!IsSplat(R) || !IsSplat(I)) - return nullptr; - - auto *Real = dyn_cast<Instruction>(R); - auto *Imag = dyn_cast<Instruction>(I); - if ((!Real && Imag) || (Real && !Imag)) - return nullptr; + // The splats must meet the following requirements: + // 1. Must either be all instructions or all values. + // 2. Non-constant splats must live in the same block. 
+ if (auto *FirstValAsInstruction = dyn_cast<Instruction>(Vals[0].Real)) { + BasicBlock *FirstBB = FirstValAsInstruction->getParent(); + for (auto &V : Vals) { + if (!IsSplat(V.Real) || !IsSplat(V.Imag)) + return nullptr; - if (Real && Imag) { - // Non-constant splats should be in the same basic block - if (Real->getParent() != Imag->getParent()) - return nullptr; + auto *Real = dyn_cast<Instruction>(V.Real); + auto *Imag = dyn_cast<Instruction>(V.Imag); + if (!Real || !Imag || Real->getParent() != FirstBB || + Imag->getParent() != FirstBB) + return nullptr; + } + } else { + for (auto &V : Vals) { + if (!IsSplat(V.Real) || !IsSplat(V.Imag) || isa<Instruction>(V.Real) || + isa<Instruction>(V.Imag)) + return nullptr; + } + } - FinalInstructions.insert(Real); - FinalInstructions.insert(Imag); + for (auto &V : Vals) { + auto *Real = dyn_cast<Instruction>(V.Real); + auto *Imag = dyn_cast<Instruction>(V.Imag); + if (Real && Imag) { + FinalInstructions.insert(Real); + FinalInstructions.insert(Imag); + } } NodePtr PlaceholderNode = - prepareCompositeNode(ComplexDeinterleavingOperation::Splat, R, I); + prepareCompositeNode(ComplexDeinterleavingOperation::Splat, Vals); return submitCompositeNode(PlaceholderNode); } @@ -2186,24 +2357,35 @@ Value *ComplexDeinterleavingGraph::replaceNode(IRBuilderBase &Builder, llvm_unreachable("Deinterleave node should already have ReplacementNode"); break; case ComplexDeinterleavingOperation::Splat: { - auto *R = dyn_cast<Instruction>(Node->Real); - auto *I = dyn_cast<Instruction>(Node->Imag); + SmallVector<Value *> Ops; + for (auto &V : Node->Vals) { + Ops.push_back(V.Real); + Ops.push_back(V.Imag); + } + auto *R = dyn_cast<Instruction>(Node->Vals[0].Real); + auto *I = dyn_cast<Instruction>(Node->Vals[0].Imag); if (R && I) { // Splats that are not constant are interleaved where they are located - Instruction *InsertPoint = (I->comesBefore(R) ? R : I)->getNextNode(); + Instruction *InsertPoint = R; + for (auto V : Node->Vals) { + if (InsertPoint->comesBefore(cast<Instruction>(V.Real))) + InsertPoint = cast<Instruction>(V.Real); + if (InsertPoint->comesBefore(cast<Instruction>(V.Imag))) + InsertPoint = cast<Instruction>(V.Imag); + } + InsertPoint = InsertPoint->getNextNode(); IRBuilder<> IRB(InsertPoint); - ReplacementNode = IRB.CreateVectorInterleave({Node->Real, Node->Imag}); + ReplacementNode = IRB.CreateVectorInterleave(Ops); } else { - ReplacementNode = - Builder.CreateVectorInterleave({Node->Real, Node->Imag}); + ReplacementNode = Builder.CreateVectorInterleave(Ops); } break; } case ComplexDeinterleavingOperation::ReductionPHI: { // If Operation is ReductionPHI, a new empty PHINode is created. // It is filled later when the ReductionOperation is processed. 
- auto *OldPHI = cast<PHINode>(Node->Real); - auto *VTy = cast<VectorType>(Node->Real->getType()); + auto *OldPHI = cast<PHINode>(Node->Vals[0].Real); + auto *VTy = cast<VectorType>(Node->Vals[0].Real->getType()); auto *NewVTy = VectorType::getDoubleElementsVectorType(VTy); auto *NewPHI = PHINode::Create(NewVTy, 0, "", BackEdge->getFirstNonPHIIt()); OldToNewPHI[OldPHI] = NewPHI; @@ -2219,8 +2401,8 @@ Value *ComplexDeinterleavingGraph::replaceNode(IRBuilderBase &Builder, processReductionOperation(ReplacementNode, Node); break; case ComplexDeinterleavingOperation::ReductionSelect: { - auto *MaskReal = cast<Instruction>(Node->Real)->getOperand(0); - auto *MaskImag = cast<Instruction>(Node->Imag)->getOperand(0); + auto *MaskReal = cast<Instruction>(Node->Vals[0].Real)->getOperand(0); + auto *MaskImag = cast<Instruction>(Node->Vals[0].Imag)->getOperand(0); auto *A = replaceNode(Builder, Node->Operands[0]); auto *B = replaceNode(Builder, Node->Operands[1]); auto *NewMask = Builder.CreateVectorInterleave({MaskReal, MaskImag}); @@ -2237,7 +2419,7 @@ Value *ComplexDeinterleavingGraph::replaceNode(IRBuilderBase &Builder, void ComplexDeinterleavingGraph::processReductionSingle( Value *OperationReplacement, RawNodePtr Node) { - auto *Real = cast<Instruction>(Node->Real); + auto *Real = cast<Instruction>(Node->Vals[0].Real); auto *OldPHI = ReductionInfo[Real].first; auto *NewPHI = OldToNewPHI[OldPHI]; auto *VTy = cast<VectorType>(Real->getType()); @@ -2269,8 +2451,8 @@ void ComplexDeinterleavingGraph::processReductionSingle( void ComplexDeinterleavingGraph::processReductionOperation( Value *OperationReplacement, RawNodePtr Node) { - auto *Real = cast<Instruction>(Node->Real); - auto *Imag = cast<Instruction>(Node->Imag); + auto *Real = cast<Instruction>(Node->Vals[0].Real); + auto *Imag = cast<Instruction>(Node->Vals[0].Imag); auto *OldPHIReal = ReductionInfo[Real].first; auto *OldPHIImag = ReductionInfo[Imag].first; auto *NewPHI = OldToNewPHI[OldPHIReal]; @@ -2318,15 +2500,15 @@ void ComplexDeinterleavingGraph::replaceNodes() { if (RootNode->Operation == ComplexDeinterleavingOperation::ReductionOperation) { - auto *RootReal = cast<Instruction>(RootNode->Real); - auto *RootImag = cast<Instruction>(RootNode->Imag); + auto *RootReal = cast<Instruction>(RootNode->Vals[0].Real); + auto *RootImag = cast<Instruction>(RootNode->Vals[0].Imag); ReductionInfo[RootReal].first->removeIncomingValue(BackEdge); ReductionInfo[RootImag].first->removeIncomingValue(BackEdge); DeadInstrRoots.push_back(RootReal); DeadInstrRoots.push_back(RootImag); } else if (RootNode->Operation == ComplexDeinterleavingOperation::ReductionSingle) { - auto *RootInst = cast<Instruction>(RootNode->Real); + auto *RootInst = cast<Instruction>(RootNode->Vals[0].Real); auto &Info = ReductionInfo[RootInst]; Info.first->removeIncomingValue(BackEdge); DeadInstrRoots.push_back(Info.second); diff --git a/llvm/lib/CodeGen/ExpandVectorPredication.cpp b/llvm/lib/CodeGen/ExpandVectorPredication.cpp index d8e3f5f..753c656 100644 --- a/llvm/lib/CodeGen/ExpandVectorPredication.cpp +++ b/llvm/lib/CodeGen/ExpandVectorPredication.cpp @@ -508,8 +508,7 @@ bool CachingVPExpander::discardEVLParameter(VPIntrinsic &VPI) { IRBuilder<> Builder(VPI.getParent(), VPI.getIterator()); Value *FactorConst = Builder.getInt32(StaticElemCount.getKnownMinValue()); Value *VScale = Builder.CreateVScale(Int32Ty, "vscale"); - MaxEVL = Builder.CreateMul(VScale, FactorConst, "scalable_size", - /*NUW*/ true, /*NSW*/ false); + MaxEVL = Builder.CreateNUWMul(VScale, FactorConst, 
"scalable_size"); } else { MaxEVL = ConstantInt::get(Int32Ty, StaticElemCount.getFixedValue(), false); } diff --git a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp index 0f3ec8b..90a18b86 100644 --- a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp @@ -1099,7 +1099,7 @@ bool CallLowering::checkReturn(CCState &CCInfo, CCAssignFn *Fn) const { for (unsigned I = 0, E = Outs.size(); I < E; ++I) { MVT VT = MVT::getVT(Outs[I].Ty); - if (Fn(I, VT, VT, CCValAssign::Full, Outs[I].Flags[0], CCInfo)) + if (Fn(I, VT, VT, CCValAssign::Full, Outs[I].Flags[0], Outs[I].Ty, CCInfo)) return false; } return true; diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index e84ba91..8163dea 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -1821,10 +1821,29 @@ bool CombinerHelper::matchPtrAddImmedChain(MachineInstr &MI, return false; } + // Reassociating nuw additions preserves nuw. If both original G_PTR_ADDs are + // inbounds, reaching the same result in one G_PTR_ADD is also inbounds. + // The nusw constraints are satisfied because imm1+imm2 cannot exceed the + // largest signed integer that fits into the index type, which is the maximum + // size of allocated objects according to the IR Language Reference. + unsigned PtrAddFlags = MI.getFlags(); + unsigned LHSPtrAddFlags = Add2Def->getFlags(); + bool IsNoUWrap = PtrAddFlags & LHSPtrAddFlags & MachineInstr::MIFlag::NoUWrap; + bool IsInBounds = + PtrAddFlags & LHSPtrAddFlags & MachineInstr::MIFlag::InBounds; + unsigned Flags = 0; + if (IsNoUWrap) + Flags |= MachineInstr::MIFlag::NoUWrap; + if (IsInBounds) { + Flags |= MachineInstr::MIFlag::InBounds; + Flags |= MachineInstr::MIFlag::NoUSWrap; + } + // Pass the combined immediate to the apply function. MatchInfo.Imm = AMNew.BaseOffs; MatchInfo.Base = Base; MatchInfo.Bank = getRegBank(Imm2); + MatchInfo.Flags = Flags; return true; } @@ -1838,6 +1857,7 @@ void CombinerHelper::applyPtrAddImmedChain(MachineInstr &MI, Observer.changingInstr(MI); MI.getOperand(1).setReg(MatchInfo.Base); MI.getOperand(2).setReg(NewOffset.getReg(0)); + MI.setFlags(MatchInfo.Flags); Observer.changedInstr(MI); } @@ -4871,14 +4891,34 @@ bool CombinerHelper::matchReassocConstantInnerRHS(GPtrAdd &MI, if (!C2) return false; + // If both additions are nuw, the reassociated additions are also nuw. + // If the original G_PTR_ADD is additionally nusw, X and C are both not + // negative, so BASE+X is between BASE and BASE+(X+C). The new G_PTR_ADDs are + // therefore also nusw. + // If the original G_PTR_ADD is additionally inbounds (which implies nusw), + // the new G_PTR_ADDs are then also inbounds. 
+ unsigned PtrAddFlags = MI.getFlags(); + unsigned AddFlags = RHS->getFlags(); + bool IsNoUWrap = PtrAddFlags & AddFlags & MachineInstr::MIFlag::NoUWrap; + bool IsNoUSWrap = IsNoUWrap && (PtrAddFlags & MachineInstr::MIFlag::NoUSWrap); + bool IsInBounds = IsNoUWrap && (PtrAddFlags & MachineInstr::MIFlag::InBounds); + unsigned Flags = 0; + if (IsNoUWrap) + Flags |= MachineInstr::MIFlag::NoUWrap; + if (IsNoUSWrap) + Flags |= MachineInstr::MIFlag::NoUSWrap; + if (IsInBounds) + Flags |= MachineInstr::MIFlag::InBounds; + MatchInfo = [=, &MI](MachineIRBuilder &B) { LLT PtrTy = MRI.getType(MI.getOperand(0).getReg()); auto NewBase = - Builder.buildPtrAdd(PtrTy, Src1Reg, RHS->getOperand(1).getReg()); + Builder.buildPtrAdd(PtrTy, Src1Reg, RHS->getOperand(1).getReg(), Flags); Observer.changingInstr(MI); MI.getOperand(1).setReg(NewBase.getReg(0)); MI.getOperand(2).setReg(RHS->getOperand(2).getReg()); + MI.setFlags(Flags); Observer.changedInstr(MI); }; return !reassociationCanBreakAddressingModePattern(MI); @@ -4897,6 +4937,25 @@ bool CombinerHelper::matchReassocConstantInnerLHS(GPtrAdd &MI, return false; auto *LHSPtrAdd = cast<GPtrAdd>(LHS); + + // Reassociating nuw additions preserves nuw. If both original G_PTR_ADDs are + // nuw and inbounds (which implies nusw), the offsets are both non-negative, + // so the new G_PTR_ADDs are also inbounds. + unsigned PtrAddFlags = MI.getFlags(); + unsigned LHSPtrAddFlags = LHSPtrAdd->getFlags(); + bool IsNoUWrap = PtrAddFlags & LHSPtrAddFlags & MachineInstr::MIFlag::NoUWrap; + bool IsNoUSWrap = IsNoUWrap && (PtrAddFlags & LHSPtrAddFlags & + MachineInstr::MIFlag::NoUSWrap); + bool IsInBounds = IsNoUWrap && (PtrAddFlags & LHSPtrAddFlags & + MachineInstr::MIFlag::InBounds); + unsigned Flags = 0; + if (IsNoUWrap) + Flags |= MachineInstr::MIFlag::NoUWrap; + if (IsNoUSWrap) + Flags |= MachineInstr::MIFlag::NoUSWrap; + if (IsInBounds) + Flags |= MachineInstr::MIFlag::InBounds; + MatchInfo = [=, &MI](MachineIRBuilder &B) { // When we change LHSPtrAdd's offset register we might cause it to use a reg // before its def. Sink the instruction so the outer PTR_ADD to ensure this @@ -4907,9 +4966,11 @@ bool CombinerHelper::matchReassocConstantInnerLHS(GPtrAdd &MI, auto NewCst = B.buildConstant(MRI.getType(RHSReg), LHSCstOff->Value); Observer.changingInstr(MI); MI.getOperand(2).setReg(NewCst.getReg(0)); + MI.setFlags(Flags); Observer.changedInstr(MI); Observer.changingInstr(*LHSPtrAdd); LHSPtrAdd->getOperand(2).setReg(RHSReg); + LHSPtrAdd->setFlags(Flags); Observer.changedInstr(*LHSPtrAdd); }; return !reassociationCanBreakAddressingModePattern(MI); @@ -4933,11 +4994,30 @@ bool CombinerHelper::matchReassocFoldConstantsInSubTree( if (!C2) return false; + // Reassociating nuw additions preserves nuw. If both original G_PTR_ADDs are + // inbounds, reaching the same result in one G_PTR_ADD is also inbounds. + // The nusw constraints are satisfied because imm1+imm2 cannot exceed the + // largest signed integer that fits into the index type, which is the maximum + // size of allocated objects according to the IR Language Reference. 
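// Illustrative aside, not part of the patch: matchPtrAddImmedChain earlier and
// matchReassocFoldConstantsInSubTree here derive the same flag set; the helper
// name below is hypothetical and only restates that rule in one place.
static unsigned foldedPtrAddFlags(unsigned OuterFlags, unsigned InnerFlags) {
  unsigned Flags = 0;
  // nuw survives only if both original G_PTR_ADDs were nuw.
  if (OuterFlags & InnerFlags & MachineInstr::MIFlag::NoUWrap)
    Flags |= MachineInstr::MIFlag::NoUWrap;
  // inbounds on both yields inbounds, and the imm1+imm2 bound argument above
  // additionally justifies nusw on the folded G_PTR_ADD.
  if (OuterFlags & InnerFlags & MachineInstr::MIFlag::InBounds)
    Flags |= MachineInstr::MIFlag::InBounds | MachineInstr::MIFlag::NoUSWrap;
  return Flags;
}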
+ unsigned PtrAddFlags = MI.getFlags(); + unsigned LHSPtrAddFlags = LHSPtrAdd->getFlags(); + bool IsNoUWrap = PtrAddFlags & LHSPtrAddFlags & MachineInstr::MIFlag::NoUWrap; + bool IsInBounds = + PtrAddFlags & LHSPtrAddFlags & MachineInstr::MIFlag::InBounds; + unsigned Flags = 0; + if (IsNoUWrap) + Flags |= MachineInstr::MIFlag::NoUWrap; + if (IsInBounds) { + Flags |= MachineInstr::MIFlag::InBounds; + Flags |= MachineInstr::MIFlag::NoUSWrap; + } + MatchInfo = [=, &MI](MachineIRBuilder &B) { auto NewCst = B.buildConstant(MRI.getType(Src2Reg), *C1 + *C2); Observer.changingInstr(MI); MI.getOperand(1).setReg(LHSSrc1); MI.getOperand(2).setReg(NewCst.getReg(0)); + MI.setFlags(Flags); Observer.changedInstr(MI); }; return !reassociationCanBreakAddressingModePattern(MI); diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index bbfae57..8424a81 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -2209,7 +2209,7 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, unsigned Op = ID == Intrinsic::lifetime_start ? TargetOpcode::LIFETIME_START : TargetOpcode::LIFETIME_END; - const AllocaInst *AI = dyn_cast<AllocaInst>(CI.getArgOperand(1)); + const AllocaInst *AI = dyn_cast<AllocaInst>(CI.getArgOperand(0)); if (!AI || !AI->isStaticAlloca()) return true; @@ -2522,6 +2522,9 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, Opc = ID == Intrinsic::vector_reduce_fadd ? TargetOpcode::G_VECREDUCE_SEQ_FADD : TargetOpcode::G_VECREDUCE_SEQ_FMUL; + if (!MRI->getType(VecSrc).isVector()) + Opc = ID == Intrinsic::vector_reduce_fadd ? TargetOpcode::G_FADD + : TargetOpcode::G_FMUL; MIRBuilder.buildInstr(Opc, {Dst}, {ScalarSrc, VecSrc}, MachineInstr::copyFlagsFromInstruction(CI)); return true; @@ -2556,6 +2559,7 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, getOrCreateVReg(*ConstantInt::getTrue(CI.getType()))); return true; case Intrinsic::amdgcn_cs_chain: + case Intrinsic::amdgcn_call_whole_wave: return translateCallBase(CI, MIRBuilder); case Intrinsic::fptrunc_round: { uint32_t Flags = MachineInstr::copyFlagsFromInstruction(CI); @@ -2786,11 +2790,14 @@ bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) { if (CI.isInlineAsm()) return translateInlineAsm(CI, MIRBuilder); - diagnoseDontCall(CI); - Intrinsic::ID ID = F ? 
F->getIntrinsicID() : Intrinsic::not_intrinsic; - if (!F || ID == Intrinsic::not_intrinsic) - return translateCallBase(CI, MIRBuilder); + if (!F || ID == Intrinsic::not_intrinsic) { + if (translateCallBase(CI, MIRBuilder)) { + diagnoseDontCall(CI); + return true; + } + return false; + } assert(ID != Intrinsic::not_intrinsic && "unknown intrinsic"); @@ -3513,7 +3520,7 @@ void IRTranslator::finishPendingPhis() { Verifier.setCurrentInst(PI); #endif // ifndef NDEBUG - SmallSet<const MachineBasicBlock *, 16> SeenPreds; + SmallPtrSet<const MachineBasicBlock *, 16> SeenPreds; for (unsigned i = 0; i < PI->getNumIncomingValues(); ++i) { auto IRPred = PI->getIncomingBlock(i); ArrayRef<Register> ValRegs = getOrCreateVRegs(*PI->getIncomingValue(i)); diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index d9d3569..008c188 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -5574,12 +5574,19 @@ LegalizerHelper::fewerElementsBitcast(MachineInstr &MI, unsigned int TypeIdx, unsigned NewElemCount = NarrowTy.getSizeInBits() / SrcTy.getScalarSizeInBits(); - LLT SrcNarrowTy = LLT::fixed_vector(NewElemCount, SrcTy.getElementType()); - - // Split the Src and Dst Reg into smaller registers SmallVector<Register> SrcVRegs, BitcastVRegs; - if (extractGCDType(SrcVRegs, DstTy, SrcNarrowTy, SrcReg) != SrcNarrowTy) - return UnableToLegalize; + if (NewElemCount == 1) { + LLT SrcNarrowTy = SrcTy.getElementType(); + + auto Unmerge = MIRBuilder.buildUnmerge(SrcNarrowTy, SrcReg); + getUnmergeResults(SrcVRegs, *Unmerge); + } else { + LLT SrcNarrowTy = LLT::fixed_vector(NewElemCount, SrcTy.getElementType()); + + // Split the Src and Dst Reg into smaller registers + if (extractGCDType(SrcVRegs, DstTy, SrcNarrowTy, SrcReg) != SrcNarrowTy) + return UnableToLegalize; + } // Build new smaller bitcast instructions // Not supporting Leftover types for now but will have to diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp index 8955dd0..e41fd81 100644 --- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp +++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp @@ -1869,8 +1869,10 @@ static bool canCreateUndefOrPoison(Register Reg, const MachineRegisterInfo &MRI, case TargetOpcode::G_FSHR: case TargetOpcode::G_SMAX: case TargetOpcode::G_SMIN: + case TargetOpcode::G_SCMP: case TargetOpcode::G_UMAX: case TargetOpcode::G_UMIN: + case TargetOpcode::G_UCMP: case TargetOpcode::G_PTRMASK: case TargetOpcode::G_SADDO: case TargetOpcode::G_SSUBO: diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp index 5e50898..93f6e39 100644 --- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -268,13 +268,16 @@ static Value *getMaskOperand(IntrinsicInst *II) { } } -// Return the corresponded deinterleaved mask, or nullptr if there is no valid -// mask. -static Value *getMask(Value *WideMask, unsigned Factor, - ElementCount LeafValueEC); - -static Value *getMask(Value *WideMask, unsigned Factor, - VectorType *LeafValueTy) { +// Return a pair of +// (1) The corresponded deinterleaved mask, or nullptr if there is no valid +// mask. +// (2) Some mask effectively skips a certain field, and this element is a mask +// in which inactive lanes represent fields that are skipped (i.e. "gaps"). 
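// Illustrative sketch, not part of the patch: the call sites below consume the
// (mask, gap mask) pair in essentially this shape. "WideMask", "LeafTy" and the
// enclosing function are placeholders; of the callers, only the shuffle-based
// load path forwards GapMask to the target instead of giving up on gaps.
static bool consumesMaskPair(Value *WideMask, unsigned Factor,
                             VectorType *LeafTy) {
  auto [Mask, GapMask] = getMask(WideMask, Factor, LeafTy);
  if (!Mask)
    return false;                     // no usable per-field mask
  if (GapMask.popcount() != Factor)   // an entire field is disabled: a "gap"
    return true;                      // gap-unaware paths stop here, though the
                                      // IR may already have been changed
  // ... lower the interleaved access using Mask (and GapMask if supported) ...
  return true;
}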
+static std::pair<Value *, APInt> getMask(Value *WideMask, unsigned Factor, + ElementCount LeafValueEC); + +static std::pair<Value *, APInt> getMask(Value *WideMask, unsigned Factor, + VectorType *LeafValueTy) { return getMask(WideMask, Factor, LeafValueTy->getElementCount()); } @@ -379,22 +382,25 @@ bool InterleavedAccessImpl::lowerInterleavedLoad( replaceBinOpShuffles(BinOpShuffles.getArrayRef(), Shuffles, Load); Value *Mask = nullptr; + auto GapMask = APInt::getAllOnes(Factor); if (LI) { LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *Load << "\n"); } else { // Check mask operand. Handle both all-true/false and interleaved mask. - Mask = getMask(getMaskOperand(II), Factor, VecTy); + std::tie(Mask, GapMask) = getMask(getMaskOperand(II), Factor, VecTy); if (!Mask) return false; LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.load or masked.load: " << *Load << "\n"); + LLVM_DEBUG(dbgs() << "IA: With nominal factor " << Factor + << " and actual factor " << GapMask.popcount() << "\n"); } // Try to create target specific intrinsics to replace the load and // shuffles. if (!TLI->lowerInterleavedLoad(cast<Instruction>(Load), Mask, Shuffles, - Indices, Factor)) + Indices, Factor, GapMask)) // If Extracts is not empty, tryReplaceExtracts made changes earlier. return !Extracts.empty() || BinOpShuffleChanged; @@ -536,10 +542,15 @@ bool InterleavedAccessImpl::lowerInterleavedStore( } else { // Check mask operand. Handle both all-true/false and interleaved mask. unsigned LaneMaskLen = NumStoredElements / Factor; - Mask = getMask(getMaskOperand(II), Factor, - ElementCount::getFixed(LaneMaskLen)); + APInt GapMask(Factor, 0); + std::tie(Mask, GapMask) = getMask(getMaskOperand(II), Factor, + ElementCount::getFixed(LaneMaskLen)); if (!Mask) return false; + // We haven't supported gap mask for stores. Yet it is possible that we + // already changed the IR, hence returning true here. + if (GapMask.popcount() != Factor) + return true; LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.store or masked.store: " << *Store << "\n"); @@ -556,34 +567,97 @@ bool InterleavedAccessImpl::lowerInterleavedStore( return true; } -static Value *getMask(Value *WideMask, unsigned Factor, - ElementCount LeafValueEC) { +// A wide mask <1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0> could be used to skip the +// last field in a factor-of-three interleaved store or deinterleaved load (in +// which case LeafMaskLen is 4). Such (wide) mask is also known as gap mask. +// This helper function tries to detect this pattern and return the actual +// factor we're accessing, which is 2 in this example. +static void getGapMask(const Constant &MaskConst, unsigned Factor, + unsigned LeafMaskLen, APInt &GapMask) { + assert(GapMask.getBitWidth() == Factor); + for (unsigned F = 0U; F < Factor; ++F) { + bool AllZero = true; + for (unsigned Idx = 0U; Idx < LeafMaskLen; ++Idx) { + Constant *C = MaskConst.getAggregateElement(F + Idx * Factor); + if (!C->isZeroValue()) { + AllZero = false; + break; + } + } + // All mask bits on this field are zero, skipping it. 
+ if (AllZero) + GapMask.clearBit(F); + } +} + +static std::pair<Value *, APInt> getMask(Value *WideMask, unsigned Factor, + ElementCount LeafValueEC) { + auto GapMask = APInt::getAllOnes(Factor); + if (auto *IMI = dyn_cast<IntrinsicInst>(WideMask)) { if (unsigned F = getInterleaveIntrinsicFactor(IMI->getIntrinsicID()); - F && F == Factor && llvm::all_equal(IMI->args())) { - return IMI->getArgOperand(0); + F && F == Factor) { + Value *RefArg = nullptr; + // Check if all the intrinsic arguments are the same, except those that + // are zeros, which we mark as gaps in the gap mask. + for (auto [Idx, Arg] : enumerate(IMI->args())) { + if (auto *C = dyn_cast<Constant>(Arg); C && C->isZeroValue()) { + GapMask.clearBit(Idx); + continue; + } + + if (!RefArg) + RefArg = Arg; + else if (RefArg != Arg) + return {nullptr, GapMask}; + } + + // In a very rare occasion, all the intrinsic arguments might be zeros, + // in which case we still want to return an all-zeros constant instead of + // nullptr. + return {RefArg ? RefArg : IMI->getArgOperand(0), GapMask}; } } + // Masks that are assembled from bitwise AND. + if (auto *AndOp = dyn_cast<BinaryOperator>(WideMask); + AndOp && AndOp->getOpcode() == Instruction::And) { + auto [MaskLHS, GapMaskLHS] = + getMask(AndOp->getOperand(0), Factor, LeafValueEC); + auto [MaskRHS, GapMaskRHS] = + getMask(AndOp->getOperand(1), Factor, LeafValueEC); + if (!MaskLHS || !MaskRHS) + return {nullptr, GapMask}; + // Using IRBuilder here so that any trivial constants could be folded right + // away. + return {IRBuilder<>(AndOp).CreateAnd(MaskLHS, MaskRHS), + GapMaskLHS & GapMaskRHS}; + } + if (auto *ConstMask = dyn_cast<Constant>(WideMask)) { if (auto *Splat = ConstMask->getSplatValue()) // All-ones or all-zeros mask. - return ConstantVector::getSplat(LeafValueEC, Splat); + return {ConstantVector::getSplat(LeafValueEC, Splat), GapMask}; if (LeafValueEC.isFixed()) { unsigned LeafMaskLen = LeafValueEC.getFixedValue(); + // First, check if we use a gap mask to skip some of the factors / fields. + getGapMask(*ConstMask, Factor, LeafMaskLen, GapMask); + SmallVector<Constant *, 8> LeafMask(LeafMaskLen, nullptr); // If this is a fixed-length constant mask, each lane / leaf has to // use the same mask. This is done by checking if every group with Factor // number of elements in the interleaved mask has homogeneous values. for (unsigned Idx = 0U; Idx < LeafMaskLen * Factor; ++Idx) { + if (!GapMask[Idx % Factor]) + continue; Constant *C = ConstMask->getAggregateElement(Idx); if (LeafMask[Idx / Factor] && LeafMask[Idx / Factor] != C) - return nullptr; + return {nullptr, GapMask}; LeafMask[Idx / Factor] = C; } - return ConstantVector::get(LeafMask); + return {ConstantVector::get(LeafMask), GapMask}; } } @@ -603,12 +677,13 @@ static Value *getMask(Value *WideMask, unsigned Factor, auto *LeafMaskTy = VectorType::get(Type::getInt1Ty(SVI->getContext()), LeafValueEC); IRBuilder<> Builder(SVI); - return Builder.CreateExtractVector(LeafMaskTy, SVI->getOperand(0), - uint64_t(0)); + return {Builder.CreateExtractVector(LeafMaskTy, SVI->getOperand(0), + uint64_t(0)), + GapMask}; } } - return nullptr; + return {nullptr, GapMask}; } bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic( @@ -639,9 +714,16 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic( return false; // Check mask operand. Handle both all-true/false and interleaved mask. 
- Mask = getMask(getMaskOperand(II), Factor, getDeinterleavedVectorType(DI)); + APInt GapMask(Factor, 0); + std::tie(Mask, GapMask) = + getMask(getMaskOperand(II), Factor, getDeinterleavedVectorType(DI)); if (!Mask) return false; + // We haven't supported gap mask if it's deinterleaving using intrinsics. + // Yet it is possible that we already changed the IR, hence returning true + // here. + if (GapMask.popcount() != Factor) + return true; LLVM_DEBUG(dbgs() << "IA: Found a vp.load or masked.load with deinterleave" << " intrinsic " << *DI << " and factor = " @@ -680,10 +762,16 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic( II->getIntrinsicID() != Intrinsic::vp_store) return false; // Check mask operand. Handle both all-true/false and interleaved mask. - Mask = getMask(getMaskOperand(II), Factor, - cast<VectorType>(InterleaveValues[0]->getType())); + APInt GapMask(Factor, 0); + std::tie(Mask, GapMask) = + getMask(getMaskOperand(II), Factor, + cast<VectorType>(InterleaveValues[0]->getType())); if (!Mask) return false; + // We haven't supported gap mask if it's interleaving using intrinsics. Yet + // it is possible that we already changed the IR, hence returning true here. + if (GapMask.popcount() != Factor) + return true; LLVM_DEBUG(dbgs() << "IA: Found a vp.store or masked.store with interleave" << " intrinsic " << *IntII << " and factor = " diff --git a/llvm/lib/CodeGen/LiveVariables.cpp b/llvm/lib/CodeGen/LiveVariables.cpp index 1f23418..c5dfdda 100644 --- a/llvm/lib/CodeGen/LiveVariables.cpp +++ b/llvm/lib/CodeGen/LiveVariables.cpp @@ -213,11 +213,7 @@ void LiveVariables::HandleVirtRegDef(Register Reg, MachineInstr &MI) { } /// FindLastPartialDef - Return the last partial def of the specified register. -/// Also returns the sub-registers that're defined by the instruction. -MachineInstr * -LiveVariables::FindLastPartialDef(Register Reg, - SmallSet<Register, 4> &PartDefRegs) { - Register LastDefReg = 0; +MachineInstr *LiveVariables::FindLastPartialDef(Register Reg) { unsigned LastDefDist = 0; MachineInstr *LastDef = nullptr; for (MCPhysReg SubReg : TRI->subregs(Reg)) { @@ -226,7 +222,6 @@ LiveVariables::FindLastPartialDef(Register Reg, continue; unsigned Dist = DistanceMap[Def]; if (Dist > LastDefDist) { - LastDefReg = SubReg; LastDef = Def; LastDefDist = Dist; } @@ -235,14 +230,6 @@ LiveVariables::FindLastPartialDef(Register Reg, if (!LastDef) return nullptr; - PartDefRegs.insert(LastDefReg); - for (MachineOperand &MO : LastDef->all_defs()) { - if (MO.getReg() == 0) - continue; - Register DefReg = MO.getReg(); - if (TRI->isSubRegister(Reg, DefReg)) - PartDefRegs.insert_range(TRI->subregs_inclusive(DefReg)); - } return LastDef; } @@ -261,27 +248,11 @@ void LiveVariables::HandlePhysRegUse(Register Reg, MachineInstr &MI) { // ... // = EAX // All of the sub-registers must have been defined before the use of Reg! - SmallSet<Register, 4> PartDefRegs; - MachineInstr *LastPartialDef = FindLastPartialDef(Reg, PartDefRegs); + MachineInstr *LastPartialDef = FindLastPartialDef(Reg); // If LastPartialDef is NULL, it must be using a livein register. if (LastPartialDef) { - LastPartialDef->addOperand(MachineOperand::CreateReg(Reg, true/*IsDef*/, - true/*IsImp*/)); - PhysRegDef[Reg.id()] = LastPartialDef; - SmallSet<MCPhysReg, 8> Processed; - for (MCPhysReg SubReg : TRI->subregs(Reg)) { - if (Processed.count(SubReg)) - continue; - if (PartDefRegs.count(SubReg)) - continue; - // This part of Reg was defined before the last partial def. It's killed - // here. 
- LastPartialDef->addOperand(MachineOperand::CreateReg(SubReg, - false/*IsDef*/, - true/*IsImp*/)); - PhysRegDef[SubReg] = LastPartialDef; - Processed.insert_range(TRI->subregs(SubReg)); - } + LastPartialDef->addOperand( + MachineOperand::CreateReg(Reg, /*IsDef=*/true, /*IsImp=*/true)); } } else if (LastDef && !PhysRegUse[Reg.id()] && !LastDef->findRegisterDefOperand(Reg, /*TRI=*/nullptr)) diff --git a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp index 3e99e57..bb70e78 100644 --- a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp +++ b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp @@ -124,6 +124,11 @@ public: bool initializeFrameInfo(PerFunctionMIParsingState &PFS, const yaml::MachineFunction &YamlMF); + bool initializeSaveRestorePoints( + PerFunctionMIParsingState &PFS, + const std::vector<yaml::SaveRestorePointEntry> &YamlSRPoints, + SmallVectorImpl<MachineBasicBlock *> &SaveRestorePoints); + bool initializeCallSiteInfo(PerFunctionMIParsingState &PFS, const yaml::MachineFunction &YamlMF); @@ -529,7 +534,7 @@ void MIRParserImpl::setupDebugValueTracking( unsigned MaxInstrNum = 0; for (auto &MBB : MF) for (auto &MI : MBB) - MaxInstrNum = std::max((unsigned)MI.peekDebugInstrNum(), MaxInstrNum); + MaxInstrNum = std::max(MI.peekDebugInstrNum(), MaxInstrNum); MF.setDebugInstrNumberingCount(MaxInstrNum); // Load any substitutions. @@ -867,18 +872,14 @@ bool MIRParserImpl::initializeFrameInfo(PerFunctionMIParsingState &PFS, MFI.setHasTailCall(YamlMFI.HasTailCall); MFI.setCalleeSavedInfoValid(YamlMFI.IsCalleeSavedInfoValid); MFI.setLocalFrameSize(YamlMFI.LocalFrameSize); - if (!YamlMFI.SavePoint.Value.empty()) { - MachineBasicBlock *MBB = nullptr; - if (parseMBBReference(PFS, MBB, YamlMFI.SavePoint)) - return true; - MFI.setSavePoint(MBB); - } - if (!YamlMFI.RestorePoint.Value.empty()) { - MachineBasicBlock *MBB = nullptr; - if (parseMBBReference(PFS, MBB, YamlMFI.RestorePoint)) - return true; - MFI.setRestorePoint(MBB); - } + SmallVector<MachineBasicBlock *, 4> SavePoints; + if (initializeSaveRestorePoints(PFS, YamlMFI.SavePoints, SavePoints)) + return true; + MFI.setSavePoints(SavePoints); + SmallVector<MachineBasicBlock *, 4> RestorePoints; + if (initializeSaveRestorePoints(PFS, YamlMFI.RestorePoints, RestorePoints)) + return true; + MFI.setRestorePoints(RestorePoints); std::vector<CalleeSavedInfo> CSIInfo; // Initialize the fixed frame objects. 
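Save and restore points are now carried as lists end to end (MIR parser and printer, MachineFrameInfo, PrologEpilogInserter). A minimal usage sketch mirroring the setter calls in this patch; SaveMBB and RestoreMBB are placeholder blocks:

SmallVector<MachineBasicBlock *, 4> Saves = {SaveMBB};       // placeholder MBB
SmallVector<MachineBasicBlock *, 4> Restores = {RestoreMBB}; // placeholder MBB
MFI.setSavePoints(Saves);
MFI.setRestorePoints(Restores);
// Consumers that still assume a single point assert there is at most one and
// read front(); clearing (as PEI does once prologue/epilogue are in) passes {}.
MFI.setSavePoints({});
MFI.setRestorePoints({});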
@@ -1093,6 +1094,21 @@ bool MIRParserImpl::initializeConstantPool(PerFunctionMIParsingState &PFS, return false; } +// Return true if basic block was incorrectly specified in MIR +bool MIRParserImpl::initializeSaveRestorePoints( + PerFunctionMIParsingState &PFS, + const std::vector<yaml::SaveRestorePointEntry> &YamlSRPoints, + SmallVectorImpl<MachineBasicBlock *> &SaveRestorePoints) { + MachineBasicBlock *MBB = nullptr; + for (const yaml::SaveRestorePointEntry &Entry : YamlSRPoints) { + if (parseMBBReference(PFS, MBB, Entry.Point.Value)) + return true; + SaveRestorePoints.push_back(MBB); + } + + return false; +} + bool MIRParserImpl::initializeJumpTableInfo(PerFunctionMIParsingState &PFS, const yaml::MachineJumpTable &YamlJTI) { MachineJumpTableInfo *JTI = PFS.MF.getOrCreateJumpTableInfo(YamlJTI.Kind); diff --git a/llvm/lib/CodeGen/MIRPrinter.cpp b/llvm/lib/CodeGen/MIRPrinter.cpp index ce1834a..7cc9192 100644 --- a/llvm/lib/CodeGen/MIRPrinter.cpp +++ b/llvm/lib/CodeGen/MIRPrinter.cpp @@ -150,6 +150,10 @@ static void convertMJTI(ModuleSlotTracker &MST, yaml::MachineJumpTable &YamlJTI, const MachineJumpTableInfo &JTI); static void convertMFI(ModuleSlotTracker &MST, yaml::MachineFrameInfo &YamlMFI, const MachineFrameInfo &MFI); +static void +convertSRPoints(ModuleSlotTracker &MST, + std::vector<yaml::SaveRestorePointEntry> &YamlSRPoints, + ArrayRef<MachineBasicBlock *> SaveRestorePoints); static void convertStackObjects(yaml::MachineFunction &YMF, const MachineFunction &MF, ModuleSlotTracker &MST, MFPrintState &State); @@ -355,14 +359,10 @@ static void convertMFI(ModuleSlotTracker &MST, yaml::MachineFrameInfo &YamlMFI, YamlMFI.HasTailCall = MFI.hasTailCall(); YamlMFI.IsCalleeSavedInfoValid = MFI.isCalleeSavedInfoValid(); YamlMFI.LocalFrameSize = MFI.getLocalFrameSize(); - if (MFI.getSavePoint()) { - raw_string_ostream StrOS(YamlMFI.SavePoint.Value); - StrOS << printMBBReference(*MFI.getSavePoint()); - } - if (MFI.getRestorePoint()) { - raw_string_ostream StrOS(YamlMFI.RestorePoint.Value); - StrOS << printMBBReference(*MFI.getRestorePoint()); - } + if (!MFI.getSavePoints().empty()) + convertSRPoints(MST, YamlMFI.SavePoints, MFI.getSavePoints()); + if (!MFI.getRestorePoints().empty()) + convertSRPoints(MST, YamlMFI.RestorePoints, MFI.getRestorePoints()); } static void convertEntryValueObjects(yaml::MachineFunction &YMF, @@ -616,6 +616,21 @@ static void convertMCP(yaml::MachineFunction &MF, } } +static void +convertSRPoints(ModuleSlotTracker &MST, + std::vector<yaml::SaveRestorePointEntry> &YamlSRPoints, + ArrayRef<MachineBasicBlock *> SRPoints) { + for (const auto &MBB : SRPoints) { + SmallString<16> Str; + yaml::SaveRestorePointEntry Entry; + raw_svector_ostream StrOS(Str); + StrOS << printMBBReference(*MBB); + Entry.Point = StrOS.str().str(); + Str.clear(); + YamlSRPoints.push_back(Entry); + } +} + static void convertMJTI(ModuleSlotTracker &MST, yaml::MachineJumpTable &YamlJTI, const MachineJumpTableInfo &JTI) { YamlJTI.Kind = JTI.getEntryKind(); diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp index 742de11..e359831 100644 --- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp +++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp @@ -490,7 +490,7 @@ private: SmallSetVector<MachineInstr *, 8> MaybeDeadCopies; /// Multimap tracking debug users in current BB - DenseMap<MachineInstr *, SmallSet<MachineInstr *, 2>> CopyDbgUsers; + DenseMap<MachineInstr *, SmallPtrSet<MachineInstr *, 2>> CopyDbgUsers; CopyTracker Tracker; diff --git 
a/llvm/lib/CodeGen/MachineDebugify.cpp b/llvm/lib/CodeGen/MachineDebugify.cpp index 1a20fe5..307f494 100644 --- a/llvm/lib/CodeGen/MachineDebugify.cpp +++ b/llvm/lib/CodeGen/MachineDebugify.cpp @@ -87,7 +87,7 @@ bool applyDebugifyMetadataToMachineFunction(MachineModuleInfo &MMI, // Do this by introducing debug uses of each register definition. If that is // not possible (e.g. we have a phi or a meta instruction), emit a constant. uint64_t NextImm = 0; - SmallSet<DILocalVariable *, 16> VarSet; + SmallPtrSet<DILocalVariable *, 16> VarSet; const MCInstrDesc &DbgValDesc = TII.get(TargetOpcode::DBG_VALUE); for (MachineBasicBlock &MBB : MF) { MachineBasicBlock::iterator FirstNonPHIIt = MBB.getFirstNonPHI(); diff --git a/llvm/lib/CodeGen/MachineFrameInfo.cpp b/llvm/lib/CodeGen/MachineFrameInfo.cpp index e4b9938..a8306b2 100644 --- a/llvm/lib/CodeGen/MachineFrameInfo.cpp +++ b/llvm/lib/CodeGen/MachineFrameInfo.cpp @@ -244,6 +244,22 @@ void MachineFrameInfo::print(const MachineFunction &MF, raw_ostream &OS) const{ } OS << "\n"; } + OS << "save/restore points:\n"; + + if (!SavePoints.empty()) { + OS << "save points:\n"; + + for (auto &item : SavePoints) + OS << printMBBReference(*item) << "\n"; + } else + OS << "save points are empty\n"; + + if (!RestorePoints.empty()) { + OS << "restore points:\n"; + for (auto &item : RestorePoints) + OS << printMBBReference(*item) << "\n"; + } else + OS << "restore points are empty\n"; } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) diff --git a/llvm/lib/CodeGen/MachineFunction.cpp b/llvm/lib/CodeGen/MachineFunction.cpp index ec40f6a..82ba596 100644 --- a/llvm/lib/CodeGen/MachineFunction.cpp +++ b/llvm/lib/CodeGen/MachineFunction.cpp @@ -154,17 +154,17 @@ void ilist_alloc_traits<MachineBasicBlock>::deleteNode(MachineBasicBlock *MBB) { MBB->getParent()->deleteMachineBasicBlock(MBB); } -static inline Align getFnStackAlignment(const TargetSubtargetInfo *STI, - const Function &F) { +static inline Align getFnStackAlignment(const TargetSubtargetInfo &STI, + const Function &F) { if (auto MA = F.getFnStackAlign()) return *MA; - return STI->getFrameLowering()->getStackAlign(); + return STI.getFrameLowering()->getStackAlign(); } MachineFunction::MachineFunction(Function &F, const TargetMachine &Target, const TargetSubtargetInfo &STI, MCContext &Ctx, unsigned FunctionNum) - : F(F), Target(Target), STI(&STI), Ctx(Ctx) { + : F(F), Target(Target), STI(STI), Ctx(Ctx) { FunctionNumber = FunctionNum; init(); } @@ -195,7 +195,7 @@ void MachineFunction::init() { // We can realign the stack if the target supports it and the user hasn't // explicitly asked us not to. - bool CanRealignSP = STI->getFrameLowering()->isStackRealignable() && + bool CanRealignSP = STI.getFrameLowering()->isStackRealignable() && !F.hasFnAttribute("no-realign-stack"); bool ForceRealignSP = F.hasFnAttribute(Attribute::StackAlignment) || F.hasFnAttribute("stackrealign"); @@ -209,11 +209,11 @@ void MachineFunction::init() { FrameInfo->ensureMaxAlignment(*F.getFnStackAlign()); ConstantPool = new (Allocator) MachineConstantPool(getDataLayout()); - Alignment = STI->getTargetLowering()->getMinFunctionAlignment(); + Alignment = STI.getTargetLowering()->getMinFunctionAlignment(); if (!F.getAlign() && !F.hasOptSize()) Alignment = std::max(Alignment, - STI->getTargetLowering()->getPrefFunctionAlignment()); + STI.getTargetLowering()->getPrefFunctionAlignment()); // -fsanitize=function and -fsanitize=kcfi instrument indirect function calls // to load a type hash before the function label. 
Ensure functions are aligned diff --git a/llvm/lib/CodeGen/MachineInstrBundle.cpp b/llvm/lib/CodeGen/MachineInstrBundle.cpp index 4da0184..d9e8484 100644 --- a/llvm/lib/CodeGen/MachineInstrBundle.cpp +++ b/llvm/lib/CodeGen/MachineInstrBundle.cpp @@ -94,6 +94,22 @@ static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, return DebugLoc(); } +/// Check if target reg is contained in given lists, which are: +/// LocalDefsV as given list for virtual regs +/// LocalDefsP as given list for physical regs, in BitVector[RegUnit] form +static bool containsReg(SmallSetVector<Register, 32> LocalDefsV, + const BitVector &LocalDefsP, Register Reg, + const TargetRegisterInfo *TRI) { + if (Reg.isPhysical()) { + for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg())) + if (!LocalDefsP[Unit]) + return false; + + return true; + } + return LocalDefsV.contains(Reg); +} + /// finalizeBundle - Finalize a machine instruction bundle which includes /// a sequence of instructions starting from FirstMI to LastMI (exclusive). /// This routine adds a BUNDLE instruction to represent the bundle, it adds @@ -115,6 +131,7 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB, Bundle.prepend(MIB); SmallSetVector<Register, 32> LocalDefs; + BitVector LocalDefsP(TRI->getNumRegUnits()); SmallSet<Register, 8> DeadDefSet; SmallSet<Register, 16> KilledDefSet; SmallSetVector<Register, 8> ExternUses; @@ -130,7 +147,7 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB, if (!Reg) continue; - if (LocalDefs.contains(Reg)) { + if (containsReg(LocalDefs, LocalDefsP, Reg, TRI)) { MO.setIsInternalRead(); if (MO.isKill()) { // Internal def is now killed. @@ -165,8 +182,10 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB, } } - if (!MO.isDead() && Reg.isPhysical()) - LocalDefs.insert_range(TRI->subregs(Reg)); + if (!MO.isDead() && Reg.isPhysical()) { + for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg())) + LocalDefsP.set(Unit); + } } // Set FrameSetup/FrameDestroy for the bundle. If any of the instructions diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp index 90005bd..3a9651c 100644 --- a/llvm/lib/CodeGen/MachinePipeliner.cpp +++ b/llvm/lib/CodeGen/MachinePipeliner.cpp @@ -3466,9 +3466,9 @@ bool SMSchedule::onlyHasLoopCarriedOutputOrOrderPreds( } /// Determine transitive dependences of unpipelineable instructions -SmallSet<SUnit *, 8> SMSchedule::computeUnpipelineableNodes( +SmallPtrSet<SUnit *, 8> SMSchedule::computeUnpipelineableNodes( SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI) { - SmallSet<SUnit *, 8> DoNotPipeline; + SmallPtrSet<SUnit *, 8> DoNotPipeline; SmallVector<SUnit *, 8> Worklist; for (auto &SU : SSD->SUnits) @@ -3498,7 +3498,7 @@ SmallSet<SUnit *, 8> SMSchedule::computeUnpipelineableNodes( // and ensure that they are in stage 0. If unable to do so, return false. 
bool SMSchedule::normalizeNonPipelinedInstructions( SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI) { - SmallSet<SUnit *, 8> DNP = computeUnpipelineableNodes(SSD, PLI); + SmallPtrSet<SUnit *, 8> DNP = computeUnpipelineableNodes(SSD, PLI); int NewLastCycle = INT_MIN; for (SUnit &SU : SSD->SUnits) { diff --git a/llvm/lib/CodeGen/MacroFusion.cpp b/llvm/lib/CodeGen/MacroFusion.cpp index 975a3fe..1db5301 100644 --- a/llvm/lib/CodeGen/MacroFusion.cpp +++ b/llvm/lib/CodeGen/MacroFusion.cpp @@ -79,7 +79,7 @@ bool llvm::fuseInstructionPair(ScheduleDAGInstrs &DAG, SUnit &FirstSU, FirstSU.ParentClusterIdx = Clusters.size(); SecondSU.ParentClusterIdx = Clusters.size(); - SmallSet<SUnit *, 8> Cluster{{&FirstSU, &SecondSU}}; + SmallPtrSet<SUnit *, 8> Cluster{{&FirstSU, &SecondSU}}; Clusters.push_back(Cluster); // TODO - If we want to chain more than two instructions, we need to create diff --git a/llvm/lib/CodeGen/PHIElimination.cpp b/llvm/lib/CodeGen/PHIElimination.cpp index a93a89e..34a9d5d 100644 --- a/llvm/lib/CodeGen/PHIElimination.cpp +++ b/llvm/lib/CodeGen/PHIElimination.cpp @@ -30,6 +30,7 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SlotIndexes.h" #include "llvm/CodeGen/TargetInstrInfo.h" @@ -72,6 +73,7 @@ class PHIEliminationImpl { LiveIntervals *LIS = nullptr; MachineLoopInfo *MLI = nullptr; MachineDominatorTree *MDT = nullptr; + MachinePostDominatorTree *PDT = nullptr; /// EliminatePHINodes - Eliminate phi nodes by inserting copy instructions /// in predecessor basic blocks. @@ -123,17 +125,22 @@ public: auto *MLIWrapper = P->getAnalysisIfAvailable<MachineLoopInfoWrapperPass>(); auto *MDTWrapper = P->getAnalysisIfAvailable<MachineDominatorTreeWrapperPass>(); + auto *PDTWrapper = + P->getAnalysisIfAvailable<MachinePostDominatorTreeWrapperPass>(); LV = LVWrapper ? &LVWrapper->getLV() : nullptr; LIS = LISWrapper ? &LISWrapper->getLIS() : nullptr; MLI = MLIWrapper ? &MLIWrapper->getLI() : nullptr; MDT = MDTWrapper ? &MDTWrapper->getDomTree() : nullptr; + PDT = PDTWrapper ? 
&PDTWrapper->getPostDomTree() : nullptr; } PHIEliminationImpl(MachineFunction &MF, MachineFunctionAnalysisManager &AM) : LV(AM.getCachedResult<LiveVariablesAnalysis>(MF)), LIS(AM.getCachedResult<LiveIntervalsAnalysis>(MF)), MLI(AM.getCachedResult<MachineLoopAnalysis>(MF)), - MDT(AM.getCachedResult<MachineDominatorTreeAnalysis>(MF)), MFAM(&AM) {} + MDT(AM.getCachedResult<MachineDominatorTreeAnalysis>(MF)), + PDT(AM.getCachedResult<MachinePostDominatorTreeAnalysis>(MF)), + MFAM(&AM) {} bool run(MachineFunction &MF); }; @@ -172,6 +179,7 @@ PHIEliminationPass::run(MachineFunction &MF, PA.preserve<LiveVariablesAnalysis>(); PA.preserve<SlotIndexesAnalysis>(); PA.preserve<MachineDominatorTreeAnalysis>(); + PA.preserve<MachinePostDominatorTreeAnalysis>(); PA.preserve<MachineLoopAnalysis>(); return PA; } @@ -197,6 +205,7 @@ void PHIElimination::getAnalysisUsage(AnalysisUsage &AU) const { AU.addPreserved<SlotIndexesWrapperPass>(); AU.addPreserved<LiveIntervalsWrapperPass>(); AU.addPreserved<MachineDominatorTreeWrapperPass>(); + AU.addPreserved<MachinePostDominatorTreeWrapperPass>(); AU.addPreserved<MachineLoopInfoWrapperPass>(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -204,15 +213,8 @@ void PHIElimination::getAnalysisUsage(AnalysisUsage &AU) const { bool PHIEliminationImpl::run(MachineFunction &MF) { MRI = &MF.getRegInfo(); - MachineDominatorTree *MDT = nullptr; - if (P) { - auto *MDTWrapper = - P->getAnalysisIfAvailable<MachineDominatorTreeWrapperPass>(); - MDT = MDTWrapper ? &MDTWrapper->getDomTree() : nullptr; - } else { - MDT = MFAM->getCachedResult<MachineDominatorTreeAnalysis>(MF); - } - MachineDomTreeUpdater MDTU(MDT, MachineDomTreeUpdater::UpdateStrategy::Lazy); + MachineDomTreeUpdater MDTU(MDT, PDT, + MachineDomTreeUpdater::UpdateStrategy::Lazy); bool Changed = false; diff --git a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp index 8de2c48..96c9cde6 100644 --- a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp +++ b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp @@ -145,7 +145,7 @@ static bool lowerObjCCall(Function &F, RTLIB::LibcallImpl NewFn, // FIXME: When RuntimeLibcalls is an analysis, check if the function is really // supported, and go through RTLIB::Libcall. - const char *NewFnName = RTLIB::RuntimeLibcallsInfo::getLibcallImplName(NewFn); + StringRef NewFnName = RTLIB::RuntimeLibcallsInfo::getLibcallImplName(NewFn); // If we haven't already looked up this function, check to see if the // program already contains a function with this name. 
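With getLibcallImplName handing back a StringRef, the follow-up lookup can stay on StringRef-based Module APIs. A hedged sketch of that shape only; the rest of lowerObjCCall is outside this hunk, so the exact lookup call and the function type used are assumptions:

Module *M = F.getParent();
StringRef NewFnName = RTLIB::RuntimeLibcallsInfo::getLibcallImplName(NewFn);
// Reuse a declaration if the program already provides one, otherwise insert
// it; reusing F's type here is an illustrative assumption.
FunctionCallee Callee = M->getOrInsertFunction(NewFnName, F.getFunctionType());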
@@ -587,12 +587,14 @@ bool PreISelIntrinsicLowering::lowerIntrinsics(Module &M) const { break; case Intrinsic::exp: case Intrinsic::exp2: + case Intrinsic::log: Changed |= forEachCall(F, [&](CallInst *CI) { Type *Ty = CI->getArgOperand(0)->getType(); if (!isa<ScalableVectorType>(Ty)) return false; const TargetLowering *TL = TM->getSubtargetImpl(F)->getTargetLowering(); unsigned Op = TL->IntrinsicIDToISD(F.getIntrinsicID()); + assert(Op != ISD::DELETED_NODE && "unsupported intrinsic"); if (!TL->isOperationExpand(Op, EVT::getEVT(Ty))) return false; return lowerUnaryVectorIntrinsicAsLoop(M, CI); diff --git a/llvm/lib/CodeGen/PrologEpilogInserter.cpp b/llvm/lib/CodeGen/PrologEpilogInserter.cpp index f66f546..8fc0748 100644 --- a/llvm/lib/CodeGen/PrologEpilogInserter.cpp +++ b/llvm/lib/CodeGen/PrologEpilogInserter.cpp @@ -351,8 +351,8 @@ bool PEIImpl::run(MachineFunction &MF) { delete RS; SaveBlocks.clear(); RestoreBlocks.clear(); - MFI.setSavePoint(nullptr); - MFI.setRestorePoint(nullptr); + MFI.setSavePoints({}); + MFI.setRestorePoints({}); return true; } @@ -423,16 +423,18 @@ void PEIImpl::calculateCallFrameInfo(MachineFunction &MF) { /// callee-saved registers, and placing prolog and epilog code. void PEIImpl::calculateSaveRestoreBlocks(MachineFunction &MF) { const MachineFrameInfo &MFI = MF.getFrameInfo(); - // Even when we do not change any CSR, we still want to insert the // prologue and epilogue of the function. // So set the save points for those. // Use the points found by shrink-wrapping, if any. - if (MFI.getSavePoint()) { - SaveBlocks.push_back(MFI.getSavePoint()); - assert(MFI.getRestorePoint() && "Both restore and save must be set"); - MachineBasicBlock *RestoreBlock = MFI.getRestorePoint(); + if (!MFI.getSavePoints().empty()) { + assert(MFI.getSavePoints().size() == 1 && + "Multiple save points are not yet supported!"); + SaveBlocks.push_back(MFI.getSavePoints().front()); + assert(MFI.getRestorePoints().size() == 1 && + "Multiple restore points are not yet supported!"); + MachineBasicBlock *RestoreBlock = MFI.getRestorePoints().front(); // If RestoreBlock does not have any successor and is not a return block // then the end point is unreachable and we do not need to insert any // epilogue. @@ -558,7 +560,11 @@ static void updateLiveness(MachineFunction &MF) { SmallPtrSet<MachineBasicBlock *, 8> Visited; SmallVector<MachineBasicBlock *, 8> WorkList; MachineBasicBlock *Entry = &MF.front(); - MachineBasicBlock *Save = MFI.getSavePoint(); + + assert(MFI.getSavePoints().size() < 2 && + "Multiple save points not yet supported!"); + MachineBasicBlock *Save = + MFI.getSavePoints().empty() ? nullptr : MFI.getSavePoints().front(); if (!Save) Save = Entry; @@ -569,7 +575,10 @@ static void updateLiveness(MachineFunction &MF) { } Visited.insert(Save); - MachineBasicBlock *Restore = MFI.getRestorePoint(); + assert(MFI.getRestorePoints().size() < 2 && + "Multiple restore points not yet supported!"); + MachineBasicBlock *Restore = + MFI.getRestorePoints().empty() ? nullptr : MFI.getRestorePoints().front(); if (Restore) // By construction Restore cannot be visited, otherwise it // means there exists a path to Restore that does not go @@ -1550,7 +1559,7 @@ void PEIImpl::replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &MF, // If this instruction has a FrameIndex operand, we need to // use that target machine register info object to eliminate // it. 
- TRI.eliminateFrameIndex(MI, SPAdj, i); + TRI.eliminateFrameIndex(MI, SPAdj, i, RS); // Reset the iterator if we were at the beginning of the BB. if (AtBeginning) { diff --git a/llvm/lib/CodeGen/RegAllocFast.cpp b/llvm/lib/CodeGen/RegAllocFast.cpp index 66a206c..804480c 100644 --- a/llvm/lib/CodeGen/RegAllocFast.cpp +++ b/llvm/lib/CodeGen/RegAllocFast.cpp @@ -474,6 +474,13 @@ int RegAllocFastImpl::getStackSpaceFor(Register VirtReg) { const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg); unsigned Size = TRI->getSpillSize(RC); Align Alignment = TRI->getSpillAlign(RC); + + const MachineFunction &MF = MRI->getMF(); + auto &ST = MF.getSubtarget(); + Align CurrentAlign = ST.getFrameLowering()->getStackAlign(); + if (Alignment > CurrentAlign && !TRI->canRealignStack(MF)) + Alignment = CurrentAlign; + int FrameIdx = MFI->CreateSpillStackObject(Size, Alignment); // Assign the slot. diff --git a/llvm/lib/CodeGen/RegisterPressure.cpp b/llvm/lib/CodeGen/RegisterPressure.cpp index ca51b67..5f37890 100644 --- a/llvm/lib/CodeGen/RegisterPressure.cpp +++ b/llvm/lib/CodeGen/RegisterPressure.cpp @@ -1001,7 +1001,7 @@ static void computeMaxPressureDelta(ArrayRef<unsigned> OldMaxPressureVec, ++CritIdx; if (CritIdx != CritEnd && CriticalPSets[CritIdx].getPSet() == i) { - int PDiff = (int)PNew - (int)CriticalPSets[CritIdx].getUnitInc(); + int PDiff = (int)PNew - CriticalPSets[CritIdx].getUnitInc(); if (PDiff > 0) { Delta.CriticalMax = PressureChange(i); Delta.CriticalMax.setUnitInc(PDiff); @@ -1191,7 +1191,7 @@ getUpwardPressureDelta(const MachineInstr *MI, /*const*/ PressureDiff &PDiff, ++CritIdx; if (CritIdx != CritEnd && CriticalPSets[CritIdx].getPSet() == PSetID) { - int CritInc = (int)MNew - (int)CriticalPSets[CritIdx].getUnitInc(); + int CritInc = (int)MNew - CriticalPSets[CritIdx].getUnitInc(); if (CritInc > 0 && CritInc <= std::numeric_limits<int16_t>::max()) { Delta.CriticalMax = PressureChange(PSetID); Delta.CriticalMax.setUnitInc(CritInc); diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 7341914..8446045 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -401,7 +401,7 @@ namespace { SDValue PromoteExtend(SDValue Op); bool PromoteLoad(SDValue Op); - SDValue foldShiftToAvg(SDNode *N); + SDValue foldShiftToAvg(SDNode *N, const SDLoc &DL); // Fold `a bitwiseop (~b +/- c)` -> `a bitwiseop ~(b -/+ c)` SDValue foldBitwiseOpWithNeg(SDNode *N, const SDLoc &DL, EVT VT); @@ -10983,7 +10983,7 @@ SDValue DAGCombiner::visitSRA(SDNode *N) { if (SDValue NarrowLoad = reduceLoadWidth(N)) return NarrowLoad; - if (SDValue AVG = foldShiftToAvg(N)) + if (SDValue AVG = foldShiftToAvg(N, DL)) return AVG; return SDValue(); @@ -11256,7 +11256,7 @@ SDValue DAGCombiner::visitSRL(SDNode *N) { if (SDValue MULH = combineShiftToMULH(N, DL, DAG, TLI)) return MULH; - if (SDValue AVG = foldShiftToAvg(N)) + if (SDValue AVG = foldShiftToAvg(N, DL)) return AVG; return SDValue(); @@ -11772,51 +11772,36 @@ static SDValue combineMinNumMaxNumImpl(const SDLoc &DL, EVT VT, SDValue LHS, } } -SDValue DAGCombiner::foldShiftToAvg(SDNode *N) { +// Convert (sr[al] (add n[su]w x, y)) -> (avgfloor[su] x, y) +SDValue DAGCombiner::foldShiftToAvg(SDNode *N, const SDLoc &DL) { const unsigned Opcode = N->getOpcode(); - - // Convert (sr[al] (add n[su]w x, y)) -> (avgfloor[su] x, y) if (Opcode != ISD::SRA && Opcode != ISD::SRL) return SDValue(); - unsigned FloorISD = 0; - auto VT = N->getValueType(0); - bool IsUnsigned = 
false; - - // Decide wether signed or unsigned. - switch (Opcode) { - case ISD::SRA: - if (!hasOperation(ISD::AVGFLOORS, VT)) - return SDValue(); - FloorISD = ISD::AVGFLOORS; - break; - case ISD::SRL: - IsUnsigned = true; - if (!hasOperation(ISD::AVGFLOORU, VT)) - return SDValue(); - FloorISD = ISD::AVGFLOORU; - break; - default: - return SDValue(); - } + EVT VT = N->getValueType(0); + bool IsUnsigned = Opcode == ISD::SRL; // Captured values. SDValue A, B, Add; // Match floor average as it is common to both floor/ceil avgs. - if (!sd_match(N, m_BinOp(Opcode, - m_AllOf(m_Value(Add), m_Add(m_Value(A), m_Value(B))), - m_One()))) - return SDValue(); + if (sd_match(N, m_BinOp(Opcode, + m_AllOf(m_Value(Add), m_Add(m_Value(A), m_Value(B))), + m_One()))) { + // Decide whether signed or unsigned. + unsigned FloorISD = IsUnsigned ? ISD::AVGFLOORU : ISD::AVGFLOORS; + if (!hasOperation(FloorISD, VT)) + return SDValue(); - // Can't optimize adds that may wrap. - if (IsUnsigned && !Add->getFlags().hasNoUnsignedWrap()) - return SDValue(); + // Can't optimize adds that may wrap. + if ((IsUnsigned && !Add->getFlags().hasNoUnsignedWrap()) || + (!IsUnsigned && !Add->getFlags().hasNoSignedWrap())) + return SDValue(); - if (!IsUnsigned && !Add->getFlags().hasNoSignedWrap()) - return SDValue(); + return DAG.getNode(FloorISD, DL, N->getValueType(0), {A, B}); + } - return DAG.getNode(FloorISD, SDLoc(N), N->getValueType(0), {A, B}); + return SDValue(); } SDValue DAGCombiner::foldBitwiseOpWithNeg(SDNode *N, const SDLoc &DL, EVT VT) { @@ -12843,22 +12828,21 @@ SDValue DAGCombiner::visitMHISTOGRAM(SDNode *N) { SDLoc DL(HG); EVT MemVT = HG->getMemoryVT(); + EVT DataVT = Index.getValueType(); MachineMemOperand *MMO = HG->getMemOperand(); ISD::MemIndexType IndexType = HG->getIndexType(); if (ISD::isConstantSplatVectorAllZeros(Mask.getNode())) return Chain; - SDValue Ops[] = {Chain, Inc, Mask, BasePtr, Index, - HG->getScale(), HG->getIntID()}; - if (refineUniformBase(BasePtr, Index, HG->isIndexScaled(), DAG, DL)) + if (refineUniformBase(BasePtr, Index, HG->isIndexScaled(), DAG, DL) || + refineIndexType(Index, IndexType, DataVT, DAG)) { + SDValue Ops[] = {Chain, Inc, Mask, BasePtr, Index, + HG->getScale(), HG->getIntID()}; return DAG.getMaskedHistogram(DAG.getVTList(MVT::Other), MemVT, DL, Ops, MMO, IndexType); + } - EVT DataVT = Index.getValueType(); - if (refineIndexType(Index, IndexType, DataVT, DAG)) - return DAG.getMaskedHistogram(DAG.getVTList(MVT::Other), MemVT, DL, Ops, - MMO, IndexType); return SDValue(); } @@ -16343,6 +16327,42 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { DAG, DL); } break; + case ISD::AVGFLOORS: + case ISD::AVGFLOORU: + case ISD::AVGCEILS: + case ISD::AVGCEILU: + case ISD::ABDS: + case ISD::ABDU: + // (trunc (avg a, b)) -> (avg (trunc a), (trunc b)) + // (trunc (abdu/abds a, b)) -> (abdu/abds (trunc a), (trunc b)) + if (!LegalOperations && N0.hasOneUse() && + TLI.isOperationLegal(N0.getOpcode(), VT)) { + EVT TruncVT = VT; + unsigned SrcBits = SrcVT.getScalarSizeInBits(); + unsigned TruncBits = TruncVT.getScalarSizeInBits(); + + SDValue A = N0.getOperand(0); + SDValue B = N0.getOperand(1); + bool CanFold = false; + + if (N0.getOpcode() == ISD::AVGFLOORU || N0.getOpcode() == ISD::AVGCEILU || + N0.getOpcode() == ISD::ABDU) { + APInt UpperBits = APInt::getBitsSetFrom(SrcBits, TruncBits); + CanFold = DAG.MaskedValueIsZero(B, UpperBits) && + DAG.MaskedValueIsZero(A, UpperBits); + } else { + unsigned NeededBits = SrcBits - TruncBits; + CanFold = DAG.ComputeNumSignBits(B) > NeededBits && + 
DAG.ComputeNumSignBits(A) > NeededBits; + } + + if (CanFold) { + SDValue NewA = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, A); + SDValue NewB = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, B); + return DAG.getNode(N0.getOpcode(), DL, TruncVT, NewA, NewB); + } + } + break; } return SDValue(); @@ -25987,7 +26007,10 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) { // Combine an extract of an extract into a single extract_subvector. // ext (ext X, C), 0 --> ext X, C if (ExtIdx == 0 && V.getOpcode() == ISD::EXTRACT_SUBVECTOR && V.hasOneUse()) { - if (TLI.isExtractSubvectorCheap(NVT, V.getOperand(0).getValueType(), + // The index has to be a multiple of the new result type's known minimum + // vector length. + if (V.getConstantOperandVal(1) % NVT.getVectorMinNumElements() == 0 && + TLI.isExtractSubvectorCheap(NVT, V.getOperand(0).getValueType(), V.getConstantOperandVal(1)) && TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NVT)) { return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NVT, V.getOperand(0), diff --git a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp index fb9eff9..9467ba1 100644 --- a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp @@ -729,9 +729,7 @@ bool FastISel::lowerCallOperands(const CallInst *CI, unsigned ArgIdx, assert(!V->getType()->isEmptyTy() && "Empty type passed to intrinsic."); - ArgListEntry Entry; - Entry.Val = V; - Entry.Ty = V->getType(); + ArgListEntry Entry(V); Entry.setAttributes(CI, ArgI); Args.push_back(Entry); } @@ -978,9 +976,7 @@ bool FastISel::lowerCallTo(const CallInst *CI, MCSymbol *Symbol, assert(!V->getType()->isEmptyTy() && "Empty type passed to intrinsic."); - ArgListEntry Entry; - Entry.Val = V; - Entry.Ty = V->getType(); + ArgListEntry Entry(V); Entry.setAttributes(CI, ArgI); Args.push_back(Entry); } @@ -1012,17 +1008,16 @@ bool FastISel::lowerCallTo(CallLoweringInfo &CLI) { MVT RegisterVT = TLI.getRegisterType(CLI.RetTy->getContext(), VT); unsigned NumRegs = TLI.getNumRegisters(CLI.RetTy->getContext(), VT); for (unsigned i = 0; i != NumRegs; ++i) { - ISD::InputArg MyFlags; - MyFlags.VT = RegisterVT; - MyFlags.ArgVT = VT; - MyFlags.Used = CLI.IsReturnValueUsed; + ISD::ArgFlagsTy Flags; if (CLI.RetSExt) - MyFlags.Flags.setSExt(); + Flags.setSExt(); if (CLI.RetZExt) - MyFlags.Flags.setZExt(); + Flags.setZExt(); if (CLI.IsInReg) - MyFlags.Flags.setInReg(); - CLI.Ins.push_back(MyFlags); + Flags.setInReg(); + ISD::InputArg Ret(Flags, RegisterVT, VT, CLI.RetTy, CLI.IsReturnValueUsed, + ISD::InputArg::NoArgIndex, 0); + CLI.Ins.push_back(Ret); } } @@ -1117,7 +1112,6 @@ bool FastISel::lowerCall(const CallInst *CI) { Type *RetTy = CI->getType(); ArgListTy Args; - ArgListEntry Entry; Args.reserve(CI->arg_size()); for (auto i = CI->arg_begin(), e = CI->arg_end(); i != e; ++i) { @@ -1127,9 +1121,7 @@ bool FastISel::lowerCall(const CallInst *CI) { if (V->getType()->isEmptyTy()) continue; - Entry.Val = V; - Entry.Ty = V->getType(); - + ArgListEntry Entry(V); // Skip the first return-type Attribute to get to params. 
Entry.setAttributes(CI, i - CI->arg_begin()); Args.push_back(Entry); @@ -1148,9 +1140,12 @@ bool FastISel::lowerCall(const CallInst *CI) { CLI.setCallee(RetTy, FuncTy, CI->getCalledOperand(), std::move(Args), *CI) .setTailCall(IsTailCall); - diagnoseDontCall(*CI); + if (lowerCallTo(CLI)) { + diagnoseDontCall(*CI); + return true; + } - return lowerCallTo(CLI); + return false; } bool FastISel::selectCall(const User *I) { diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp index 8c8daef..1a63518 100644 --- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp @@ -81,12 +81,11 @@ static unsigned countOperands(SDNode *Node, unsigned NumExpUses, /// EmitCopyFromReg - Generate machine code for an CopyFromReg node or an /// implicit physical register output. -void InstrEmitter::EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone, - Register SrcReg, VRBaseMapType &VRBaseMap) { +void InstrEmitter::EmitCopyFromReg(SDValue Op, bool IsClone, Register SrcReg, + VRBaseMapType &VRBaseMap) { Register VRBase; if (SrcReg.isVirtual()) { // Just use the input register directly! - SDValue Op(Node, ResNo); if (IsClone) VRBaseMap.erase(Op); bool isNew = VRBaseMap.insert(std::make_pair(Op, SrcReg)).second; @@ -99,17 +98,15 @@ void InstrEmitter::EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone, // the CopyToReg'd destination register instead of creating a new vreg. bool MatchReg = true; const TargetRegisterClass *UseRC = nullptr; - MVT VT = Node->getSimpleValueType(ResNo); + MVT VT = Op.getSimpleValueType(); // Stick to the preferred register classes for legal types. if (TLI->isTypeLegal(VT)) - UseRC = TLI->getRegClassFor(VT, Node->isDivergent()); + UseRC = TLI->getRegClassFor(VT, Op->isDivergent()); - for (SDNode *User : Node->users()) { + for (SDNode *User : Op->users()) { bool Match = true; - if (User->getOpcode() == ISD::CopyToReg && - User->getOperand(2).getNode() == Node && - User->getOperand(2).getResNo() == ResNo) { + if (User->getOpcode() == ISD::CopyToReg && User->getOperand(2) == Op) { Register DestReg = cast<RegisterSDNode>(User->getOperand(1))->getReg(); if (DestReg.isVirtual()) { VRBase = DestReg; @@ -118,10 +115,8 @@ void InstrEmitter::EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone, Match = false; } else { for (unsigned i = 0, e = User->getNumOperands(); i != e; ++i) { - SDValue Op = User->getOperand(i); - if (Op.getNode() != Node || Op.getResNo() != ResNo) + if (User->getOperand(i) != Op) continue; - MVT VT = Node->getSimpleValueType(Op.getResNo()); if (VT == MVT::Other || VT == MVT::Glue) continue; Match = false; @@ -170,11 +165,11 @@ void InstrEmitter::EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone, } else { // Create the reg, emit the copy. VRBase = MRI->createVirtualRegister(DstRC); - BuildMI(*MBB, InsertPos, Node->getDebugLoc(), TII->get(TargetOpcode::COPY), - VRBase).addReg(SrcReg); + BuildMI(*MBB, InsertPos, Op.getDebugLoc(), TII->get(TargetOpcode::COPY), + VRBase) + .addReg(SrcReg); } - SDValue Op(Node, ResNo); if (IsClone) VRBaseMap.erase(Op); bool isNew = VRBaseMap.insert(std::make_pair(Op, VRBase)).second; @@ -1170,7 +1165,7 @@ EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned, continue; // This implicitly defined physreg has a use. 
UsedRegs.push_back(Reg); - EmitCopyFromReg(Node, i, IsClone, Reg, VRBaseMap); + EmitCopyFromReg(SDValue(Node, i), IsClone, Reg, VRBaseMap); } } @@ -1178,7 +1173,9 @@ EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned, if (Node->getValueType(Node->getNumValues()-1) == MVT::Glue) { for (SDNode *F = Node->getGluedUser(); F; F = F->getGluedUser()) { if (F->getOpcode() == ISD::CopyFromReg) { - UsedRegs.push_back(cast<RegisterSDNode>(F->getOperand(1))->getReg()); + Register Reg = cast<RegisterSDNode>(F->getOperand(1))->getReg(); + if (Reg.isPhysical()) + UsedRegs.push_back(Reg); continue; } else if (F->getOpcode() == ISD::CopyToReg) { // Skip CopyToReg nodes that are internal to the glue chain. @@ -1281,7 +1278,7 @@ EmitSpecialNode(SDNode *Node, bool IsClone, bool IsCloned, } case ISD::CopyFromReg: { Register SrcReg = cast<RegisterSDNode>(Node->getOperand(1))->getReg(); - EmitCopyFromReg(Node, 0, IsClone, SrcReg, VRBaseMap); + EmitCopyFromReg(SDValue(Node, 0), IsClone, SrcReg, VRBaseMap); break; } case ISD::EH_LABEL: diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.h b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.h index 16d754c..b465de8 100644 --- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.h +++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.h @@ -48,8 +48,8 @@ private: /// EmitCopyFromReg - Generate machine code for an CopyFromReg node or an /// implicit physical register output. - void EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone, - Register SrcReg, VRBaseMapType &VRBaseMap); + void EmitCopyFromReg(SDValue Op, bool IsClone, Register SrcReg, + VRBaseMapType &VRBaseMap); void CreateVirtualRegisters(SDNode *Node, MachineInstrBuilder &MIB, diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index ba0ab23..bcfc2c5 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -2181,12 +2181,10 @@ SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, SDNode *Node, std::pair<SDValue, SDValue> SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, SDNode *Node, bool isSigned) { TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; for (const SDValue &Op : Node->op_values()) { EVT ArgVT = Op.getValueType(); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); - Entry.Node = Op; - Entry.Ty = ArgTy; + TargetLowering::ArgListEntry Entry(Op, ArgTy); Entry.IsSExt = TLI.shouldSignExtendTypeInLibCall(ArgTy, isSigned); Entry.IsZExt = !Entry.IsSExt; Args.push_back(Entry); @@ -2325,11 +2323,9 @@ SDValue SelectionDAGLegalize::ExpandBitCountingLibCall( EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), DAG.getLibInfo().getIntSize()); - TargetLowering::ArgListEntry Arg; EVT ArgVT = Op.getValueType(); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); - Arg.Node = Op; - Arg.Ty = ArgTy; + TargetLowering::ArgListEntry Arg(Op, ArgTy); Arg.IsSExt = TLI.shouldSignExtendTypeInLibCall(ArgTy, /*IsSigned=*/false); Arg.IsZExt = !Arg.IsSExt; @@ -2370,12 +2366,10 @@ SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node, Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext()); TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; for (const SDValue &Op : Node->op_values()) { EVT ArgVT = Op.getValueType(); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); - Entry.Node = Op; - Entry.Ty = ArgTy; + TargetLowering::ArgListEntry Entry(Op, ArgTy); Entry.IsSExt = isSigned; Entry.IsZExt = !isSigned; Args.push_back(Entry); @@ -2383,8 +2377,8 @@ 
SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node, // Also pass the return address of the remainder. SDValue FIPtr = DAG.CreateStackTemporary(RetVT); - Entry.Node = FIPtr; - Entry.Ty = PointerType::getUnqual(RetTy->getContext()); + TargetLowering::ArgListEntry Entry( + FIPtr, PointerType::getUnqual(RetTy->getContext())); Entry.IsSExt = isSigned; Entry.IsZExt = !isSigned; Args.push_back(Entry); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp index 2cad36e..83bb1df 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp @@ -197,7 +197,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_Unary(SDNode *N, RTLIB::Libcall LC) { SDValue Chain = IsStrict ? N->getOperand(0) : SDValue(); TargetLowering::MakeLibCallOptions CallOptions; EVT OpVT = N->getOperand(0 + Offset).getValueType(); - CallOptions.setTypeListBeforeSoften(OpVT, N->getValueType(0), true); + CallOptions.setTypeListBeforeSoften(OpVT, N->getValueType(0)); std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall(DAG, LC, NVT, Op, CallOptions, SDLoc(N), Chain); @@ -218,7 +218,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_Binary(SDNode *N, RTLIB::Libcall LC) { TargetLowering::MakeLibCallOptions CallOptions; EVT OpsVT[2] = { N->getOperand(0 + Offset).getValueType(), N->getOperand(1 + Offset).getValueType() }; - CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true); + CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0)); std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall(DAG, LC, NVT, Ops, CallOptions, SDLoc(N), Chain); @@ -558,7 +558,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FMA(SDNode *N) { EVT OpsVT[3] = { N->getOperand(0 + Offset).getValueType(), N->getOperand(1 + Offset).getValueType(), N->getOperand(2 + Offset).getValueType() }; - CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true); + CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0)); std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), RTLIB::FMA_F32, @@ -642,7 +642,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FP_EXTEND(SDNode *N) { assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_EXTEND!"); TargetLowering::MakeLibCallOptions CallOptions; EVT OpVT = N->getOperand(IsStrict ? 1 : 0).getValueType(); - CallOptions.setTypeListBeforeSoften(OpVT, N->getValueType(0), true); + CallOptions.setTypeListBeforeSoften(OpVT, N->getValueType(0)); std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall(DAG, LC, NVT, Op, CallOptions, SDLoc(N), Chain); @@ -658,7 +658,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FP16_TO_FP(SDNode *N) { SDValue Op = N->getOperand(0); TargetLowering::MakeLibCallOptions CallOptions; EVT OpsVT[1] = { N->getOperand(0).getValueType() }; - CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true); + CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0)); SDValue Res32 = TLI.makeLibCall(DAG, RTLIB::FPEXT_F16_F32, MidVT, Op, CallOptions, SDLoc(N)).first; if (N->getValueType(0) == MVT::f32) @@ -694,7 +694,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FP_ROUND(SDNode *N) { assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_ROUND!"); TargetLowering::MakeLibCallOptions CallOptions; EVT OpVT = N->getOperand(IsStrict ? 
1 : 0).getValueType(); - CallOptions.setTypeListBeforeSoften(OpVT, N->getValueType(0), true); + CallOptions.setTypeListBeforeSoften(OpVT, N->getValueType(0)); std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall(DAG, LC, NVT, Op, CallOptions, SDLoc(N), Chain); @@ -742,7 +742,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_ExpOp(SDNode *N) { TargetLowering::MakeLibCallOptions CallOptions; EVT OpsVT[2] = { N->getOperand(0 + Offset).getValueType(), N->getOperand(1 + Offset).getValueType() }; - CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true); + CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0)); std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall(DAG, LC, NVT, Ops, CallOptions, SDLoc(N), Chain); @@ -779,7 +779,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FFREXP(SDNode *N) { // TODO: setTypeListBeforeSoften can't properly express multiple return types, // but we only really need to handle the 0th one for softening anyway. - CallOptions.setTypeListBeforeSoften({OpsVT}, VT0, true) + CallOptions.setTypeListBeforeSoften({OpsVT}, VT0) .setOpsTypeOverrides(CallOpsTypeOverrides); auto [ReturnVal, Chain] = TLI.makeLibCall(DAG, LC, NVT0, Ops, CallOptions, DL, @@ -828,7 +828,7 @@ bool DAGTypeLegalizer::SoftenFloatRes_UnaryWithTwoFPResults( TargetLowering::MakeLibCallOptions CallOptions; // TODO: setTypeListBeforeSoften can't properly express multiple return types, // but since both returns have the same type it should be okay. - CallOptions.setTypeListBeforeSoften({OpsVT}, VT, true) + CallOptions.setTypeListBeforeSoften({OpsVT}, VT) .setOpsTypeOverrides(CallOpsTypeOverrides); auto [ReturnVal, Chain] = TLI.makeLibCall(DAG, LC, NVT, Ops, CallOptions, DL, @@ -1100,7 +1100,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_XINT_TO_FP(SDNode *N) { NVT, N->getOperand(IsStrict ? 1 : 0)); TargetLowering::MakeLibCallOptions CallOptions; CallOptions.setIsSigned(Signed); - CallOptions.setTypeListBeforeSoften(SVT, RVT, true); + CallOptions.setTypeListBeforeSoften(SVT, RVT); std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall(DAG, LC, TLI.getTypeToTransformTo(*DAG.getContext(), RVT), Op, CallOptions, dl, Chain); @@ -1222,7 +1222,7 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_FP_ROUND(SDNode *N) { SDValue Chain = IsStrict ? N->getOperand(0) : SDValue(); Op = GetSoftenedFloat(Op); TargetLowering::MakeLibCallOptions CallOptions; - CallOptions.setTypeListBeforeSoften(SVT, RVT, true); + CallOptions.setTypeListBeforeSoften(SVT, RVT); std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall(DAG, LC, RVT, Op, CallOptions, SDLoc(N), Chain); @@ -1298,7 +1298,7 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_FP_TO_XINT(SDNode *N) { Op = GetSoftenedFloat(Op); SDValue Chain = IsStrict ? N->getOperand(0) : SDValue(); TargetLowering::MakeLibCallOptions CallOptions; - CallOptions.setTypeListBeforeSoften(SVT, RVT, true); + CallOptions.setTypeListBeforeSoften(SVT, RVT); std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall(DAG, LC, NVT, Op, CallOptions, dl, Chain); @@ -1453,7 +1453,7 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_Unary(SDNode *N, RTLIB::Libcall LC) { SDValue Chain = IsStrict ? 
N->getOperand(0) : SDValue(); TargetLowering::MakeLibCallOptions CallOptions; EVT OpVT = N->getOperand(0 + Offset).getValueType(); - CallOptions.setTypeListBeforeSoften(OpVT, N->getValueType(0), true); + CallOptions.setTypeListBeforeSoften(OpVT, N->getValueType(0)); std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall(DAG, LC, NVT, Op, CallOptions, SDLoc(N), Chain); @@ -1551,6 +1551,7 @@ void DAGTypeLegalizer::ExpandFloatResult(SDNode *N, unsigned ResNo) { case ISD::VAARG: ExpandRes_VAARG(N, Lo, Hi); break; case ISD::ConstantFP: ExpandFloatRes_ConstantFP(N, Lo, Hi); break; + case ISD::AssertNoFPClass: ExpandFloatRes_AssertNoFPClass(N, Lo, Hi); break; case ISD::FABS: ExpandFloatRes_FABS(N, Lo, Hi); break; case ISD::STRICT_FMINNUM: case ISD::FMINNUM: ExpandFloatRes_FMINNUM(N, Lo, Hi); break; @@ -1966,6 +1967,13 @@ void DAGTypeLegalizer::ExpandFloatRes_FNEG(SDNode *N, SDValue &Lo, Hi = DAG.getNode(ISD::FNEG, dl, Hi.getValueType(), Hi); } +void DAGTypeLegalizer::ExpandFloatRes_AssertNoFPClass(SDNode *N, SDValue &Lo, + SDValue &Hi) { + // TODO: Handle ppcf128 by preserving AssertNoFPClass for one of the halves. + SDLoc dl(N); + GetExpandedFloat(N->getOperand(0), Lo, Hi); +} + void DAGTypeLegalizer::ExpandFloatRes_FP_EXTEND(SDNode *N, SDValue &Lo, SDValue &Hi) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); @@ -3559,7 +3567,7 @@ SDValue DAGTypeLegalizer::SoftPromoteHalfRes_FP_ROUND(SDNode *N) { SDValue Chain = IsStrict ? N->getOperand(0) : SDValue(); Op = GetSoftenedFloat(Op); TargetLowering::MakeLibCallOptions CallOptions; - CallOptions.setTypeListBeforeSoften(SVT, RVT, true); + CallOptions.setTypeListBeforeSoften(SVT, RVT); std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall(DAG, LC, RVT, Op, CallOptions, SDLoc(N), Chain); if (IsStrict) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index a5bd97a..90d62e6 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -5260,20 +5260,18 @@ void DAGTypeLegalizer::ExpandIntRes_XMULO(SDNode *N, MachinePointerInfo()); TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; for (const SDValue &Op : N->op_values()) { EVT ArgVT = Op.getValueType(); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); - Entry.Node = Op; - Entry.Ty = ArgTy; + TargetLowering::ArgListEntry Entry(Op, ArgTy); Entry.IsSExt = true; Entry.IsZExt = false; Args.push_back(Entry); } // Also pass the address of the overflow check. 
- Entry.Node = Temp; - Entry.Ty = PointerType::getUnqual(PtrTy->getContext()); + TargetLowering::ArgListEntry Entry( + Temp, PointerType::getUnqual(PtrTy->getContext())); Entry.IsSExt = true; Entry.IsZExt = false; Args.push_back(Entry); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index 63544e6..33fa301 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -681,6 +681,7 @@ private: SDNode *N, RTLIB::Libcall LC, std::optional<unsigned> CallRetResNo = {}); // clang-format off + void ExpandFloatRes_AssertNoFPClass(SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FABS (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FACOS (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FASIN (SDNode *N, SDValue &Lo, SDValue &Hi); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index d2ecc133..2ca9895 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -2223,17 +2223,13 @@ bool VectorLegalizer::tryExpandVecMathCall(SDNode *Node, RTLIB::Libcall LC, SDLoc DL(Node); TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; - Entry.IsSExt = false; - Entry.IsZExt = false; unsigned OpNum = 0; for (auto &VFParam : OptVFInfo->Shape.Parameters) { if (VFParam.ParamKind == VFParamKind::GlobalPredicate) { EVT MaskVT = TLI.getSetCCResultType(DAG.getDataLayout(), *Ctx, VT); - Entry.Node = DAG.getBoolConstant(true, DL, MaskVT, VT); - Entry.Ty = MaskVT.getTypeForEVT(*Ctx); - Args.push_back(Entry); + Args.emplace_back(DAG.getBoolConstant(true, DL, MaskVT, VT), + MaskVT.getTypeForEVT(*Ctx)); continue; } @@ -2241,9 +2237,7 @@ bool VectorLegalizer::tryExpandVecMathCall(SDNode *Node, RTLIB::Libcall LC, if (VFParam.ParamKind != VFParamKind::Vector) return false; - Entry.Node = Node->getOperand(OpNum++); - Entry.Ty = Ty; - Args.push_back(Entry); + Args.emplace_back(Node->getOperand(OpNum++), Ty); } // Emit a call to the vector function. 
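The hunks above (ExpandLibCall, ExpandBitCountingLibCall, ExpandDivRemLibCall, ExpandIntRes_XMULO, tryExpandVecMathCall) all apply the same mechanical change: instead of default-constructing a TargetLowering::ArgListEntry and assigning its Node and Ty fields, call sites now use the (SDValue, Type *) constructor, or Args.emplace_back when no flags need adjusting. A minimal before/after sketch, reusing the names from the ExpandLibCall hunk (Op, ArgTy, TLI, isSigned); illustrative only, not a complete routine:

    // Before: one entry object reused across arguments, fields reassigned each time.
    {
      TargetLowering::ArgListTy Args;
      TargetLowering::ArgListEntry Entry;
      Entry.Node = Op;   // the SDValue being passed
      Entry.Ty = ArgTy;  // its IR type
      Entry.IsSExt = TLI.shouldSignExtendTypeInLibCall(ArgTy, isSigned);
      Entry.IsZExt = !Entry.IsSExt;
      Args.push_back(Entry);
    }

    // After: construct with (SDValue, Type *) and touch only the optional flags,
    // or emplace directly when the defaults are fine.
    {
      TargetLowering::ArgListTy Args;
      TargetLowering::ArgListEntry Entry(Op, ArgTy);
      Entry.IsSExt = TLI.shouldSignExtendTypeInLibCall(ArgTy, isSigned);
      Entry.IsZExt = !Entry.IsSExt;
      Args.push_back(Entry);

      Args.emplace_back(Op, ArgTy); // no flag overrides needed
    }

Scoping the entry inside the loop body also means flag values can no longer leak from one argument to the next, which was possible with the old shared Entry object.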
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index b9e72c9..23102d3 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -1371,7 +1371,7 @@ void SelectionDAG::init(MachineFunction &NewMF, const TargetLibraryInfo *LibraryInfo, UniformityInfo *NewUA, ProfileSummaryInfo *PSIin, BlockFrequencyInfo *BFIin, MachineModuleInfo &MMIin, - FunctionVarLocs const *VarLocs, bool HasDivergency) { + FunctionVarLocs const *VarLocs) { MF = &NewMF; SDAGISelPass = PassPtr; ORE = &NewORE; @@ -1384,7 +1384,6 @@ void SelectionDAG::init(MachineFunction &NewMF, BFI = BFIin; MMI = &MMIin; FnVarLocs = VarLocs; - DivergentTarget = HasDivergency; } SelectionDAG::~SelectionDAG() { @@ -2331,8 +2330,7 @@ SDValue SelectionDAG::getRegister(Register Reg, EVT VT) { return SDValue(E, 0); auto *N = newSDNode<RegisterSDNode>(Reg, VTs); - N->SDNodeBits.IsDivergent = - DivergentTarget && TLI->isSDNodeSourceOfDivergence(N, FLI, UA); + N->SDNodeBits.IsDivergent = TLI->isSDNodeSourceOfDivergence(N, FLI, UA); CSEMap.InsertNode(N, IP); InsertNode(N); return SDValue(N, 0); @@ -2578,18 +2576,12 @@ bool SelectionDAG::expandMultipleResultFPLibCall( } TargetLowering::ArgListTy Args; - auto AddArgListEntry = [&](SDValue Node, Type *Ty) { - TargetLowering::ArgListEntry Entry{}; - Entry.Ty = Ty; - Entry.Node = Node; - Args.push_back(Entry); - }; // Pass the arguments. for (const SDValue &Op : Node->op_values()) { EVT ArgVT = Op.getValueType(); Type *ArgTy = ArgVT.getTypeForEVT(Ctx); - AddArgListEntry(Op, ArgTy); + Args.emplace_back(Op, ArgTy); } // Pass the output pointers. @@ -2601,7 +2593,7 @@ bool SelectionDAG::expandMultipleResultFPLibCall( EVT ResVT = Node->getValueType(ResNo); SDValue ResultPtr = ST ? 
ST->getBasePtr() : CreateStackTemporary(ResVT); ResultPtrs[ResNo] = ResultPtr; - AddArgListEntry(ResultPtr, PointerTy); + Args.emplace_back(ResultPtr, PointerTy); } SDLoc DL(Node); @@ -2610,7 +2602,7 @@ bool SelectionDAG::expandMultipleResultFPLibCall( if (VD && VD->isMasked()) { EVT MaskVT = TLI->getSetCCResultType(getDataLayout(), Ctx, VT); SDValue Mask = getBoolConstant(true, DL, MaskVT, VT); - AddArgListEntry(Mask, MaskVT.getTypeForEVT(Ctx)); + Args.emplace_back(Mask, MaskVT.getTypeForEVT(Ctx)); } Type *RetType = CallRetResNo.has_value() @@ -5462,6 +5454,83 @@ bool SelectionDAG::isGuaranteedNotToBeUndefOrPoison(SDValue Op, } return true; + case ISD::EXTRACT_SUBVECTOR: { + SDValue Src = Op.getOperand(0); + if (Src.getValueType().isScalableVector()) + break; + uint64_t Idx = Op.getConstantOperandVal(1); + unsigned NumSrcElts = Src.getValueType().getVectorNumElements(); + APInt DemandedSrcElts = DemandedElts.zext(NumSrcElts).shl(Idx); + return isGuaranteedNotToBeUndefOrPoison(Src, DemandedSrcElts, PoisonOnly, + Depth + 1); + } + + case ISD::INSERT_SUBVECTOR: { + if (Op.getValueType().isScalableVector()) + break; + SDValue Src = Op.getOperand(0); + SDValue Sub = Op.getOperand(1); + uint64_t Idx = Op.getConstantOperandVal(2); + unsigned NumSubElts = Sub.getValueType().getVectorNumElements(); + APInt DemandedSubElts = DemandedElts.extractBits(NumSubElts, Idx); + APInt DemandedSrcElts = DemandedElts; + DemandedSrcElts.clearBits(Idx, Idx + NumSubElts); + + if (!!DemandedSubElts && !isGuaranteedNotToBeUndefOrPoison( + Sub, DemandedSubElts, PoisonOnly, Depth + 1)) + return false; + if (!!DemandedSrcElts && !isGuaranteedNotToBeUndefOrPoison( + Src, DemandedSrcElts, PoisonOnly, Depth + 1)) + return false; + return true; + } + + case ISD::EXTRACT_VECTOR_ELT: { + SDValue Src = Op.getOperand(0); + auto *IndexC = dyn_cast<ConstantSDNode>(Op.getOperand(1)); + EVT SrcVT = Src.getValueType(); + if (SrcVT.isFixedLengthVector() && IndexC && + IndexC->getAPIntValue().ult(SrcVT.getVectorNumElements())) { + APInt DemandedSrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), + IndexC->getZExtValue()); + return isGuaranteedNotToBeUndefOrPoison(Src, DemandedSrcElts, PoisonOnly, + Depth + 1); + } + break; + } + + case ISD::INSERT_VECTOR_ELT: { + SDValue InVec = Op.getOperand(0); + SDValue InVal = Op.getOperand(1); + SDValue EltNo = Op.getOperand(2); + EVT VT = InVec.getValueType(); + auto *IndexC = dyn_cast<ConstantSDNode>(EltNo); + if (IndexC && VT.isFixedLengthVector() && + IndexC->getAPIntValue().ult(VT.getVectorNumElements())) { + if (DemandedElts[IndexC->getZExtValue()] && + !isGuaranteedNotToBeUndefOrPoison(InVal, PoisonOnly, Depth + 1)) + return false; + APInt InVecDemandedElts = DemandedElts; + InVecDemandedElts.clearBit(IndexC->getZExtValue()); + if (!!InVecDemandedElts && + !isGuaranteedNotToBeUndefOrPoison(InVec, InVecDemandedElts, + PoisonOnly, Depth + 1)) + return false; + return true; + } + break; + } + + case ISD::SCALAR_TO_VECTOR: + // Check upper (known undef) elements. + if (DemandedElts.ugt(1) && !PoisonOnly) + return false; + // Check element zero. 
+ if (DemandedElts[0] && !isGuaranteedNotToBeUndefOrPoison( + Op.getOperand(0), PoisonOnly, Depth + 1)) + return false; + return true; + case ISD::SPLAT_VECTOR: return isGuaranteedNotToBeUndefOrPoison(Op.getOperand(0), PoisonOnly, Depth + 1); @@ -5484,6 +5553,52 @@ bool SelectionDAG::isGuaranteedNotToBeUndefOrPoison(SDValue Op, return true; } + case ISD::SHL: + case ISD::SRL: + case ISD::SRA: + // Shift amount operand is checked by canCreateUndefOrPoison. So it is + // enough to check operand 0 if Op can't create undef/poison. + return !canCreateUndefOrPoison(Op, DemandedElts, PoisonOnly, + /*ConsiderFlags*/ true, Depth) && + isGuaranteedNotToBeUndefOrPoison(Op.getOperand(0), DemandedElts, + PoisonOnly, Depth + 1); + + case ISD::BSWAP: + case ISD::CTPOP: + case ISD::BITREVERSE: + case ISD::AND: + case ISD::OR: + case ISD::XOR: + case ISD::ADD: + case ISD::SUB: + case ISD::MUL: + case ISD::SADDSAT: + case ISD::UADDSAT: + case ISD::SSUBSAT: + case ISD::USUBSAT: + case ISD::SSHLSAT: + case ISD::USHLSAT: + case ISD::SMIN: + case ISD::SMAX: + case ISD::UMIN: + case ISD::UMAX: + case ISD::ZERO_EXTEND: + case ISD::SIGN_EXTEND: + case ISD::ANY_EXTEND: + case ISD::TRUNCATE: + case ISD::VSELECT: { + // If Op can't create undef/poison and none of its operands are undef/poison + // then Op is never undef/poison. A difference from the more common check + // below, outside the switch, is that we handle elementwise operations for + // which the DemandedElts mask is valid for all operands here. + return !canCreateUndefOrPoison(Op, DemandedElts, PoisonOnly, + /*ConsiderFlags*/ true, Depth) && + all_of(Op->ops(), [&](SDValue V) { + return isGuaranteedNotToBeUndefOrPoison(V, DemandedElts, + PoisonOnly, Depth + 1); + }); + } + // TODO: Search for noundef attributes from library functions. // TODO: Pointers dereferenced by ISD::LOAD/STORE ops are noundef. @@ -5549,8 +5664,10 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts, case ISD::ABDS: case ISD::SMIN: case ISD::SMAX: + case ISD::SCMP: case ISD::UMIN: case ISD::UMAX: + case ISD::UCMP: case ISD::AND: case ISD::XOR: case ISD::ROTL: @@ -5630,7 +5747,11 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts, case ISD::FDIV: case ISD::FREM: case ISD::FCOPYSIGN: + case ISD::FMA: + case ISD::FMAD: case ISD::FP_EXTEND: + case ISD::FP_TO_SINT_SAT: + case ISD::FP_TO_UINT_SAT: // No poison except from flags (which is handled above) return false; @@ -8896,18 +9017,11 @@ SelectionDAG::getMemcmp(SDValue Chain, const SDLoc &dl, SDValue Mem0, if (!LibCallName) return {}; - // Emit a library call. - auto GetEntry = [](Type *Ty, SDValue &SDV) { - TargetLowering::ArgListEntry E; - E.Ty = Ty; - E.Node = SDV; - return E; - }; - PointerType *PT = PointerType::getUnqual(*getContext()); TargetLowering::ArgListTy Args = { - GetEntry(PT, Mem0), GetEntry(PT, Mem1), - GetEntry(getDataLayout().getIntPtrType(*getContext()), Size)}; + {Mem0, PT}, + {Mem1, PT}, + {Size, getDataLayout().getIntPtrType(*getContext())}}; TargetLowering::CallLoweringInfo CLI(*this); bool IsTailCall = false; @@ -8978,13 +9092,10 @@ SDValue SelectionDAG::getMemcpy( // Emit a library call. 
TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; - Entry.Ty = PointerType::getUnqual(*getContext()); - Entry.Node = Dst; Args.push_back(Entry); - Entry.Node = Src; Args.push_back(Entry); - - Entry.Ty = getDataLayout().getIntPtrType(*getContext()); - Entry.Node = Size; Args.push_back(Entry); + Type *PtrTy = PointerType::getUnqual(*getContext()); + Args.emplace_back(Dst, PtrTy); + Args.emplace_back(Src, PtrTy); + Args.emplace_back(Size, getDataLayout().getIntPtrType(*getContext())); // FIXME: pass in SDLoc TargetLowering::CallLoweringInfo CLI(*this); bool IsTailCall = false; @@ -9022,17 +9133,10 @@ SDValue SelectionDAG::getAtomicMemcpy(SDValue Chain, const SDLoc &dl, MachinePointerInfo SrcPtrInfo) { // Emit a library call. TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; - Entry.Ty = getDataLayout().getIntPtrType(*getContext()); - Entry.Node = Dst; - Args.push_back(Entry); - - Entry.Node = Src; - Args.push_back(Entry); - - Entry.Ty = SizeTy; - Entry.Node = Size; - Args.push_back(Entry); + Type *ArgTy = getDataLayout().getIntPtrType(*getContext()); + Args.emplace_back(Dst, ArgTy); + Args.emplace_back(Src, ArgTy); + Args.emplace_back(Size, SizeTy); RTLIB::Libcall LibraryCall = RTLIB::getMEMCPY_ELEMENT_UNORDERED_ATOMIC(ElemSz); @@ -9095,13 +9199,10 @@ SDValue SelectionDAG::getMemmove(SDValue Chain, const SDLoc &dl, SDValue Dst, // Emit a library call. TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; - Entry.Ty = PointerType::getUnqual(*getContext()); - Entry.Node = Dst; Args.push_back(Entry); - Entry.Node = Src; Args.push_back(Entry); - - Entry.Ty = getDataLayout().getIntPtrType(*getContext()); - Entry.Node = Size; Args.push_back(Entry); + Type *PtrTy = PointerType::getUnqual(*getContext()); + Args.emplace_back(Dst, PtrTy); + Args.emplace_back(Src, PtrTy); + Args.emplace_back(Size, getDataLayout().getIntPtrType(*getContext())); // FIXME: pass in SDLoc TargetLowering::CallLoweringInfo CLI(*this); @@ -9139,17 +9240,10 @@ SDValue SelectionDAG::getAtomicMemmove(SDValue Chain, const SDLoc &dl, MachinePointerInfo SrcPtrInfo) { // Emit a library call. TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; - Entry.Ty = getDataLayout().getIntPtrType(*getContext()); - Entry.Node = Dst; - Args.push_back(Entry); - - Entry.Node = Src; - Args.push_back(Entry); - - Entry.Ty = SizeTy; - Entry.Node = Size; - Args.push_back(Entry); + Type *IntPtrTy = getDataLayout().getIntPtrType(*getContext()); + Args.emplace_back(Dst, IntPtrTy); + Args.emplace_back(Src, IntPtrTy); + Args.emplace_back(Size, SizeTy); RTLIB::Libcall LibraryCall = RTLIB::getMEMMOVE_ELEMENT_UNORDERED_ATOMIC(ElemSz); @@ -9226,28 +9320,20 @@ SDValue SelectionDAG::getMemset(SDValue Chain, const SDLoc &dl, SDValue Dst, const char *BzeroName = getTargetLoweringInfo().getLibcallName(RTLIB::BZERO); - // Helper function to create an Entry from Node and Type. - const auto CreateEntry = [](SDValue Node, Type *Ty) { - TargetLowering::ArgListEntry Entry; - Entry.Node = Node; - Entry.Ty = Ty; - return Entry; - }; - bool UseBZero = isNullConstant(Src) && BzeroName; // If zeroing out and bzero is present, use it. 
if (UseBZero) { TargetLowering::ArgListTy Args; - Args.push_back(CreateEntry(Dst, PointerType::getUnqual(Ctx))); - Args.push_back(CreateEntry(Size, DL.getIntPtrType(Ctx))); + Args.emplace_back(Dst, PointerType::getUnqual(Ctx)); + Args.emplace_back(Size, DL.getIntPtrType(Ctx)); CLI.setLibCallee( TLI->getLibcallCallingConv(RTLIB::BZERO), Type::getVoidTy(Ctx), getExternalSymbol(BzeroName, TLI->getPointerTy(DL)), std::move(Args)); } else { TargetLowering::ArgListTy Args; - Args.push_back(CreateEntry(Dst, PointerType::getUnqual(Ctx))); - Args.push_back(CreateEntry(Src, Src.getValueType().getTypeForEVT(Ctx))); - Args.push_back(CreateEntry(Size, DL.getIntPtrType(Ctx))); + Args.emplace_back(Dst, PointerType::getUnqual(Ctx)); + Args.emplace_back(Src, Src.getValueType().getTypeForEVT(Ctx)); + Args.emplace_back(Size, DL.getIntPtrType(Ctx)); CLI.setLibCallee(TLI->getLibcallCallingConv(RTLIB::MEMSET), Dst.getValueType().getTypeForEVT(Ctx), getExternalSymbol(TLI->getLibcallName(RTLIB::MEMSET), @@ -9276,18 +9362,9 @@ SDValue SelectionDAG::getAtomicMemset(SDValue Chain, const SDLoc &dl, MachinePointerInfo DstPtrInfo) { // Emit a library call. TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; - Entry.Ty = getDataLayout().getIntPtrType(*getContext()); - Entry.Node = Dst; - Args.push_back(Entry); - - Entry.Ty = Type::getInt8Ty(*getContext()); - Entry.Node = Value; - Args.push_back(Entry); - - Entry.Ty = SizeTy; - Entry.Node = Size; - Args.push_back(Entry); + Args.emplace_back(Dst, getDataLayout().getIntPtrType(*getContext())); + Args.emplace_back(Value, Type::getInt8Ty(*getContext())); + Args.emplace_back(Size, SizeTy); RTLIB::Libcall LibraryCall = RTLIB::getMEMSET_ELEMENT_UNORDERED_ATOMIC(ElemSz); @@ -12264,8 +12341,6 @@ static bool gluePropagatesDivergence(const SDNode *Node) { } bool SelectionDAG::calculateDivergence(SDNode *N) { - if (!DivergentTarget) - return false; if (TLI->isSDNodeAlwaysUniform(N)) { assert(!TLI->isSDNodeSourceOfDivergence(N, FLI, UA) && "Conflicting divergence information!"); @@ -12285,8 +12360,6 @@ bool SelectionDAG::calculateDivergence(SDNode *N) { } void SelectionDAG::updateDivergence(SDNode *N) { - if (!DivergentTarget) - return; SmallVector<SDNode *, 16> Worklist(1, N); do { N = Worklist.pop_back_val(); @@ -13847,20 +13920,16 @@ void SelectionDAG::createOperands(SDNode *Node, ArrayRef<SDValue> Vals) { Ops[I].setInitial(Vals[I]); EVT VT = Ops[I].getValueType(); - // Take care of the Node's operands iff target has divergence // Skip Chain. It does not carry divergence. - if (DivergentTarget && VT != MVT::Other && + if (VT != MVT::Other && (VT != MVT::Glue || gluePropagatesDivergence(Ops[I].getNode())) && Ops[I].getNode()->isDivergent()) { - // Node is going to be divergent if at least one of its operand is - // divergent, unless it belongs to the "AlwaysUniform" exemptions. IsDivergent = true; } } Node->NumOperands = Vals.size(); Node->OperandList = Ops; - // Check the divergence of the Node itself. 
- if (DivergentTarget && !TLI->isSDNodeAlwaysUniform(Node)) { + if (!TLI->isSDNodeAlwaysUniform(Node)) { IsDivergent |= TLI->isSDNodeSourceOfDivergence(Node, FLI, UA); Node->SDNodeBits.IsDivergent = IsDivergent; } @@ -13950,10 +14019,7 @@ SDValue SelectionDAG::makeStateFunctionCall(unsigned LibFunc, SDValue Ptr, const SDLoc &DLoc) { assert(InChain.getValueType() == MVT::Other && "Expected token chain"); TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; - Entry.Node = Ptr; - Entry.Ty = Ptr.getValueType().getTypeForEVT(*getContext()); - Args.push_back(Entry); + Args.emplace_back(Ptr, Ptr.getValueType().getTypeForEVT(*getContext())); RTLIB::Libcall LC = static_cast<RTLIB::Libcall>(LibFunc); SDValue Callee = getExternalSymbol(TLI->getLibcallName(LC), TLI->getPointerTy(getDataLayout())); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index f5f5c14..901f10d 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -1837,11 +1837,8 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) { getValue(CPA->getDiscriminator())); } - if (isa<ConstantPointerNull>(C)) { - unsigned AS = V->getType()->getPointerAddressSpace(); - return DAG.getConstant(0, getCurSDLoc(), - TLI.getPointerTy(DAG.getDataLayout(), AS)); - } + if (isa<ConstantPointerNull>(C)) + return DAG.getConstant(0, getCurSDLoc(), VT); if (match(C, m_VScale())) return DAG.getVScale(getCurSDLoc(), VT, APInt(VT.getSizeInBits(), 1)); @@ -2211,9 +2208,9 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) { Chain = DAG.getNode(ISD::TokenFactor, getCurSDLoc(), MVT::Other, Chains); } else if (I.getNumOperands() != 0) { - SmallVector<EVT, 4> ValueVTs; - ComputeValueVTs(TLI, DL, I.getOperand(0)->getType(), ValueVTs); - unsigned NumValues = ValueVTs.size(); + SmallVector<Type *, 4> Types; + ComputeValueTypes(DL, I.getOperand(0)->getType(), Types); + unsigned NumValues = Types.size(); if (NumValues) { SDValue RetOp = getValue(I.getOperand(0)); @@ -2233,7 +2230,7 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) { bool RetInReg = F->getAttributes().hasRetAttr(Attribute::InReg); for (unsigned j = 0; j != NumValues; ++j) { - EVT VT = ValueVTs[j]; + EVT VT = TLI.getValueType(DL, Types[j]); if (ExtendKind != ISD::ANY_EXTEND && VT.isInteger()) VT = TLI.getTypeForExtReturn(Context, VT, ExtendKind); @@ -2273,8 +2270,9 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) { Flags.setNoExt(); for (unsigned i = 0; i < NumParts; ++i) { - Outs.push_back(ISD::OutputArg( - Flags, Parts[i].getValueType().getSimpleVT(), VT, 0, 0)); + Outs.push_back(ISD::OutputArg(Flags, + Parts[i].getValueType().getSimpleVT(), + VT, Types[j], 0, 0)); OutVals.push_back(Parts[i]); } } @@ -2292,6 +2290,7 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) { Flags.setSwiftError(); Outs.push_back(ISD::OutputArg(Flags, /*vt=*/TLI.getPointerTy(DL), /*argvt=*/EVT(TLI.getPointerTy(DL)), + PointerType::getUnqual(*DAG.getContext()), /*origidx=*/1, /*partOffs=*/0)); // Create SDNode for the swifterror virtual register. 
OutVals.push_back( @@ -3107,9 +3106,7 @@ void SelectionDAGBuilder::visitSPDescriptorParent(StackProtectorDescriptor &SPD, assert(FnTy->getNumParams() == 1 && "Invalid function signature"); TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; - Entry.Node = GuardVal; - Entry.Ty = FnTy->getParamType(0); + TargetLowering::ArgListEntry Entry(GuardVal, FnTy->getParamType(0)); if (GuardCheckFn->hasParamAttribute(0, Attribute::AttrKind::InReg)) Entry.IsInReg = true; Args.push_back(Entry); @@ -3206,9 +3203,7 @@ void SelectionDAGBuilder::visitSPDescriptorFailure( assert(FnTy->getNumParams() == 1 && "Invalid function signature"); TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; - Entry.Node = GuardVal; - Entry.Ty = FnTy->getParamType(0); + TargetLowering::ArgListEntry Entry(GuardVal, FnTy->getParamType(0)); if (GuardCheckFn->hasParamAttribute(0, Attribute::AttrKind::InReg)) Entry.IsInReg = true; Args.push_back(Entry); @@ -3578,7 +3573,7 @@ void SelectionDAGBuilder::visitIndirectBr(const IndirectBrInst &I) { MachineBasicBlock *IndirectBrMBB = FuncInfo.MBB; // Update machine-CFG edges with unique successors. - SmallSet<BasicBlock*, 32> Done; + SmallPtrSet<BasicBlock *, 32> Done; for (unsigned i = 0, e = I.getNumSuccessors(); i != e; ++i) { BasicBlock *BB = I.getSuccessor(i); bool Inserted = Done.insert(BB).second; @@ -3977,6 +3972,11 @@ void SelectionDAGBuilder::visitSIToFP(const User &I) { setValue(&I, DAG.getNode(ISD::SINT_TO_FP, getCurSDLoc(), DestVT, N)); } +void SelectionDAGBuilder::visitPtrToAddr(const User &I) { + // FIXME: this is not correct for pointers with addr width != pointer width + visitPtrToInt(I); +} + void SelectionDAGBuilder::visitPtrToInt(const User &I) { // What to do depends on the size of the integer and the size of the pointer. // We can either truncate, zero extend, or no-op, accordingly. @@ -4902,9 +4902,8 @@ void SelectionDAGBuilder::visitMaskedStore(const CallInst &I, // extract the splat value and use it as a uniform base. // In all other cases the function returns 'false'. 
static bool getUniformBase(const Value *Ptr, SDValue &Base, SDValue &Index, - ISD::MemIndexType &IndexType, SDValue &Scale, - SelectionDAGBuilder *SDB, const BasicBlock *CurBB, - uint64_t ElemSize) { + SDValue &Scale, SelectionDAGBuilder *SDB, + const BasicBlock *CurBB, uint64_t ElemSize) { SelectionDAG& DAG = SDB->DAG; const TargetLowering &TLI = DAG.getTargetLoweringInfo(); const DataLayout &DL = DAG.getDataLayout(); @@ -4922,7 +4921,6 @@ static bool getUniformBase(const Value *Ptr, SDValue &Base, SDValue &Index, ElementCount NumElts = cast<VectorType>(Ptr->getType())->getElementCount(); EVT VT = EVT::getVectorVT(*DAG.getContext(), TLI.getPointerTy(DL), NumElts); Index = DAG.getConstant(0, SDB->getCurSDLoc(), VT); - IndexType = ISD::SIGNED_SCALED; Scale = DAG.getTargetConstant(1, SDB->getCurSDLoc(), TLI.getPointerTy(DL)); return true; } @@ -4952,7 +4950,6 @@ static bool getUniformBase(const Value *Ptr, SDValue &Base, SDValue &Index, Base = SDB->getValue(BasePtr); Index = SDB->getValue(IndexVal); - IndexType = ISD::SIGNED_SCALED; Scale = DAG.getTargetConstant(ScaleVal, SDB->getCurSDLoc(), TLI.getPointerTy(DL)); @@ -4974,9 +4971,8 @@ void SelectionDAGBuilder::visitMaskedScatter(const CallInst &I) { SDValue Base; SDValue Index; - ISD::MemIndexType IndexType; SDValue Scale; - bool UniformBase = getUniformBase(Ptr, Base, Index, IndexType, Scale, this, + bool UniformBase = getUniformBase(Ptr, Base, Index, Scale, this, I.getParent(), VT.getScalarStoreSize()); unsigned AS = Ptr->getType()->getScalarType()->getPointerAddressSpace(); @@ -4986,8 +4982,8 @@ void SelectionDAGBuilder::visitMaskedScatter(const CallInst &I) { if (!UniformBase) { Base = DAG.getConstant(0, sdl, TLI.getPointerTy(DAG.getDataLayout())); Index = getValue(Ptr); - IndexType = ISD::SIGNED_SCALED; - Scale = DAG.getTargetConstant(1, sdl, TLI.getPointerTy(DAG.getDataLayout())); + Scale = + DAG.getTargetConstant(1, sdl, TLI.getPointerTy(DAG.getDataLayout())); } EVT IdxVT = Index.getValueType(); @@ -4999,7 +4995,7 @@ void SelectionDAGBuilder::visitMaskedScatter(const CallInst &I) { SDValue Ops[] = { getMemoryRoot(), Src0, Mask, Base, Index, Scale }; SDValue Scatter = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), VT, sdl, - Ops, MMO, IndexType, false); + Ops, MMO, ISD::SIGNED_SCALED, false); DAG.setRoot(Scatter); setValue(&I, Scatter); } @@ -5092,9 +5088,8 @@ void SelectionDAGBuilder::visitMaskedGather(const CallInst &I) { SDValue Root = DAG.getRoot(); SDValue Base; SDValue Index; - ISD::MemIndexType IndexType; SDValue Scale; - bool UniformBase = getUniformBase(Ptr, Base, Index, IndexType, Scale, this, + bool UniformBase = getUniformBase(Ptr, Base, Index, Scale, this, I.getParent(), VT.getScalarStoreSize()); unsigned AS = Ptr->getType()->getScalarType()->getPointerAddressSpace(); MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( @@ -5105,8 +5100,8 @@ void SelectionDAGBuilder::visitMaskedGather(const CallInst &I) { if (!UniformBase) { Base = DAG.getConstant(0, sdl, TLI.getPointerTy(DAG.getDataLayout())); Index = getValue(Ptr); - IndexType = ISD::SIGNED_SCALED; - Scale = DAG.getTargetConstant(1, sdl, TLI.getPointerTy(DAG.getDataLayout())); + Scale = + DAG.getTargetConstant(1, sdl, TLI.getPointerTy(DAG.getDataLayout())); } EVT IdxVT = Index.getValueType(); @@ -5117,8 +5112,9 @@ void SelectionDAGBuilder::visitMaskedGather(const CallInst &I) { } SDValue Ops[] = { Root, Src0, Mask, Base, Index, Scale }; - SDValue Gather = DAG.getMaskedGather(DAG.getVTList(VT, MVT::Other), VT, sdl, - Ops, MMO, IndexType, 
ISD::NON_EXTLOAD); + SDValue Gather = + DAG.getMaskedGather(DAG.getVTList(VT, MVT::Other), VT, sdl, Ops, MMO, + ISD::SIGNED_SCALED, ISD::NON_EXTLOAD); PendingLoads.push_back(Gather.getValue(1)); setValue(&I, Gather); @@ -6431,9 +6427,8 @@ void SelectionDAGBuilder::visitVectorHistogram(const CallInst &I, SDValue Root = DAG.getRoot(); SDValue Base; SDValue Index; - ISD::MemIndexType IndexType; SDValue Scale; - bool UniformBase = getUniformBase(Ptr, Base, Index, IndexType, Scale, this, + bool UniformBase = getUniformBase(Ptr, Base, Index, Scale, this, I.getParent(), VT.getScalarStoreSize()); unsigned AS = Ptr->getType()->getScalarType()->getPointerAddressSpace(); @@ -6446,7 +6441,6 @@ void SelectionDAGBuilder::visitVectorHistogram(const CallInst &I, if (!UniformBase) { Base = DAG.getConstant(0, sdl, TLI.getPointerTy(DAG.getDataLayout())); Index = getValue(Ptr); - IndexType = ISD::SIGNED_SCALED; Scale = DAG.getTargetConstant(1, sdl, TLI.getPointerTy(DAG.getDataLayout())); } @@ -6462,7 +6456,7 @@ void SelectionDAGBuilder::visitVectorHistogram(const CallInst &I, SDValue Ops[] = {Root, Inc, Mask, Base, Index, Scale, ID}; SDValue Histogram = DAG.getMaskedHistogram(DAG.getVTList(MVT::Other), VT, sdl, - Ops, MMO, IndexType); + Ops, MMO, ISD::SIGNED_SCALED); setValue(&I, Histogram); DAG.setRoot(Histogram); @@ -7514,10 +7508,8 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, } TargetLowering::ArgListTy Args; if (Intrinsic == Intrinsic::ubsantrap) { - Args.push_back(TargetLoweringBase::ArgListEntry()); - Args[0].Val = I.getArgOperand(0); - Args[0].Node = getValue(Args[0].Val); - Args[0].Ty = Args[0].Val->getType(); + Value *Arg = I.getArgOperand(0); + Args.emplace_back(Arg, getValue(Arg)); } TargetLowering::CallLoweringInfo CLI(DAG); @@ -7597,7 +7589,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, if (TM.getOptLevel() == CodeGenOptLevel::None) return; - const AllocaInst *LifetimeObject = dyn_cast<AllocaInst>(I.getArgOperand(1)); + const AllocaInst *LifetimeObject = dyn_cast<AllocaInst>(I.getArgOperand(0)); if (!LifetimeObject) return; @@ -7946,9 +7938,8 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, Args.reserve(3); for (unsigned Idx : {2, 3, 1}) { - TargetLowering::ArgListEntry Arg; - Arg.Node = getValue(I.getOperand(Idx)); - Arg.Ty = I.getOperand(Idx)->getType(); + TargetLowering::ArgListEntry Arg(getValue(I.getOperand(Idx)), + I.getOperand(Idx)->getType()); Arg.setAttributes(&I, Idx); Args.push_back(Arg); } @@ -7959,9 +7950,8 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, // Forward the flags and any additional arguments. for (unsigned Idx = 4; Idx < I.arg_size(); ++Idx) { - TargetLowering::ArgListEntry Arg; - Arg.Node = getValue(I.getOperand(Idx)); - Arg.Ty = I.getOperand(Idx)->getType(); + TargetLowering::ArgListEntry Arg(getValue(I.getOperand(Idx)), + I.getOperand(Idx)->getType()); Arg.setAttributes(&I, Idx); Args.push_back(Arg); } @@ -7983,6 +7973,42 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, HasTailCall = true; return; } + case Intrinsic::amdgcn_call_whole_wave: { + TargetLowering::ArgListTy Args; + + // The first argument is the callee. Skip it when assembling the call args. 
+ for (unsigned Idx = 1; Idx < I.arg_size(); ++Idx) { + TargetLowering::ArgListEntry Arg(getValue(I.getArgOperand(Idx)), + I.getArgOperand(Idx)->getType()); + Arg.setAttributes(&I, Idx); + Args.push_back(Arg); + } + + SDValue ConvControlToken; + if (auto Bundle = I.getOperandBundle(LLVMContext::OB_convergencectrl)) { + auto *Token = Bundle->Inputs[0].get(); + ConvControlToken = getValue(Token); + } + + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(getCurSDLoc()) + .setChain(getRoot()) + .setCallee(CallingConv::AMDGPU_Gfx_WholeWave, I.getType(), + getValue(I.getArgOperand(0)), std::move(Args)) + .setTailCall(false) + .setIsPreallocated( + I.countOperandBundlesOfType(LLVMContext::OB_preallocated) != 0) + .setConvergent(I.isConvergent()) + .setConvergenceControlToken(ConvControlToken); + CLI.CB = &I; + + std::pair<SDValue, SDValue> Result = + lowerInvokable(CLI, /*EHPadBB=*/nullptr); + + if (Result.first.getNode()) + setValue(&I, Result.first); + return; + } case Intrinsic::ptrmask: { SDValue Ptr = getValue(I.getOperand(0)); SDValue Mask = getValue(I.getOperand(1)); @@ -8487,14 +8513,12 @@ void SelectionDAGBuilder::visitVPGather( MachinePointerInfo(AS), MachineMemOperand::MOLoad, LocationSize::beforeOrAfterPointer(), *Alignment, AAInfo, Ranges); SDValue Base, Index, Scale; - ISD::MemIndexType IndexType; - bool UniformBase = getUniformBase(PtrOperand, Base, Index, IndexType, Scale, - this, VPIntrin.getParent(), - VT.getScalarStoreSize()); + bool UniformBase = + getUniformBase(PtrOperand, Base, Index, Scale, this, VPIntrin.getParent(), + VT.getScalarStoreSize()); if (!UniformBase) { Base = DAG.getConstant(0, DL, TLI.getPointerTy(DAG.getDataLayout())); Index = getValue(PtrOperand); - IndexType = ISD::SIGNED_SCALED; Scale = DAG.getTargetConstant(1, DL, TLI.getPointerTy(DAG.getDataLayout())); } EVT IdxVT = Index.getValueType(); @@ -8506,7 +8530,7 @@ void SelectionDAGBuilder::visitVPGather( LD = DAG.getGatherVP( DAG.getVTList(VT, MVT::Other), VT, DL, {DAG.getRoot(), Base, Index, Scale, OpValues[1], OpValues[2]}, MMO, - IndexType); + ISD::SIGNED_SCALED); PendingLoads.push_back(LD.getValue(1)); setValue(&VPIntrin, LD); } @@ -8550,16 +8574,13 @@ void SelectionDAGBuilder::visitVPScatter( MachinePointerInfo(AS), MachineMemOperand::MOStore, LocationSize::beforeOrAfterPointer(), *Alignment, AAInfo); SDValue Base, Index, Scale; - ISD::MemIndexType IndexType; - bool UniformBase = getUniformBase(PtrOperand, Base, Index, IndexType, Scale, - this, VPIntrin.getParent(), - VT.getScalarStoreSize()); + bool UniformBase = + getUniformBase(PtrOperand, Base, Index, Scale, this, VPIntrin.getParent(), + VT.getScalarStoreSize()); if (!UniformBase) { Base = DAG.getConstant(0, DL, TLI.getPointerTy(DAG.getDataLayout())); Index = getValue(PtrOperand); - IndexType = ISD::SIGNED_SCALED; - Scale = - DAG.getTargetConstant(1, DL, TLI.getPointerTy(DAG.getDataLayout())); + Scale = DAG.getTargetConstant(1, DL, TLI.getPointerTy(DAG.getDataLayout())); } EVT IdxVT = Index.getValueType(); EVT EltTy = IdxVT.getVectorElementType(); @@ -8570,7 +8591,7 @@ void SelectionDAGBuilder::visitVPScatter( ST = DAG.getScatterVP(DAG.getVTList(MVT::Other), VT, DL, {getMemoryRoot(), OpValues[0], Base, Index, Scale, OpValues[2], OpValues[3]}, - MMO, IndexType); + MMO, ISD::SIGNED_SCALED); DAG.setRoot(ST); setValue(&VPIntrin, ST); } @@ -8912,7 +8933,6 @@ void SelectionDAGBuilder::LowerCallTo(const CallBase &CB, SDValue Callee, } for (auto I = CB.arg_begin(), E = CB.arg_end(); I != E; ++I) { - TargetLowering::ArgListEntry Entry; const Value 
*V = *I; // Skip empty types @@ -8920,8 +8940,7 @@ void SelectionDAGBuilder::LowerCallTo(const CallBase &CB, SDValue Callee, continue; SDValue ArgNode = getValue(V); - Entry.Node = ArgNode; Entry.Ty = V->getType(); - + TargetLowering::ArgListEntry Entry(ArgNode, V->getType()); Entry.setAttributes(&CB, I - CB.arg_begin()); // Use swifterror virtual register as input to the call. @@ -8945,11 +8964,8 @@ void SelectionDAGBuilder::LowerCallTo(const CallBase &CB, SDValue Callee, // If call site has a cfguardtarget operand bundle, create and add an // additional ArgListEntry. if (auto Bundle = CB.getOperandBundle(LLVMContext::OB_cfguardtarget)) { - TargetLowering::ArgListEntry Entry; Value *V = Bundle->Inputs[0]; - SDValue ArgNode = getValue(V); - Entry.Node = ArgNode; - Entry.Ty = V->getType(); + TargetLowering::ArgListEntry Entry(V, getValue(V)); Entry.IsCFGuardTarget = true; Args.push_back(Entry); } @@ -10612,9 +10628,7 @@ void SelectionDAGBuilder::populateCallLoweringInfo( assert(!V->getType()->isEmptyTy() && "Empty type passed to intrinsic."); - TargetLowering::ArgListEntry Entry; - Entry.Node = getValue(V); - Entry.Ty = V->getType(); + TargetLowering::ArgListEntry Entry(getValue(V), V->getType()); Entry.setAttributes(Call, ArgI); Args.push_back(Entry); } @@ -10974,27 +10988,42 @@ static AttributeList getReturnAttrs(TargetLowering::CallLoweringInfo &CLI) { /// migrated to using LowerCall, this hook should be integrated into SDISel. std::pair<SDValue, SDValue> TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { + LLVMContext &Context = CLI.RetTy->getContext(); + // Handle the incoming return values from the call. CLI.Ins.clear(); - SmallVector<EVT, 4> RetTys; + SmallVector<Type *, 4> RetOrigTys; SmallVector<TypeSize, 4> Offsets; auto &DL = CLI.DAG.getDataLayout(); - ComputeValueVTs(*this, DL, CLI.RetTy, RetTys, &Offsets); + ComputeValueTypes(DL, CLI.OrigRetTy, RetOrigTys, &Offsets); + + SmallVector<EVT, 4> RetVTs; + if (CLI.RetTy != CLI.OrigRetTy) { + assert(RetOrigTys.size() == 1 && + "Only supported for non-aggregate returns"); + RetVTs.push_back(getValueType(DL, CLI.RetTy)); + } else { + for (Type *Ty : RetOrigTys) + RetVTs.push_back(getValueType(DL, Ty)); + } if (CLI.IsPostTypeLegalization) { // If we are lowering a libcall after legalization, split the return type. 
- SmallVector<EVT, 4> OldRetTys; + SmallVector<Type *, 4> OldRetOrigTys; + SmallVector<EVT, 4> OldRetVTs; SmallVector<TypeSize, 4> OldOffsets; - RetTys.swap(OldRetTys); + RetOrigTys.swap(OldRetOrigTys); + RetVTs.swap(OldRetVTs); Offsets.swap(OldOffsets); - for (size_t i = 0, e = OldRetTys.size(); i != e; ++i) { - EVT RetVT = OldRetTys[i]; + for (size_t i = 0, e = OldRetVTs.size(); i != e; ++i) { + EVT RetVT = OldRetVTs[i]; uint64_t Offset = OldOffsets[i]; - MVT RegisterVT = getRegisterType(CLI.RetTy->getContext(), RetVT); - unsigned NumRegs = getNumRegisters(CLI.RetTy->getContext(), RetVT); + MVT RegisterVT = getRegisterType(Context, RetVT); + unsigned NumRegs = getNumRegisters(Context, RetVT); unsigned RegisterVTByteSZ = RegisterVT.getSizeInBits() / 8; - RetTys.append(NumRegs, RegisterVT); + RetOrigTys.append(NumRegs, OldRetOrigTys[i]); + RetVTs.append(NumRegs, RegisterVT); for (unsigned j = 0; j != NumRegs; ++j) Offsets.push_back(TypeSize::getFixed(Offset + j * RegisterVTByteSZ)); } @@ -11005,7 +11034,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { bool CanLowerReturn = this->CanLowerReturn(CLI.CallConv, CLI.DAG.getMachineFunction(), - CLI.IsVarArg, Outs, CLI.RetTy->getContext(), CLI.RetTy); + CLI.IsVarArg, Outs, Context, CLI.RetTy); SDValue DemoteStackSlot; int DemoteStackIdx = -100; @@ -11018,30 +11047,16 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { MachineFunction &MF = CLI.DAG.getMachineFunction(); DemoteStackIdx = MF.getFrameInfo().CreateStackObject(TySize, Alignment, false); - Type *StackSlotPtrType = - PointerType::get(CLI.RetTy->getContext(), DL.getAllocaAddrSpace()); + Type *StackSlotPtrType = PointerType::get(Context, DL.getAllocaAddrSpace()); DemoteStackSlot = CLI.DAG.getFrameIndex(DemoteStackIdx, getFrameIndexTy(DL)); - ArgListEntry Entry; - Entry.Node = DemoteStackSlot; - Entry.Ty = StackSlotPtrType; - Entry.IsSExt = false; - Entry.IsZExt = false; - Entry.IsInReg = false; + ArgListEntry Entry(DemoteStackSlot, StackSlotPtrType); Entry.IsSRet = true; - Entry.IsNest = false; - Entry.IsByVal = false; - Entry.IsByRef = false; - Entry.IsReturned = false; - Entry.IsSwiftSelf = false; - Entry.IsSwiftAsync = false; - Entry.IsSwiftError = false; - Entry.IsCFGuardTarget = false; Entry.Alignment = Alignment; CLI.getArgs().insert(CLI.getArgs().begin(), Entry); CLI.NumFixedArgs += 1; CLI.getArgs()[0].IndirectType = CLI.RetTy; - CLI.RetTy = Type::getVoidTy(CLI.RetTy->getContext()); + CLI.RetTy = CLI.OrigRetTy = Type::getVoidTy(Context); // sret demotion isn't compatible with tail-calls, since the sret argument // points into the callers stack frame. 
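Flattened like this, the reworked return-type bookkeeping in TargetLowering::LowerCallTo is hard to follow, so here is a condensed sketch of what the added lines compute, using the same names as the hunk (CLI, DL); the IsPostTypeLegalization splitting shown just above is omitted from the sketch:

    // Leaf IR types (and byte offsets) of the original return type.
    SmallVector<Type *, 4> RetOrigTys;
    SmallVector<TypeSize, 4> Offsets;
    ComputeValueTypes(DL, CLI.OrigRetTy, RetOrigTys, &Offsets);

    // EVTs actually used for lowering. RetTy may differ from OrigRetTy only for
    // a single value, e.g. a softened float return lowered as an integer.
    SmallVector<EVT, 4> RetVTs;
    if (CLI.RetTy != CLI.OrigRetTy) {
      assert(RetOrigTys.size() == 1 && "Only supported for non-aggregate returns");
      RetVTs.push_back(getValueType(DL, CLI.RetTy));
    } else {
      for (Type *Ty : RetOrigTys)
        RetVTs.push_back(getValueType(DL, Ty));
    }

The original IR types collected here are what later flow into the ISD::InputArg and ISD::OutputArg entries, which in these patches now carry a Type * alongside the register and value EVTs.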
@@ -11049,36 +11064,32 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { } else { bool NeedsRegBlock = functionArgumentNeedsConsecutiveRegisters( CLI.RetTy, CLI.CallConv, CLI.IsVarArg, DL); - for (unsigned I = 0, E = RetTys.size(); I != E; ++I) { + for (unsigned I = 0, E = RetVTs.size(); I != E; ++I) { ISD::ArgFlagsTy Flags; if (NeedsRegBlock) { Flags.setInConsecutiveRegs(); - if (I == RetTys.size() - 1) + if (I == RetVTs.size() - 1) Flags.setInConsecutiveRegsLast(); } - EVT VT = RetTys[I]; - MVT RegisterVT = getRegisterTypeForCallingConv(CLI.RetTy->getContext(), - CLI.CallConv, VT); - unsigned NumRegs = getNumRegistersForCallingConv(CLI.RetTy->getContext(), - CLI.CallConv, VT); + EVT VT = RetVTs[I]; + MVT RegisterVT = getRegisterTypeForCallingConv(Context, CLI.CallConv, VT); + unsigned NumRegs = + getNumRegistersForCallingConv(Context, CLI.CallConv, VT); for (unsigned i = 0; i != NumRegs; ++i) { - ISD::InputArg MyFlags; - MyFlags.Flags = Flags; - MyFlags.VT = RegisterVT; - MyFlags.ArgVT = VT; - MyFlags.Used = CLI.IsReturnValueUsed; + ISD::InputArg Ret(Flags, RegisterVT, VT, RetOrigTys[I], + CLI.IsReturnValueUsed, ISD::InputArg::NoArgIndex, 0); if (CLI.RetTy->isPointerTy()) { - MyFlags.Flags.setPointer(); - MyFlags.Flags.setPointerAddrSpace( + Ret.Flags.setPointer(); + Ret.Flags.setPointerAddrSpace( cast<PointerType>(CLI.RetTy)->getAddressSpace()); } if (CLI.RetSExt) - MyFlags.Flags.setSExt(); + Ret.Flags.setSExt(); if (CLI.RetZExt) - MyFlags.Flags.setZExt(); + Ret.Flags.setZExt(); if (CLI.IsInReg) - MyFlags.Flags.setInReg(); - CLI.Ins.push_back(MyFlags); + Ret.Flags.setInReg(); + CLI.Ins.push_back(Ret); } } } @@ -11088,11 +11099,12 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { if (supportSwiftError()) { for (const ArgListEntry &Arg : Args) { if (Arg.IsSwiftError) { - ISD::InputArg MyFlags; - MyFlags.VT = getPointerTy(DL); - MyFlags.ArgVT = EVT(getPointerTy(DL)); - MyFlags.Flags.setSwiftError(); - CLI.Ins.push_back(MyFlags); + ISD::ArgFlagsTy Flags; + Flags.setSwiftError(); + ISD::InputArg Ret(Flags, getPointerTy(DL), EVT(getPointerTy(DL)), + PointerType::getUnqual(Context), + /*Used=*/true, ISD::InputArg::NoArgIndex, 0); + CLI.Ins.push_back(Ret); } } } @@ -11101,18 +11113,24 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { CLI.Outs.clear(); CLI.OutVals.clear(); for (unsigned i = 0, e = Args.size(); i != e; ++i) { - SmallVector<EVT, 4> ValueVTs; - ComputeValueVTs(*this, DL, Args[i].Ty, ValueVTs); + SmallVector<Type *, 4> OrigArgTys; + ComputeValueTypes(DL, Args[i].OrigTy, OrigArgTys); // FIXME: Split arguments if CLI.IsPostTypeLegalization Type *FinalType = Args[i].Ty; if (Args[i].IsByVal) FinalType = Args[i].IndirectType; bool NeedsRegBlock = functionArgumentNeedsConsecutiveRegisters( FinalType, CLI.CallConv, CLI.IsVarArg, DL); - for (unsigned Value = 0, NumValues = ValueVTs.size(); Value != NumValues; + for (unsigned Value = 0, NumValues = OrigArgTys.size(); Value != NumValues; ++Value) { - EVT VT = ValueVTs[Value]; - Type *ArgTy = VT.getTypeForEVT(CLI.RetTy->getContext()); + Type *OrigArgTy = OrigArgTys[Value]; + Type *ArgTy = OrigArgTy; + if (Args[i].Ty != Args[i].OrigTy) { + assert(Value == 0 && "Only supported for non-aggregate arguments"); + ArgTy = Args[i].Ty; + } + + EVT VT = getValueType(DL, ArgTy); SDValue Op = SDValue(Args[i].Node.getNode(), Args[i].Node.getResNo() + Value); ISD::ArgFlagsTy Flags; @@ -11125,10 +11143,9 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) 
const { if (i >= CLI.NumFixedArgs) Flags.setVarArg(); - if (Args[i].Ty->isPointerTy()) { + if (ArgTy->isPointerTy()) { Flags.setPointer(); - Flags.setPointerAddrSpace( - cast<PointerType>(Args[i].Ty)->getAddressSpace()); + Flags.setPointerAddrSpace(cast<PointerType>(ArgTy)->getAddressSpace()); } if (Args[i].IsZExt) Flags.setZExt(); @@ -11202,10 +11219,9 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { if (NeedsRegBlock) Flags.setInConsecutiveRegs(); - MVT PartVT = getRegisterTypeForCallingConv(CLI.RetTy->getContext(), - CLI.CallConv, VT); - unsigned NumParts = getNumRegistersForCallingConv(CLI.RetTy->getContext(), - CLI.CallConv, VT); + MVT PartVT = getRegisterTypeForCallingConv(Context, CLI.CallConv, VT); + unsigned NumParts = + getNumRegistersForCallingConv(Context, CLI.CallConv, VT); SmallVector<SDValue, 4> Parts(NumParts); ISD::NodeType ExtendKind = ISD::ANY_EXTEND; @@ -11222,7 +11238,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { (CLI.RetTy->isPointerTy() && Args[i].Ty->isPointerTy() && CLI.RetTy->getPointerAddressSpace() == Args[i].Ty->getPointerAddressSpace())) && - RetTys.size() == NumValues && "unexpected use of 'returned'"); + RetVTs.size() == NumValues && "unexpected use of 'returned'"); // Before passing 'returned' to the target lowering code, ensure that // either the register MVT and the actual EVT are the same size or that // the return value and argument are extended in the same way; in these @@ -11247,7 +11263,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { // For scalable vectors the scalable part is currently handled // by individual targets, so we just use the known minimum size here. ISD::OutputArg MyFlags( - Flags, Parts[j].getValueType().getSimpleVT(), VT, i, + Flags, Parts[j].getValueType().getSimpleVT(), VT, OrigArgTy, i, j * Parts[j].getValueType().getStoreSize().getKnownMinValue()); if (NumParts > 1 && j == 0) MyFlags.Flags.setSplit(); @@ -11303,7 +11319,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { // hidden sret parameter. 
MVT PtrVT = getPointerTy(DL, DL.getAllocaAddrSpace()); - unsigned NumValues = RetTys.size(); + unsigned NumValues = RetVTs.size(); ReturnValues.resize(NumValues); SmallVector<SDValue, 4> Chains(NumValues); @@ -11316,7 +11332,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { DemoteStackSlot, CLI.DAG.getConstant(Offsets[i], CLI.DL, PtrVT), CLI.DL, SDNodeFlags::NoUnsignedWrap); SDValue L = CLI.DAG.getLoad( - RetTys[i], CLI.DL, CLI.Chain, Add, + RetVTs[i], CLI.DL, CLI.Chain, Add, MachinePointerInfo::getFixedStack(CLI.DAG.getMachineFunction(), DemoteStackIdx, Offsets[i]), HiddenSRetAlign); @@ -11334,11 +11350,10 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { else if (CLI.RetZExt) AssertOp = ISD::AssertZext; unsigned CurReg = 0; - for (EVT VT : RetTys) { - MVT RegisterVT = getRegisterTypeForCallingConv(CLI.RetTy->getContext(), - CLI.CallConv, VT); - unsigned NumRegs = getNumRegistersForCallingConv(CLI.RetTy->getContext(), - CLI.CallConv, VT); + for (EVT VT : RetVTs) { + MVT RegisterVT = getRegisterTypeForCallingConv(Context, CLI.CallConv, VT); + unsigned NumRegs = + getNumRegistersForCallingConv(Context, CLI.CallConv, VT); ReturnValues.push_back(getCopyFromParts( CLI.DAG, CLI.DL, &InVals[CurReg], NumRegs, RegisterVT, VT, nullptr, @@ -11354,7 +11369,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { } SDValue Res = CLI.DAG.getNode(ISD::MERGE_VALUES, CLI.DL, - CLI.DAG.getVTList(RetTys), ReturnValues); + CLI.DAG.getVTList(RetVTs), ReturnValues); return std::make_pair(Res, CLI.Chain); } @@ -11625,7 +11640,7 @@ void SelectionDAGISel::LowerArguments(const Function &F) { ISD::ArgFlagsTy Flags; Flags.setSRet(); MVT RegisterVT = TLI->getRegisterType(*DAG.getContext(), ValueVT); - ISD::InputArg RetArg(Flags, RegisterVT, ValueVT, true, + ISD::InputArg RetArg(Flags, RegisterVT, ValueVT, F.getReturnType(), true, ISD::InputArg::NoArgIndex, 0); Ins.push_back(RetArg); } @@ -11640,8 +11655,8 @@ void SelectionDAGISel::LowerArguments(const Function &F) { // Set up the incoming argument description vector. for (const Argument &Arg : F.args()) { unsigned ArgNo = Arg.getArgNo(); - SmallVector<EVT, 4> ValueVTs; - ComputeValueVTs(*TLI, DAG.getDataLayout(), Arg.getType(), ValueVTs); + SmallVector<Type *, 4> Types; + ComputeValueTypes(DAG.getDataLayout(), Arg.getType(), Types); bool isArgValueUsed = !Arg.use_empty(); unsigned PartBase = 0; Type *FinalType = Arg.getType(); @@ -11649,17 +11664,15 @@ void SelectionDAGISel::LowerArguments(const Function &F) { FinalType = Arg.getParamByValType(); bool NeedsRegBlock = TLI->functionArgumentNeedsConsecutiveRegisters( FinalType, F.getCallingConv(), F.isVarArg(), DL); - for (unsigned Value = 0, NumValues = ValueVTs.size(); - Value != NumValues; ++Value) { - EVT VT = ValueVTs[Value]; - Type *ArgTy = VT.getTypeForEVT(*DAG.getContext()); + for (unsigned Value = 0, NumValues = Types.size(); Value != NumValues; + ++Value) { + Type *ArgTy = Types[Value]; + EVT VT = TLI->getValueType(DL, ArgTy); ISD::ArgFlagsTy Flags; - - if (Arg.getType()->isPointerTy()) { + if (ArgTy->isPointerTy()) { Flags.setPointer(); - Flags.setPointerAddrSpace( - cast<PointerType>(Arg.getType())->getAddressSpace()); + Flags.setPointerAddrSpace(cast<PointerType>(ArgTy)->getAddressSpace()); } if (Arg.hasAttribute(Attribute::ZExt)) Flags.setZExt(); @@ -11763,7 +11776,7 @@ void SelectionDAGISel::LowerArguments(const Function &F) { // are responsible for handling scalable vector arguments and // return values. 
ISD::InputArg MyFlags( - Flags, RegisterVT, VT, isArgValueUsed, ArgNo, + Flags, RegisterVT, VT, ArgTy, isArgValueUsed, ArgNo, PartBase + i * RegisterVT.getStoreSize().getKnownMinValue()); if (NumRegs > 1 && i == 0) MyFlags.Flags.setSplit(); @@ -12737,17 +12750,22 @@ static Register FollowCopyChain(MachineRegisterInfo &MRI, Register Reg) { assert(MI->getOpcode() == TargetOpcode::COPY && "start of copy chain MUST be COPY"); Reg = MI->getOperand(1).getReg(); + + // If the copied register in the first copy must be virtual. + assert(Reg.isVirtual() && "expected COPY of virtual register"); MI = MRI.def_begin(Reg)->getParent(); + // There may be an optional second copy. if (MI->getOpcode() == TargetOpcode::COPY) { assert(Reg.isVirtual() && "expected COPY of virtual register"); Reg = MI->getOperand(1).getReg(); assert(Reg.isPhysical() && "expected COPY of physical register"); - MI = MRI.def_begin(Reg)->getParent(); + } else { + // The start of the chain must be an INLINEASM_BR. + assert(MI->getOpcode() == TargetOpcode::INLINEASM_BR && + "end of copy chain MUST be INLINEASM_BR"); } - // The start of the chain must be an INLINEASM_BR. - assert(MI->getOpcode() == TargetOpcode::INLINEASM_BR && - "end of copy chain MUST be INLINEASM_BR"); + return Reg; } diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h index c251755..e0835e6 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h @@ -574,6 +574,7 @@ private: void visitFPToSI(const User &I); void visitUIToFP(const User &I); void visitSIToFP(const User &I); + void visitPtrToAddr(const User &I); void visitPtrToInt(const User &I); void visitIntToPtr(const User &I); void visitBitCast(const User &I); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index 26071ed..ece50ed 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -480,10 +480,7 @@ void SelectionDAGISel::initializeAnalysisResults( MachineModuleInfo &MMI = MAMP.getCachedResult<MachineModuleAnalysis>(*Fn.getParent())->getMMI(); - TTI = &FAM.getResult<TargetIRAnalysis>(Fn); - - CurDAG->init(*MF, *ORE, MFAM, LibInfo, UA, PSI, BFI, MMI, FnVarLocs, - TTI->hasBranchDivergence(&Fn)); + CurDAG->init(*MF, *ORE, MFAM, LibInfo, UA, PSI, BFI, MMI, FnVarLocs); // Now get the optional analyzes if we want to. // This is based on the possibly changed OptLevel (after optnone is taken @@ -501,6 +498,10 @@ void SelectionDAGISel::initializeAnalysisResults( BatchAA = std::nullopt; SP = &FAM.getResult<SSPLayoutAnalysis>(Fn); + +#if !defined(NDEBUG) && LLVM_ENABLE_ABI_BREAKING_CHECKS + TTI = &FAM.getResult<TargetIRAnalysis>(Fn); +#endif } void SelectionDAGISel::initializeAnalysisResults(MachineFunctionPass &MFP) { @@ -536,10 +537,7 @@ void SelectionDAGISel::initializeAnalysisResults(MachineFunctionPass &MFP) { MachineModuleInfo &MMI = MFP.getAnalysis<MachineModuleInfoWrapperPass>().getMMI(); - TTI = &MFP.getAnalysis<TargetTransformInfoWrapperPass>().getTTI(Fn); - - CurDAG->init(*MF, *ORE, &MFP, LibInfo, UA, PSI, BFI, MMI, FnVarLocs, - TTI->hasBranchDivergence(&Fn)); + CurDAG->init(*MF, *ORE, &MFP, LibInfo, UA, PSI, BFI, MMI, FnVarLocs); // Now get the optional analyzes if we want to. 
// This is based on the possibly changed OptLevel (after optnone is taken @@ -558,6 +556,10 @@ void SelectionDAGISel::initializeAnalysisResults(MachineFunctionPass &MFP) { BatchAA = std::nullopt; SP = &MFP.getAnalysis<StackProtector>().getLayoutInfo(); + +#if !defined(NDEBUG) && LLVM_ENABLE_ABI_BREAKING_CHECKS + TTI = &MFP.getAnalysis<TargetTransformInfoWrapperPass>().getTTI(Fn); +#endif } bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) { diff --git a/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp index 80aeefe..46a5e44 100644 --- a/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp @@ -1258,7 +1258,7 @@ void SelectionDAGBuilder::visitGCRelocate(const GCRelocateInst &Relocate) { if (Record.type == RecordType::Spill) { unsigned Index = Record.payload.FI; - SDValue SpillSlot = DAG.getTargetFrameIndex(Index, getFrameIndexTy()); + SDValue SpillSlot = DAG.getFrameIndex(Index, getFrameIndexTy()); // All the reloads are independent and are reading memory only modified by // statepoints (i.e. no other aliasing stores); informing SelectionDAG of diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index e235d14..402a012 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -162,14 +162,17 @@ TargetLowering::makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT, TargetLowering::ArgListTy Args; Args.reserve(Ops.size()); - TargetLowering::ArgListEntry Entry; ArrayRef<Type *> OpsTypeOverrides = CallOptions.OpsTypeOverrides; for (unsigned i = 0; i < Ops.size(); ++i) { SDValue NewOp = Ops[i]; - Entry.Node = NewOp; - Entry.Ty = i < OpsTypeOverrides.size() && OpsTypeOverrides[i] + Type *Ty = i < OpsTypeOverrides.size() && OpsTypeOverrides[i] ? 
OpsTypeOverrides[i] - : Entry.Node.getValueType().getTypeForEVT(*DAG.getContext()); + : NewOp.getValueType().getTypeForEVT(*DAG.getContext()); + TargetLowering::ArgListEntry Entry(NewOp, Ty); + if (CallOptions.IsSoften) + Entry.OrigTy = + CallOptions.OpsVTBeforeSoften[i].getTypeForEVT(*DAG.getContext()); + Entry.IsSExt = shouldSignExtendTypeInLibCall(Entry.Ty, CallOptions.IsSigned); Entry.IsZExt = !Entry.IsSExt; @@ -189,18 +192,21 @@ TargetLowering::makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT, DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout())); Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext()); + Type *OrigRetTy = RetTy; TargetLowering::CallLoweringInfo CLI(DAG); bool signExtend = shouldSignExtendTypeInLibCall(RetTy, CallOptions.IsSigned); bool zeroExtend = !signExtend; - if (CallOptions.IsSoften && - !shouldExtendTypeInLibCall(CallOptions.RetVTBeforeSoften)) { - signExtend = zeroExtend = false; + if (CallOptions.IsSoften) { + OrigRetTy = CallOptions.RetVTBeforeSoften.getTypeForEVT(*DAG.getContext()); + if (!shouldExtendTypeInLibCall(CallOptions.RetVTBeforeSoften)) + signExtend = zeroExtend = false; } CLI.setDebugLoc(dl) .setChain(InChain) - .setLibCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args)) + .setLibCallee(getLibcallCallingConv(LC), RetTy, OrigRetTy, Callee, + std::move(Args)) .setNoReturn(CallOptions.DoesNotReturn) .setDiscardResult(!CallOptions.IsReturnValueUsed) .setIsPostTypeLegalization(CallOptions.IsPostTypeLegalization) @@ -420,7 +426,7 @@ void TargetLowering::softenSetCCOperands(SelectionDAG &DAG, EVT VT, TargetLowering::MakeLibCallOptions CallOptions; EVT OpsVT[2] = { OldLHS.getValueType(), OldRHS.getValueType() }; - CallOptions.setTypeListBeforeSoften(OpsVT, RetVT, true); + CallOptions.setTypeListBeforeSoften(OpsVT, RetVT); auto Call = makeLibCall(DAG, LC1, RetVT, Ops, CallOptions, dl, Chain); NewLHS = Call.first; NewRHS = DAG.getConstant(0, dl, RetVT); @@ -5125,10 +5131,11 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, !ISD::isUnsignedIntSetCC(Cond))) && isTypeDesirableForOp(ISD::SETCC, N0.getOperand(0).getValueType())) { EVT NewVT = N0.getOperand(0).getValueType(); - SDValue NewConst = DAG.getConstant(ISD::isSignedIntSetCC(Cond) - ? C1.sext(NewVT.getSizeInBits()) - : C1.zext(NewVT.getSizeInBits()), - dl, NewVT); + SDValue NewConst = DAG.getConstant( + (N0->getFlags().hasNoSignedWrap() && !ISD::isUnsignedIntSetCC(Cond)) + ? 
C1.sext(NewVT.getSizeInBits()) + : C1.zext(NewVT.getSizeInBits()), + dl, NewVT); return DAG.getSetCC(dl, VT, N0.getOperand(0), NewConst, Cond); } @@ -10712,7 +10719,6 @@ SDValue TargetLowering::LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, SDLoc dl(GA); ArgListTy Args; - ArgListEntry Entry; const GlobalValue *GV = cast<GlobalValue>(GA->getGlobal()->stripPointerCastsAndAliases()); SmallString<32> NameString("__emutls_v."); @@ -10721,9 +10727,7 @@ SDValue TargetLowering::LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, const GlobalVariable *EmuTlsVar = GV->getParent()->getNamedGlobal(EmuTlsVarName); assert(EmuTlsVar && "Cannot find EmuTlsVar "); - Entry.Node = DAG.getGlobalAddress(EmuTlsVar, dl, PtrVT); - Entry.Ty = VoidPtrType; - Args.push_back(Entry); + Args.emplace_back(DAG.getGlobalAddress(EmuTlsVar, dl, PtrVT), VoidPtrType); SDValue EmuTlsGetAddr = DAG.getExternalSymbol("__emutls_get_address", PtrVT); diff --git a/llvm/lib/CodeGen/ShrinkWrap.cpp b/llvm/lib/CodeGen/ShrinkWrap.cpp index 41e956c..938f2d7 100644 --- a/llvm/lib/CodeGen/ShrinkWrap.cpp +++ b/llvm/lib/CodeGen/ShrinkWrap.cpp @@ -967,8 +967,14 @@ bool ShrinkWrapImpl::run(MachineFunction &MF) { << "\nRestore: " << printMBBReference(*Restore) << '\n'); MachineFrameInfo &MFI = MF.getFrameInfo(); - MFI.setSavePoint(Save); - MFI.setRestorePoint(Restore); + SmallVector<MachineBasicBlock *, 4> SavePoints; + SmallVector<MachineBasicBlock *, 4> RestorePoints; + if (Save) { + SavePoints.push_back(Save); + RestorePoints.push_back(Restore); + } + MFI.setSavePoints(SavePoints); + MFI.setRestorePoints(RestorePoints); ++NumCandidates; return Changed; } diff --git a/llvm/lib/CodeGen/SwiftErrorValueTracking.cpp b/llvm/lib/CodeGen/SwiftErrorValueTracking.cpp index decffdc..ff4b568 100644 --- a/llvm/lib/CodeGen/SwiftErrorValueTracking.cpp +++ b/llvm/lib/CodeGen/SwiftErrorValueTracking.cpp @@ -179,7 +179,7 @@ void SwiftErrorValueTracking::propagateVRegs() { // Check whether we have a single vreg def from all predecessors. // Otherwise we need a phi. 
SmallVector<std::pair<MachineBasicBlock *, Register>, 4> VRegs; - SmallSet<const MachineBasicBlock *, 8> Visited; + SmallPtrSet<const MachineBasicBlock *, 8> Visited; for (auto *Pred : MBB->predecessors()) { if (!Visited.insert(Pred).second) continue; diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index bf4c9f9..350948a 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -1738,13 +1738,13 @@ void llvm::GetReturnInfo(CallingConv::ID CC, Type *ReturnType, AttributeList attr, SmallVectorImpl<ISD::OutputArg> &Outs, const TargetLowering &TLI, const DataLayout &DL) { - SmallVector<EVT, 4> ValueVTs; - ComputeValueVTs(TLI, DL, ReturnType, ValueVTs); - unsigned NumValues = ValueVTs.size(); + SmallVector<Type *, 4> Types; + ComputeValueTypes(DL, ReturnType, Types); + unsigned NumValues = Types.size(); if (NumValues == 0) return; - for (unsigned j = 0, f = NumValues; j != f; ++j) { - EVT VT = ValueVTs[j]; + for (Type *Ty : Types) { + EVT VT = TLI.getValueType(DL, Ty); ISD::NodeType ExtendKind = ISD::ANY_EXTEND; if (attr.hasRetAttr(Attribute::SExt)) @@ -1772,7 +1772,7 @@ void llvm::GetReturnInfo(CallingConv::ID CC, Type *ReturnType, Flags.setZExt(); for (unsigned i = 0; i < NumParts; ++i) - Outs.push_back(ISD::OutputArg(Flags, PartVT, VT, 0, 0)); + Outs.push_back(ISD::OutputArg(Flags, PartVT, VT, Ty, 0, 0)); } } @@ -1893,6 +1893,7 @@ int TargetLoweringBase::InstructionOpcodeToISD(unsigned Opcode) const { case SIToFP: return ISD::SINT_TO_FP; case FPTrunc: return ISD::FP_ROUND; case FPExt: return ISD::FP_EXTEND; + case PtrToAddr: return ISD::BITCAST; case PtrToInt: return ISD::BITCAST; case IntToPtr: return ISD::BITCAST; case BitCast: return ISD::BITCAST; @@ -1923,6 +1924,8 @@ int TargetLoweringBase::IntrinsicIDToISD(Intrinsic::ID ID) const { return ISD::FEXP; case Intrinsic::exp2: return ISD::FEXP2; + case Intrinsic::log: + return ISD::FLOG; default: return ISD::DELETED_NODE; } diff --git a/llvm/lib/CodeGen/VirtRegMap.cpp b/llvm/lib/CodeGen/VirtRegMap.cpp index 99ba893..972bd8f 100644 --- a/llvm/lib/CodeGen/VirtRegMap.cpp +++ b/llvm/lib/CodeGen/VirtRegMap.cpp @@ -99,7 +99,7 @@ unsigned VirtRegMap::createSpillSlot(const TargetRegisterClass *RC) { // Set preferred alignment if we are still able to realign the stack auto &ST = MF->getSubtarget(); Align CurrentAlign = ST.getFrameLowering()->getStackAlign(); - if (Alignment > CurrentAlign && !ST.getRegisterInfo()->canRealignStack(*MF)) { + if (Alignment > CurrentAlign && !TRI->canRealignStack(*MF)) { Alignment = CurrentAlign; } int SS = MF->getFrameInfo().CreateSpillStackObject(Size, Alignment); diff --git a/llvm/lib/DebugInfo/LogicalView/Readers/LVBinaryReader.cpp b/llvm/lib/DebugInfo/LogicalView/Readers/LVBinaryReader.cpp index 80b4185..0df9137 100644 --- a/llvm/lib/DebugInfo/LogicalView/Readers/LVBinaryReader.cpp +++ b/llvm/lib/DebugInfo/LogicalView/Readers/LVBinaryReader.cpp @@ -275,7 +275,8 @@ void LVBinaryReader::mapVirtualAddress(const object::COFFObjectFile &COFFObj) { } Error LVBinaryReader::loadGenericTargetInfo(StringRef TheTriple, - StringRef TheFeatures) { + StringRef TheFeatures, + StringRef TheCPU) { std::string TargetLookupError; const Target *TheTarget = TargetRegistry::lookupTarget(TheTriple, TargetLookupError); @@ -298,9 +299,8 @@ Error LVBinaryReader::loadGenericTargetInfo(StringRef TheTriple, MAI.reset(AsmInfo); // Target subtargets. 
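Aside: the SmallSet -> SmallPtrSet swaps recur throughout this patch (CallPrinter, CaptureTracking, and SwiftErrorValueTracking above, plus several files below) and all follow the same pattern. A minimal sketch of that usage, not taken from the patch and with an invented helper name:

#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/IR/BasicBlock.h"
using namespace llvm;

// SmallPtrSet is the dedicated small-set for pointer keys. insert() returns
// {iterator, inserted}; .second is true only on first insertion, so it doubles
// as a "have we visited this block before?" check.
static bool markVisited(SmallPtrSet<const BasicBlock *, 8> &Visited,
                        const BasicBlock *BB) {
  return Visited.insert(BB).second;
}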
- StringRef CPU; MCSubtargetInfo *SubtargetInfo( - TheTarget->createMCSubtargetInfo(TheTriple, CPU, TheFeatures)); + TheTarget->createMCSubtargetInfo(TheTriple, TheCPU, TheFeatures)); if (!SubtargetInfo) return createStringError(errc::invalid_argument, "no subtarget info for target " + TheTriple); diff --git a/llvm/lib/DebugInfo/LogicalView/Readers/LVCodeViewReader.cpp b/llvm/lib/DebugInfo/LogicalView/Readers/LVCodeViewReader.cpp index e589551..2ff7081 100644 --- a/llvm/lib/DebugInfo/LogicalView/Readers/LVCodeViewReader.cpp +++ b/llvm/lib/DebugInfo/LogicalView/Readers/LVCodeViewReader.cpp @@ -1190,7 +1190,12 @@ Error LVCodeViewReader::loadTargetInfo(const ObjectFile &Obj) { FeaturesValue = SubtargetFeatures(); } FeaturesValue = *Features; - return loadGenericTargetInfo(TT.str(), FeaturesValue.getString()); + + StringRef CPU; + if (auto OptCPU = Obj.tryGetCPUName()) + CPU = *OptCPU; + + return loadGenericTargetInfo(TT.str(), FeaturesValue.getString(), CPU); } Error LVCodeViewReader::loadTargetInfo(const PDBFile &Pdb) { @@ -1200,8 +1205,9 @@ Error LVCodeViewReader::loadTargetInfo(const PDBFile &Pdb) { TT.setOS(Triple::Win32); StringRef TheFeature = ""; + StringRef TheCPU = ""; - return loadGenericTargetInfo(TT.str(), TheFeature); + return loadGenericTargetInfo(TT.str(), TheFeature, TheCPU); } std::string LVCodeViewReader::getRegisterName(LVSmall Opcode, diff --git a/llvm/lib/DebugInfo/LogicalView/Readers/LVDWARFReader.cpp b/llvm/lib/DebugInfo/LogicalView/Readers/LVDWARFReader.cpp index 696e2bc..62134df 100644 --- a/llvm/lib/DebugInfo/LogicalView/Readers/LVDWARFReader.cpp +++ b/llvm/lib/DebugInfo/LogicalView/Readers/LVDWARFReader.cpp @@ -956,10 +956,7 @@ LVElement *LVDWARFReader::getElementForOffset(LVOffset Offset, Error LVDWARFReader::loadTargetInfo(const ObjectFile &Obj) { // Detect the architecture from the object file. We usually don't need OS // info to lookup a target and create register info. 
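Aside: a hedged sketch of the target-lookup pattern the LogicalView readers adopt above; the helper below is my own illustration, assuming ObjectFile::makeTriple() and the optional tryGetCPUName() behave as the hunks use them.

#include "llvm/Object/ObjectFile.h"
#include "llvm/TargetParser/Triple.h"
#include <string>
#include <utility>
using namespace llvm;
using namespace llvm::object;

// Derive the triple and, when the object file records one, the CPU name,
// instead of hand-assembling arch/vendor/OS fields and leaving the CPU blank.
static std::pair<std::string, std::string>
describeTarget(const object::ObjectFile &Obj) {
  Triple TT = Obj.makeTriple();
  StringRef CPU;
  if (auto OptCPU = Obj.tryGetCPUName())
    CPU = *OptCPU;
  return {TT.str(), CPU.str()};
}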
- Triple TT; - TT.setArch(Triple::ArchType(Obj.getArch())); - TT.setVendor(Triple::UnknownVendor); - TT.setOS(Triple::UnknownOS); + Triple TT = Obj.makeTriple(); // Features to be passed to target/subtarget Expected<SubtargetFeatures> Features = Obj.getFeatures(); @@ -969,7 +966,12 @@ Error LVDWARFReader::loadTargetInfo(const ObjectFile &Obj) { FeaturesValue = SubtargetFeatures(); } FeaturesValue = *Features; - return loadGenericTargetInfo(TT.str(), FeaturesValue.getString()); + + StringRef CPU; + if (auto OptCPU = Obj.tryGetCPUName()) + CPU = *OptCPU; + + return loadGenericTargetInfo(TT.str(), FeaturesValue.getString(), CPU); } void LVDWARFReader::mapRangeAddress(const ObjectFile &Obj) { diff --git a/llvm/lib/ExecutionEngine/Orc/Debugging/DebuggerSupportPlugin.cpp b/llvm/lib/ExecutionEngine/Orc/Debugging/DebuggerSupportPlugin.cpp index 1bafed7..ba27aa87 100644 --- a/llvm/lib/ExecutionEngine/Orc/Debugging/DebuggerSupportPlugin.cpp +++ b/llvm/lib/ExecutionEngine/Orc/Debugging/DebuggerSupportPlugin.cpp @@ -64,7 +64,7 @@ public: LLVM_DEBUG({ dbgs() << " Preserving debug section " << Sec.getName() << "\n"; }); - SmallSet<Block *, 8> PreservedBlocks; + SmallPtrSet<Block *, 8> PreservedBlocks; for (auto *Sym : Sec.symbols()) { bool NewPreservedBlock = PreservedBlocks.insert(&Sym->getBlock()).second; diff --git a/llvm/lib/ExecutionEngine/Orc/EPCIndirectionUtils.cpp b/llvm/lib/ExecutionEngine/Orc/EPCIndirectionUtils.cpp index 8e4937d..91a3115 100644 --- a/llvm/lib/ExecutionEngine/Orc/EPCIndirectionUtils.cpp +++ b/llvm/lib/ExecutionEngine/Orc/EPCIndirectionUtils.cpp @@ -169,7 +169,7 @@ Error EPCIndirectStubsManager::createStubs(const StubInitsMap &StubInits) { std::vector<tpctypes::UInt64Write> PtrUpdates; for (auto &SI : StubInits) PtrUpdates.push_back({(*AvailableStubInfos)[ASIdx++].PointerAddress, - static_cast<uint64_t>(SI.second.first.getValue())}); + SI.second.first.getValue()}); return MemAccess.writeUInt64s(PtrUpdates); } default: diff --git a/llvm/lib/ExecutionEngine/Orc/ThreadSafeModule.cpp b/llvm/lib/ExecutionEngine/Orc/ThreadSafeModule.cpp index 19c000e..d460cf6 100644 --- a/llvm/lib/ExecutionEngine/Orc/ThreadSafeModule.cpp +++ b/llvm/lib/ExecutionEngine/Orc/ThreadSafeModule.cpp @@ -14,40 +14,39 @@ namespace llvm { namespace orc { -ThreadSafeModule cloneToContext(const ThreadSafeModule &TSM, - ThreadSafeContext TSCtx, - GVPredicate ShouldCloneDef, - GVModifier UpdateClonedDefSource) { - assert(TSM && "Can not clone null module"); - - if (!ShouldCloneDef) - ShouldCloneDef = [](const GlobalValue &) { return true; }; - - // First copy the source module into a buffer. 
+static std::pair<std::string, SmallVector<char, 1>> +serializeModule(const Module &M, GVPredicate ShouldCloneDef, + GVModifier UpdateClonedDefSource) { std::string ModuleName; SmallVector<char, 1> ClonedModuleBuffer; - TSM.withModuleDo([&](Module &M) { - ModuleName = M.getModuleIdentifier(); - std::set<GlobalValue *> ClonedDefsInSrc; - ValueToValueMapTy VMap; - auto Tmp = CloneModule(M, VMap, [&](const GlobalValue *GV) { - if (ShouldCloneDef(*GV)) { - ClonedDefsInSrc.insert(const_cast<GlobalValue *>(GV)); - return true; - } - return false; - }); - - if (UpdateClonedDefSource) - for (auto *GV : ClonedDefsInSrc) - UpdateClonedDefSource(*GV); - - BitcodeWriter BCWriter(ClonedModuleBuffer); - BCWriter.writeModule(*Tmp); - BCWriter.writeSymtab(); - BCWriter.writeStrtab(); + + ModuleName = M.getModuleIdentifier(); + std::set<GlobalValue *> ClonedDefsInSrc; + ValueToValueMapTy VMap; + auto Tmp = CloneModule(M, VMap, [&](const GlobalValue *GV) { + if (ShouldCloneDef(*GV)) { + ClonedDefsInSrc.insert(const_cast<GlobalValue *>(GV)); + return true; + } + return false; }); + if (UpdateClonedDefSource) + for (auto *GV : ClonedDefsInSrc) + UpdateClonedDefSource(*GV); + + BitcodeWriter BCWriter(ClonedModuleBuffer); + BCWriter.writeModule(*Tmp); + BCWriter.writeSymtab(); + BCWriter.writeStrtab(); + + return {std::move(ModuleName), std::move(ClonedModuleBuffer)}; +} + +ThreadSafeModule +deserializeModule(std::string ModuleName, + const SmallVector<char, 1> &ClonedModuleBuffer, + ThreadSafeContext TSCtx) { MemoryBufferRef ClonedModuleBufferRef( StringRef(ClonedModuleBuffer.data(), ClonedModuleBuffer.size()), "cloned module buffer"); @@ -63,6 +62,40 @@ ThreadSafeModule cloneToContext(const ThreadSafeModule &TSM, return ThreadSafeModule(std::move(M), std::move(TSCtx)); } +ThreadSafeModule +cloneExternalModuleToContext(const Module &M, ThreadSafeContext TSCtx, + GVPredicate ShouldCloneDef, + GVModifier UpdateClonedDefSource) { + + if (!ShouldCloneDef) + ShouldCloneDef = [](const GlobalValue &) { return true; }; + + auto [ModuleName, ClonedModuleBuffer] = serializeModule( + M, std::move(ShouldCloneDef), std::move(UpdateClonedDefSource)); + + return deserializeModule(std::move(ModuleName), ClonedModuleBuffer, + std::move(TSCtx)); +} + +ThreadSafeModule cloneToContext(const ThreadSafeModule &TSM, + ThreadSafeContext TSCtx, + GVPredicate ShouldCloneDef, + GVModifier UpdateClonedDefSource) { + assert(TSM && "Can not clone null module"); + + if (!ShouldCloneDef) + ShouldCloneDef = [](const GlobalValue &) { return true; }; + + // First copy the source module into a buffer. 
+ auto [ModuleName, ClonedModuleBuffer] = TSM.withModuleDo([&](Module &M) { + return serializeModule(M, std::move(ShouldCloneDef), + std::move(UpdateClonedDefSource)); + }); + + return deserializeModule(std::move(ModuleName), ClonedModuleBuffer, + std::move(TSCtx)); +} + ThreadSafeModule cloneToNewContext(const ThreadSafeModule &TSM, GVPredicate ShouldCloneDef, GVModifier UpdateClonedDefSource) { diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp index 08d6c78..d626803 100644 --- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp +++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp @@ -654,11 +654,10 @@ bool RuntimeDyldELF::resolveLoongArch64ShortBranch( if (Loc == GlobalSymbolTable.end()) return false; const auto &SymInfo = Loc->second; - Address = - uint64_t(Sections[SymInfo.getSectionID()].getLoadAddressWithOffset( - SymInfo.getOffset())); + Address = Sections[SymInfo.getSectionID()].getLoadAddressWithOffset( + SymInfo.getOffset()); } else { - Address = uint64_t(Sections[Value.SectionID].getLoadAddress()); + Address = Sections[Value.SectionID].getLoadAddress(); } uint64_t Offset = RelI->getOffset(); uint64_t SourceAddress = Sections[SectionID].getLoadAddressWithOffset(Offset); diff --git a/llvm/lib/Frontend/HLSL/HLSLBinding.cpp b/llvm/lib/Frontend/HLSL/HLSLBinding.cpp index d581311..4539146 100644 --- a/llvm/lib/Frontend/HLSL/HLSLBinding.cpp +++ b/llvm/lib/Frontend/HLSL/HLSLBinding.cpp @@ -76,7 +76,7 @@ BindingInfo BindingInfoBuilder::calculateBindingInfo( // remove duplicates Binding *NewEnd = llvm::unique(Bindings); if (NewEnd != Bindings.end()) - Bindings.erase(NewEnd); + Bindings.erase(NewEnd, Bindings.end()); BindingInfo Info; diff --git a/llvm/lib/Frontend/HLSL/HLSLRootSignature.cpp b/llvm/lib/Frontend/HLSL/HLSLRootSignature.cpp index 574883e..92c62b8 100644 --- a/llvm/lib/Frontend/HLSL/HLSLRootSignature.cpp +++ b/llvm/lib/Frontend/HLSL/HLSLRootSignature.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Frontend/HLSL/HLSLRootSignature.h" +#include "llvm/Support/DXILABI.h" #include "llvm/Support/ScopedPrinter.h" namespace llvm { @@ -92,10 +93,9 @@ static raw_ostream &operator<<(raw_ostream &OS, return OS; } -static raw_ostream &operator<<(raw_ostream &OS, const ClauseType &Type) { - OS << enumToStringRef(dxil::ResourceClass(llvm::to_underlying(Type)), - dxbc::getResourceClasses()); - +static raw_ostream &operator<<(raw_ostream &OS, + const dxil::ResourceClass &Type) { + OS << dxil::getResourceClassName(Type); return OS; } @@ -153,8 +153,7 @@ raw_ostream &operator<<(raw_ostream &OS, const DescriptorTableClause &Clause) { } raw_ostream &operator<<(raw_ostream &OS, const RootDescriptor &Descriptor) { - ClauseType Type = ClauseType(llvm::to_underlying(Descriptor.Type)); - OS << "Root" << Type << "(" << Descriptor.Reg + OS << "Root" << Descriptor.Type << "(" << Descriptor.Reg << ", space = " << Descriptor.Space << ", visibility = " << Descriptor.Visibility << ", flags = " << Descriptor.Flags << ")"; diff --git a/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp b/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp index 1cda308..dece8f1 100644 --- a/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp +++ b/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp @@ -15,6 +15,7 @@ #include "llvm/Frontend/HLSL/RootSignatureValidations.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Metadata.h" +#include "llvm/Support/DXILABI.h" #include 
"llvm/Support/ScopedPrinter.h" using namespace llvm; @@ -119,9 +120,7 @@ MDNode *MetadataBuilder::BuildRootConstants(const RootConstants &Constants) { MDNode *MetadataBuilder::BuildRootDescriptor(const RootDescriptor &Descriptor) { IRBuilder<> Builder(Ctx); - StringRef ResName = - enumToStringRef(dxil::ResourceClass(to_underlying(Descriptor.Type)), - dxbc::getResourceClasses()); + StringRef ResName = dxil::getResourceClassName(Descriptor.Type); assert(!ResName.empty() && "Provided an invalid Resource Class"); SmallString<7> Name({"Root", ResName}); Metadata *Operands[] = { @@ -161,9 +160,7 @@ MDNode *MetadataBuilder::BuildDescriptorTable(const DescriptorTable &Table) { MDNode *MetadataBuilder::BuildDescriptorTableClause( const DescriptorTableClause &Clause) { IRBuilder<> Builder(Ctx); - StringRef ResName = - enumToStringRef(dxil::ResourceClass(to_underlying(Clause.Type)), - dxbc::getResourceClasses()); + StringRef ResName = dxil::getResourceClassName(Clause.Type); assert(!ResName.empty() && "Provided an invalid Resource Class"); Metadata *Operands[] = { MDString::get(Ctx, ResName), diff --git a/llvm/lib/Frontend/HLSL/RootSignatureValidations.cpp b/llvm/lib/Frontend/HLSL/RootSignatureValidations.cpp index 9d84aa8..72308a3d 100644 --- a/llvm/lib/Frontend/HLSL/RootSignatureValidations.cpp +++ b/llvm/lib/Frontend/HLSL/RootSignatureValidations.cpp @@ -29,7 +29,7 @@ bool verifyRegisterValue(uint32_t RegisterValue) { // This Range is reserverved, therefore invalid, according to the spec // https://github.com/llvm/wg-hlsl/blob/main/proposals/0002-root-signature-in-clang.md#all-the-values-should-be-legal bool verifyRegisterSpace(uint32_t RegisterSpace) { - return !(RegisterSpace >= 0xFFFFFFF0 && RegisterSpace <= 0xFFFFFFFF); + return !(RegisterSpace >= 0xFFFFFFF0); } bool verifyRootDescriptorFlag(uint32_t Version, uint32_t FlagsVal) { diff --git a/llvm/lib/Frontend/OpenMP/OMP.cpp b/llvm/lib/Frontend/OpenMP/OMP.cpp index 555e2a6..9e625b8 100644 --- a/llvm/lib/Frontend/OpenMP/OMP.cpp +++ b/llvm/lib/Frontend/OpenMP/OMP.cpp @@ -190,7 +190,7 @@ bool isCombinedConstruct(Directive D) { } ArrayRef<unsigned> getOpenMPVersions() { - static unsigned Versions[]{31, 40, 45, 50, 51, 52, 60}; + static unsigned Versions[]{31, 40, 45, 50, 51, 52, 60, 61}; return Versions; } diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index ea027e4..e9147a4 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -151,6 +151,18 @@ static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType) { } #endif +/// This is wrapper over IRBuilderBase::restoreIP that also restores the current +/// debug location to the last instruction in the specified basic block if the +/// insert point points to the end of the block. 
+static void restoreIPandDebugLoc(llvm::IRBuilderBase &Builder, + llvm::IRBuilderBase::InsertPoint IP) { + Builder.restoreIP(IP); + llvm::BasicBlock *BB = Builder.GetInsertBlock(); + llvm::BasicBlock::iterator I = Builder.GetInsertPoint(); + if (!BB->empty() && I == BB->end()) + Builder.SetCurrentDebugLocation(BB->back().getStableDebugLoc()); +} + static const omp::GV &getGridValue(const Triple &T, Function *Kernel) { if (T.isAMDGPU()) { StringRef Features = @@ -5918,7 +5930,7 @@ void OpenMPIRBuilder::applySimd(CanonicalLoopInfo *CanonicalLoop, createIfVersion(CanonicalLoop, IfCond, VMap, LIA, LI, L, "simd"); } - SmallSet<BasicBlock *, 8> Reachable; + SmallPtrSet<BasicBlock *, 8> Reachable; // Get the basic blocks from the loop in which memref instructions // can be found. @@ -7235,7 +7247,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTargetData( BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv); if (!AfterIP) return AfterIP.takeError(); - Builder.restoreIP(*AfterIP); + restoreIPandDebugLoc(Builder, *AfterIP); if (IfCond) return emitIfClause(IfCond, EndThenGen, EndElseGen, AllocaIP); @@ -8993,7 +9005,7 @@ Error OpenMPIRBuilder::emitOffloadingArrays( ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs); Info.RTArgs.SizesArray = Builder.CreateAlloca( SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes"); - Builder.restoreIP(CodeGenIP); + restoreIPandDebugLoc(Builder, CodeGenIP); } else { auto *SizesArrayInit = ConstantArray::get( ArrayType::get(Int64Ty, ConstSizes.size()), ConstSizes); @@ -9012,7 +9024,7 @@ Error OpenMPIRBuilder::emitOffloadingArrays( AllocaInst *Buffer = Builder.CreateAlloca( SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes"); Buffer->setAlignment(OffloadSizeAlign); - Builder.restoreIP(CodeGenIP); + restoreIPandDebugLoc(Builder, CodeGenIP); Builder.CreateMemCpy( Buffer, M.getDataLayout().getPrefTypeAlign(Buffer->getType()), SizesArrayGbl, OffloadSizeAlign, @@ -9022,7 +9034,7 @@ Error OpenMPIRBuilder::emitOffloadingArrays( Info.RTArgs.SizesArray = Buffer; } - Builder.restoreIP(CodeGenIP); + restoreIPandDebugLoc(Builder, CodeGenIP); } // The map types are always constant so we don't need to generate code to diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index 7159107..e200f36 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -1311,14 +1311,15 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, } break; case 'l': - if (Name.starts_with("lifetime.start") || - Name.starts_with("lifetime.end")) { - // Unless remangling is required, do not upgrade the function declaration, - // but do upgrade the calls. - if (auto Result = llvm::Intrinsic::remangleIntrinsicFunction(F)) - NewFn = *Result; - else - NewFn = F; + if ((Name.starts_with("lifetime.start") || + Name.starts_with("lifetime.end")) && + F->arg_size() == 2) { + Intrinsic::ID IID = Name.starts_with("lifetime.start") + ? 
Intrinsic::lifetime_start + : Intrinsic::lifetime_end; + rename(F); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), IID, + F->getArg(0)->getType()); return true; } break; @@ -5133,21 +5134,20 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { case Intrinsic::lifetime_start: case Intrinsic::lifetime_end: { - Value *Size = CI->getArgOperand(0); - Value *Ptr = CI->getArgOperand(1); - if (isa<AllocaInst>(Ptr)) { + if (CI->arg_size() != 2) { DefaultCase(); return; } + Value *Ptr = CI->getArgOperand(1); // Try to strip pointer casts, such that the lifetime works on an alloca. Ptr = Ptr->stripPointerCasts(); if (isa<AllocaInst>(Ptr)) { // Don't use NewFn, as we might have looked through an addrspacecast. if (NewFn->getIntrinsicID() == Intrinsic::lifetime_start) - NewCall = Builder.CreateLifetimeStart(Ptr, cast<ConstantInt>(Size)); + NewCall = Builder.CreateLifetimeStart(Ptr); else - NewCall = Builder.CreateLifetimeEnd(Ptr, cast<ConstantInt>(Size)); + NewCall = Builder.CreateLifetimeEnd(Ptr); break; } @@ -5391,7 +5391,7 @@ void llvm::UpgradeNVVMAnnotations(Module &M) { return; SmallVector<MDNode *, 8> NewNodes; - SmallSet<const MDNode *, 8> SeenNodes; + SmallPtrSet<const MDNode *, 8> SeenNodes; for (MDNode *MD : NamedMD->operands()) { if (!SeenNodes.insert(MD).second) continue; diff --git a/llvm/lib/IR/ConstantFold.cpp b/llvm/lib/IR/ConstantFold.cpp index d4ad21e..6b202ba 100644 --- a/llvm/lib/IR/ConstantFold.cpp +++ b/llvm/lib/IR/ConstantFold.cpp @@ -254,6 +254,7 @@ Constant *llvm::ConstantFoldCastInstruction(unsigned opc, Constant *V, return FoldBitCast(V, DestTy); case Instruction::AddrSpaceCast: case Instruction::IntToPtr: + case Instruction::PtrToAddr: case Instruction::PtrToInt: return nullptr; } diff --git a/llvm/lib/IR/ConstantRange.cpp b/llvm/lib/IR/ConstantRange.cpp index e09c139..b454c9a 100644 --- a/llvm/lib/IR/ConstantRange.cpp +++ b/llvm/lib/IR/ConstantRange.cpp @@ -829,6 +829,7 @@ ConstantRange ConstantRange::castOp(Instruction::CastOps CastOp, case Instruction::FPTrunc: case Instruction::FPExt: case Instruction::IntToPtr: + case Instruction::PtrToAddr: case Instruction::PtrToInt: case Instruction::AddrSpaceCast: // Conservatively return getFull set. @@ -871,7 +872,8 @@ ConstantRange ConstantRange::signExtend(uint32_t DstTySize) const { return ConstantRange(Lower.sext(DstTySize), Upper.sext(DstTySize)); } -ConstantRange ConstantRange::truncate(uint32_t DstTySize) const { +ConstantRange ConstantRange::truncate(uint32_t DstTySize, + unsigned NoWrapKind) const { assert(getBitWidth() > DstTySize && "Not a value truncation"); if (isEmptySet()) return getEmpty(DstTySize); @@ -885,22 +887,36 @@ ConstantRange ConstantRange::truncate(uint32_t DstTySize) const { // We use the non-wrapped set code to analyze the [Lower, MaxValue) part, and // then we do the union with [MaxValue, Upper) if (isUpperWrapped()) { - // If Upper is greater than or equal to MaxValue(DstTy), it covers the whole - // truncated range. - if (Upper.getActiveBits() > DstTySize || Upper.countr_one() == DstTySize) + // If Upper is greater than MaxValue(DstTy), it covers the whole truncated + // range. + if (Upper.getActiveBits() > DstTySize) return getFull(DstTySize); - Union = ConstantRange(APInt::getMaxValue(DstTySize),Upper.trunc(DstTySize)); - UpperDiv.setAllBits(); - - // Union covers the MaxValue case, so return if the remaining range is just - // MaxValue(DstTy). 
- if (LowerDiv == UpperDiv) - return Union; + // For nuw the two parts are: [0, Upper) \/ [Lower, MaxValue(DstTy)] + if (NoWrapKind & TruncInst::NoUnsignedWrap) { + Union = ConstantRange(APInt::getZero(DstTySize), Upper.trunc(DstTySize)); + UpperDiv = APInt::getOneBitSet(getBitWidth(), DstTySize); + } else { + // If Upper is equal to MaxValue(DstTy), it covers the whole truncated + // range. + if (Upper.countr_one() == DstTySize) + return getFull(DstTySize); + Union = + ConstantRange(APInt::getMaxValue(DstTySize), Upper.trunc(DstTySize)); + UpperDiv.setAllBits(); + // Union covers the MaxValue case, so return if the remaining range is + // just MaxValue(DstTy). + if (LowerDiv == UpperDiv) + return Union; + } } // Chop off the most significant bits that are past the destination bitwidth. if (LowerDiv.getActiveBits() > DstTySize) { + // For trunc nuw if LowerDiv is greater than MaxValue(DstTy), the range is + // outside the whole truncated range. + if (NoWrapKind & TruncInst::NoUnsignedWrap) + return Union; // Mask to just the signficant bits and subtract from LowerDiv/UpperDiv. APInt Adjust = LowerDiv & APInt::getBitsSetFrom(getBitWidth(), DstTySize); LowerDiv -= Adjust; @@ -912,6 +928,10 @@ ConstantRange ConstantRange::truncate(uint32_t DstTySize) const { return ConstantRange(LowerDiv.trunc(DstTySize), UpperDiv.trunc(DstTySize)).unionWith(Union); + if (!LowerDiv.isZero() && NoWrapKind & TruncInst::NoUnsignedWrap) + return ConstantRange(LowerDiv.trunc(DstTySize), APInt::getZero(DstTySize)) + .unionWith(Union); + // The truncated value wraps around. Check if we can do better than fullset. if (UpperDivWidth == DstTySize + 1) { // Clear the MSB so that UpperDiv wraps around. diff --git a/llvm/lib/IR/Constants.cpp b/llvm/lib/IR/Constants.cpp index a3c725b..c7e3113a 100644 --- a/llvm/lib/IR/Constants.cpp +++ b/llvm/lib/IR/Constants.cpp @@ -1567,6 +1567,7 @@ Constant *ConstantExpr::getWithOperands(ArrayRef<Constant *> Ops, Type *Ty, case Instruction::SIToFP: case Instruction::FPToUI: case Instruction::FPToSI: + case Instruction::PtrToAddr: case Instruction::PtrToInt: case Instruction::IntToPtr: case Instruction::BitCast: @@ -2223,6 +2224,8 @@ Constant *ConstantExpr::getCast(unsigned oc, Constant *C, Type *Ty, llvm_unreachable("Invalid cast opcode"); case Instruction::Trunc: return getTrunc(C, Ty, OnlyIfReduced); + case Instruction::PtrToAddr: + return getPtrToAddr(C, Ty, OnlyIfReduced); case Instruction::PtrToInt: return getPtrToInt(C, Ty, OnlyIfReduced); case Instruction::IntToPtr: @@ -2280,6 +2283,20 @@ Constant *ConstantExpr::getTrunc(Constant *C, Type *Ty, bool OnlyIfReduced) { return getFoldedCast(Instruction::Trunc, C, Ty, OnlyIfReduced); } +Constant *ConstantExpr::getPtrToAddr(Constant *C, Type *DstTy, + bool OnlyIfReduced) { + assert(C->getType()->isPtrOrPtrVectorTy() && + "PtrToAddr source must be pointer or pointer vector"); + assert(DstTy->isIntOrIntVectorTy() && + "PtrToAddr destination must be integer or integer vector"); + assert(isa<VectorType>(C->getType()) == isa<VectorType>(DstTy)); + if (isa<VectorType>(C->getType())) + assert(cast<VectorType>(C->getType())->getElementCount() == + cast<VectorType>(DstTy)->getElementCount() && + "Invalid cast between a different number of vector elements"); + return getFoldedCast(Instruction::PtrToAddr, C, DstTy, OnlyIfReduced); +} + Constant *ConstantExpr::getPtrToInt(Constant *C, Type *DstTy, bool OnlyIfReduced) { assert(C->getType()->isPtrOrPtrVectorTy() && @@ -2435,6 +2452,7 @@ bool ConstantExpr::isDesirableCastOp(unsigned Opcode) { case 
Instruction::FPToSI: return false; case Instruction::Trunc: + case Instruction::PtrToAddr: case Instruction::PtrToInt: case Instruction::IntToPtr: case Instruction::BitCast: @@ -2457,6 +2475,7 @@ bool ConstantExpr::isSupportedCastOp(unsigned Opcode) { case Instruction::FPToSI: return false; case Instruction::Trunc: + case Instruction::PtrToAddr: case Instruction::PtrToInt: case Instruction::IntToPtr: case Instruction::BitCast: @@ -3401,6 +3420,7 @@ Instruction *ConstantExpr::getAsInstruction() const { switch (getOpcode()) { case Instruction::Trunc: + case Instruction::PtrToAddr: case Instruction::PtrToInt: case Instruction::IntToPtr: case Instruction::BitCast: diff --git a/llvm/lib/IR/Core.cpp b/llvm/lib/IR/Core.cpp index f7ef4aa..8b5965b 100644 --- a/llvm/lib/IR/Core.cpp +++ b/llvm/lib/IR/Core.cpp @@ -2186,6 +2186,11 @@ void LLVMGlobalSetMetadata(LLVMValueRef Global, unsigned Kind, unwrap<GlobalObject>(Global)->setMetadata(Kind, unwrap<MDNode>(MD)); } +void LLVMGlobalAddMetadata(LLVMValueRef Global, unsigned Kind, + LLVMMetadataRef MD) { + unwrap<GlobalObject>(Global)->addMetadata(Kind, *unwrap<MDNode>(MD)); +} + void LLVMGlobalEraseMetadata(LLVMValueRef Global, unsigned Kind) { unwrap<GlobalObject>(Global)->eraseMetadata(Kind); } @@ -2194,6 +2199,11 @@ void LLVMGlobalClearMetadata(LLVMValueRef Global) { unwrap<GlobalObject>(Global)->clearMetadata(); } +void LLVMGlobalAddDebugInfo(LLVMValueRef Global, LLVMMetadataRef GVE) { + unwrap<GlobalVariable>(Global)->addDebugInfo( + unwrap<DIGlobalVariableExpression>(GVE)); +} + /*--.. Operations on global variables ......................................--*/ LLVMValueRef LLVMAddGlobal(LLVMModuleRef M, LLVMTypeRef Ty, const char *Name) { diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp index ab8ecee..8e523bc 100644 --- a/llvm/lib/IR/DebugInfo.cpp +++ b/llvm/lib/IR/DebugInfo.cpp @@ -1896,29 +1896,8 @@ AssignmentInstRange at::getAssignmentInsts(DIAssignID *ID) { return make_range(MapIt->second.begin(), MapIt->second.end()); } -AssignmentMarkerRange at::getAssignmentMarkers(DIAssignID *ID) { - assert(ID && "Expected non-null ID"); - LLVMContext &Ctx = ID->getContext(); - - auto *IDAsValue = MetadataAsValue::getIfExists(Ctx, ID); - - // The ID is only used wrapped in MetadataAsValue(ID), so lets check that - // one of those already exists first. 
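Aside: a small usage sketch for the two C-API entry points added in Core.cpp above (LLVMGlobalAddMetadata and LLVMGlobalAddDebugInfo). The function below is my own example, and the header placement of the declarations is an assumption.

#include "llvm-c/Core.h"
#include "llvm-c/DebugInfo.h"

// Append a metadata attachment instead of replacing existing ones (which is
// what LLVMGlobalSetMetadata does), then register a debug-info expression on
// the variable. Global must wrap a GlobalVariable and GVE must wrap a
// DIGlobalVariableExpression, per the unwrap<> casts in the hunk above.
static void attachGlobalDebugInfo(LLVMValueRef Global, unsigned KindID,
                                  LLVMMetadataRef MD, LLVMMetadataRef GVE) {
  LLVMGlobalAddMetadata(Global, KindID, MD);
  LLVMGlobalAddDebugInfo(Global, GVE);
}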
- if (!IDAsValue) - return make_range(Value::user_iterator(), Value::user_iterator()); - - return make_range(IDAsValue->user_begin(), IDAsValue->user_end()); -} - void at::deleteAssignmentMarkers(const Instruction *Inst) { - auto Range = getAssignmentMarkers(Inst); - SmallVector<DbgVariableRecord *> DVRAssigns = getDVRAssignmentMarkers(Inst); - if (Range.empty() && DVRAssigns.empty()) - return; - SmallVector<DbgAssignIntrinsic *> ToDelete(Range.begin(), Range.end()); - for (auto *DAI : ToDelete) - DAI->eraseFromParent(); - for (auto *DVR : DVRAssigns) + for (auto *DVR : getDVRAssignmentMarkers(Inst)) DVR->eraseFromParent(); } @@ -1936,31 +1915,21 @@ void at::RAUW(DIAssignID *Old, DIAssignID *New) { } void at::deleteAll(Function *F) { - SmallVector<DbgAssignIntrinsic *, 12> ToDelete; - SmallVector<DbgVariableRecord *, 12> DPToDelete; for (BasicBlock &BB : *F) { for (Instruction &I : BB) { - for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange())) + for (DbgVariableRecord &DVR : + make_early_inc_range(filterDbgVars(I.getDbgRecordRange()))) if (DVR.isDbgAssign()) - DPToDelete.push_back(&DVR); - if (auto *DAI = dyn_cast<DbgAssignIntrinsic>(&I)) - ToDelete.push_back(DAI); - else - I.setMetadata(LLVMContext::MD_DIAssignID, nullptr); + DVR.eraseFromParent(); + + I.setMetadata(LLVMContext::MD_DIAssignID, nullptr); } } - for (auto *DAI : ToDelete) - DAI->eraseFromParent(); - for (auto *DVR : DPToDelete) - DVR->eraseFromParent(); } -/// FIXME: Remove this wrapper function and call -/// DIExpression::calculateFragmentIntersect directly. -template <typename T> -bool calculateFragmentIntersectImpl( +bool at::calculateFragmentIntersect( const DataLayout &DL, const Value *Dest, uint64_t SliceOffsetInBits, - uint64_t SliceSizeInBits, const T *AssignRecord, + uint64_t SliceSizeInBits, const DbgVariableRecord *AssignRecord, std::optional<DIExpression::FragmentInfo> &Result) { // No overlap if this DbgRecord describes a killed location. if (AssignRecord->isKillAddress()) @@ -1989,26 +1958,6 @@ bool calculateFragmentIntersectImpl( BitExtractOffsetInBits, VarFrag, Result, OffsetFromLocationInBits); } -/// FIXME: Remove this wrapper function and call -/// DIExpression::calculateFragmentIntersect directly. -bool at::calculateFragmentIntersect( - const DataLayout &DL, const Value *Dest, uint64_t SliceOffsetInBits, - uint64_t SliceSizeInBits, const DbgAssignIntrinsic *DbgAssign, - std::optional<DIExpression::FragmentInfo> &Result) { - return calculateFragmentIntersectImpl(DL, Dest, SliceOffsetInBits, - SliceSizeInBits, DbgAssign, Result); -} - -/// FIXME: Remove this wrapper function and call -/// DIExpression::calculateFragmentIntersect directly. -bool at::calculateFragmentIntersect( - const DataLayout &DL, const Value *Dest, uint64_t SliceOffsetInBits, - uint64_t SliceSizeInBits, const DbgVariableRecord *DVRAssign, - std::optional<DIExpression::FragmentInfo> &Result) { - return calculateFragmentIntersectImpl(DL, Dest, SliceOffsetInBits, - SliceSizeInBits, DVRAssign, Result); -} - /// Update inlined instructions' DIAssignID metadata. We need to do this /// otherwise a function inlined more than once into the same function /// will cause DIAssignID to be shared by many instructions. 
@@ -2029,8 +1978,6 @@ void at::remapAssignID(DenseMap<DIAssignID *, DIAssignID *> &Map, } if (auto *ID = I.getMetadata(LLVMContext::MD_DIAssignID)) I.setMetadata(LLVMContext::MD_DIAssignID, GetNewID(ID)); - else if (auto *DAI = dyn_cast<DbgAssignIntrinsic>(&I)) - DAI->setAssignId(GetNewID(DAI->getAssignID())); } /// Collect constant properies (base, size, offset) of \p StoreDest. diff --git a/llvm/lib/IR/DebugInfoMetadata.cpp b/llvm/lib/IR/DebugInfoMetadata.cpp index f1d4549..96065ed 100644 --- a/llvm/lib/IR/DebugInfoMetadata.cpp +++ b/llvm/lib/IR/DebugInfoMetadata.cpp @@ -57,15 +57,9 @@ DebugVariable::DebugVariable(const DbgVariableRecord *DVR) DILocation::DILocation(LLVMContext &C, StorageType Storage, unsigned Line, unsigned Column, uint64_t AtomGroup, uint8_t AtomRank, ArrayRef<Metadata *> MDs, bool ImplicitCode) - : MDNode(C, DILocationKind, Storage, MDs) -#ifdef EXPERIMENTAL_KEY_INSTRUCTIONS - , - AtomGroup(AtomGroup), AtomRank(AtomRank) -#endif -{ -#ifdef EXPERIMENTAL_KEY_INSTRUCTIONS + : MDNode(C, DILocationKind, Storage, MDs), AtomGroup(AtomGroup), + AtomRank(AtomRank) { assert(AtomRank <= 7 && "AtomRank number should fit in 3 bits"); -#endif if (AtomGroup) C.updateDILocationAtomGroupWaterline(AtomGroup + 1); diff --git a/llvm/lib/IR/Globals.cpp b/llvm/lib/IR/Globals.cpp index 7b799c7..11d33e2 100644 --- a/llvm/lib/IR/Globals.cpp +++ b/llvm/lib/IR/Globals.cpp @@ -404,6 +404,7 @@ findBaseObject(const Constant *C, DenseSet<const GlobalAlias *> &Aliases, return findBaseObject(CE->getOperand(0), Aliases, Op); } case Instruction::IntToPtr: + case Instruction::PtrToAddr: case Instruction::PtrToInt: case Instruction::BitCast: case Instruction::GetElementPtr: diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp index 49c6dc7..614c3a9 100644 --- a/llvm/lib/IR/IRBuilder.cpp +++ b/llvm/lib/IR/IRBuilder.cpp @@ -411,28 +411,16 @@ CallInst *IRBuilderBase::CreateFPMinimumReduce(Value *Src) { return getReductionIntrinsic(Intrinsic::vector_reduce_fminimum, Src); } -CallInst *IRBuilderBase::CreateLifetimeStart(Value *Ptr, ConstantInt *Size) { +CallInst *IRBuilderBase::CreateLifetimeStart(Value *Ptr) { assert(isa<PointerType>(Ptr->getType()) && "lifetime.start only applies to pointers."); - if (!Size) - Size = getInt64(-1); - else - assert(Size->getType() == getInt64Ty() && - "lifetime.start requires the size to be an i64"); - Value *Ops[] = { Size, Ptr }; - return CreateIntrinsic(Intrinsic::lifetime_start, {Ptr->getType()}, Ops); + return CreateIntrinsic(Intrinsic::lifetime_start, {Ptr->getType()}, {Ptr}); } -CallInst *IRBuilderBase::CreateLifetimeEnd(Value *Ptr, ConstantInt *Size) { +CallInst *IRBuilderBase::CreateLifetimeEnd(Value *Ptr) { assert(isa<PointerType>(Ptr->getType()) && "lifetime.end only applies to pointers."); - if (!Size) - Size = getInt64(-1); - else - assert(Size->getType() == getInt64Ty() && - "lifetime.end requires the size to be an i64"); - Value *Ops[] = { Size, Ptr }; - return CreateIntrinsic(Intrinsic::lifetime_end, {Ptr->getType()}, Ops); + return CreateIntrinsic(Intrinsic::lifetime_end, {Ptr->getType()}, {Ptr}); } CallInst *IRBuilderBase::CreateInvariantStart(Value *Ptr, ConstantInt *Size) { diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp index b7cd12a..5e87b5f 100644 --- a/llvm/lib/IR/Instruction.cpp +++ b/llvm/lib/IR/Instruction.cpp @@ -26,9 +26,18 @@ #include "llvm/IR/Operator.h" #include "llvm/IR/ProfDataUtils.h" #include "llvm/IR/Type.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" using namespace llvm; 
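Aside: the IRBuilder change above removes the explicit size operand from lifetime markers. A minimal usage sketch of the single-argument form (helper name invented for illustration):

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Lifetime intrinsics now take only the alloca pointer; the object size is
// implied by the alloca itself rather than passed as an i64.
static void emitScopedLifetime(IRBuilderBase &B, AllocaInst *Slot) {
  B.CreateLifetimeStart(Slot);
  // ... emit the code that uses Slot here ...
  B.CreateLifetimeEnd(Slot);
}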
+// FIXME: Flag used for an ablation performance test, Issue #147390. Placing it +// here because referencing IR should be feasible from anywhere. Will be +// removed after the ablation test. +cl::opt<bool> ProfcheckDisableMetadataFixes( + "profcheck-disable-metadata-fixes", cl::Hidden, cl::init(false), + cl::desc( + "Disable metadata propagation fixes discovered through Issue #147390")); + InsertPosition::InsertPosition(Instruction *InsertBefore) : InsertAt(InsertBefore ? InsertBefore->getIterator() : InstListType::iterator()) {} @@ -543,14 +552,19 @@ void Instruction::dropUBImplyingAttrsAndUnknownMetadata( CB->removeRetAttrs(UBImplyingAttributes); } -void Instruction::dropUBImplyingAttrsAndMetadata() { +void Instruction::dropUBImplyingAttrsAndMetadata(ArrayRef<unsigned> Keep) { // !annotation metadata does not impact semantics. // !range, !nonnull and !align produce poison, so they are safe to speculate. // !noundef and various AA metadata must be dropped, as it generally produces // immediate undefined behavior. - unsigned KnownIDs[] = {LLVMContext::MD_annotation, LLVMContext::MD_range, - LLVMContext::MD_nonnull, LLVMContext::MD_align}; - dropUBImplyingAttrsAndUnknownMetadata(KnownIDs); + static const unsigned KnownIDs[] = { + LLVMContext::MD_annotation, LLVMContext::MD_range, + LLVMContext::MD_nonnull, LLVMContext::MD_align}; + SmallVector<unsigned> KeepIDs; + KeepIDs.reserve(Keep.size() + std::size(KnownIDs)); + append_range(KeepIDs, KnownIDs); + append_range(KeepIDs, Keep); + dropUBImplyingAttrsAndUnknownMetadata(KeepIDs); } bool Instruction::hasUBImplyingAttrs() const { @@ -817,6 +831,7 @@ const char *Instruction::getOpcodeName(unsigned OpCode) { case UIToFP: return "uitofp"; case SIToFP: return "sitofp"; case IntToPtr: return "inttoptr"; + case PtrToAddr: return "ptrtoaddr"; case PtrToInt: return "ptrtoint"; case BitCast: return "bitcast"; case AddrSpaceCast: return "addrspacecast"; diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp index b896382..a1751c0 100644 --- a/llvm/lib/IR/Instructions.cpp +++ b/llvm/lib/IR/Instructions.cpp @@ -2798,6 +2798,7 @@ bool CastInst::isNoopCast(Instruction::CastOps Opcode, return false; case Instruction::BitCast: return true; // BitCast never modifies bits. + case Instruction::PtrToAddr: case Instruction::PtrToInt: return DL.getIntPtrType(SrcTy)->getScalarSizeInBits() == DestTy->getScalarSizeInBits(); @@ -2855,26 +2856,29 @@ unsigned CastInst::isEliminableCastPair( // same reason. 
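Aside: a hedged sketch of the new Keep parameter on dropUBImplyingAttrsAndMetadata shown above. Preserving !prof here is purely my example of what a caller might keep; the patch itself only adds the parameter.

#include "llvm/IR/Instruction.h"
#include "llvm/IR/LLVMContext.h"
using namespace llvm;

// Drop attributes and metadata that could introduce UB once the instruction is
// speculated, but keep its branch-weight profile data attached.
static void prepareForSpeculation(Instruction &I) {
  static const unsigned KeepIDs[] = {LLVMContext::MD_prof};
  I.dropUBImplyingAttrsAndMetadata(KeepIDs);
}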
const unsigned numCastOps = Instruction::CastOpsEnd - Instruction::CastOpsBegin; + // clang-format off static const uint8_t CastResults[numCastOps][numCastOps] = { - // T F F U S F F P I B A -+ - // R Z S P P I I T P 2 N T S | - // U E E 2 2 2 2 R E I T C C +- secondOp - // N X X U S F F N X N 2 V V | - // C T T I I P P C T T P T T -+ - { 1, 0, 0,99,99, 0, 0,99,99,99, 0, 3, 0}, // Trunc -+ - { 8, 1, 9,99,99, 2,17,99,99,99, 2, 3, 0}, // ZExt | - { 8, 0, 1,99,99, 0, 2,99,99,99, 0, 3, 0}, // SExt | - { 0, 0, 0,99,99, 0, 0,99,99,99, 0, 3, 0}, // FPToUI | - { 0, 0, 0,99,99, 0, 0,99,99,99, 0, 3, 0}, // FPToSI | - { 99,99,99, 0, 0,99,99, 0, 0,99,99, 4, 0}, // UIToFP +- firstOp - { 99,99,99, 0, 0,99,99, 0, 0,99,99, 4, 0}, // SIToFP | - { 99,99,99, 0, 0,99,99, 0, 0,99,99, 4, 0}, // FPTrunc | - { 99,99,99, 2, 2,99,99, 8, 2,99,99, 4, 0}, // FPExt | - { 1, 0, 0,99,99, 0, 0,99,99,99, 7, 3, 0}, // PtrToInt | - { 99,99,99,99,99,99,99,99,99,11,99,15, 0}, // IntToPtr | - { 5, 5, 5, 0, 0, 5, 5, 0, 0,16, 5, 1,14}, // BitCast | - { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,13,12}, // AddrSpaceCast -+ + // T F F U S F F P P I B A -+ + // R Z S P P I I T P 2 2 N T S | + // U E E 2 2 2 2 R E I A T C C +- secondOp + // N X X U S F F N X N D 2 V V | + // C T T I I P P C T T R P T T -+ + { 1, 0, 0,99,99, 0, 0,99,99,99,99, 0, 3, 0}, // Trunc -+ + { 8, 1, 9,99,99, 2,17,99,99,99,99, 2, 3, 0}, // ZExt | + { 8, 0, 1,99,99, 0, 2,99,99,99,99, 0, 3, 0}, // SExt | + { 0, 0, 0,99,99, 0, 0,99,99,99,99, 0, 3, 0}, // FPToUI | + { 0, 0, 0,99,99, 0, 0,99,99,99,99, 0, 3, 0}, // FPToSI | + { 99,99,99, 0, 0,99,99, 0, 0,99,99,99, 4, 0}, // UIToFP +- firstOp + { 99,99,99, 0, 0,99,99, 0, 0,99,99,99, 4, 0}, // SIToFP | + { 99,99,99, 0, 0,99,99, 0, 0,99,99,99, 4, 0}, // FPTrunc | + { 99,99,99, 2, 2,99,99, 8, 2,99,99,99, 4, 0}, // FPExt | + { 1, 0, 0,99,99, 0, 0,99,99,99,99, 7, 3, 0}, // PtrToInt | + { 1, 0, 0,99,99, 0, 0,99,99,99,99, 0, 3, 0}, // PtrToAddr | + { 99,99,99,99,99,99,99,99,99,11,99,99,15, 0}, // IntToPtr | + { 5, 5, 5, 0, 0, 5, 5, 0, 0,16,16, 5, 1,14}, // BitCast | + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,13,12}, // AddrSpaceCast -+ }; + // clang-format on // TODO: This logic could be encoded into the table above and handled in the // switch below. 
@@ -3046,6 +3050,7 @@ CastInst *CastInst::Create(Instruction::CastOps op, Value *S, Type *Ty, case SIToFP: return new SIToFPInst (S, Ty, Name, InsertBefore); case FPToUI: return new FPToUIInst (S, Ty, Name, InsertBefore); case FPToSI: return new FPToSIInst (S, Ty, Name, InsertBefore); + case PtrToAddr: return new PtrToAddrInst (S, Ty, Name, InsertBefore); case PtrToInt: return new PtrToIntInst (S, Ty, Name, InsertBefore); case IntToPtr: return new IntToPtrInst (S, Ty, Name, InsertBefore); case BitCast: @@ -3347,6 +3352,7 @@ CastInst::castIsValid(Instruction::CastOps op, Type *SrcTy, Type *DstTy) { case Instruction::FPToSI: return SrcTy->isFPOrFPVectorTy() && DstTy->isIntOrIntVectorTy() && SrcEC == DstEC; + case Instruction::PtrToAddr: case Instruction::PtrToInt: if (SrcEC != DstEC) return false; @@ -3460,6 +3466,12 @@ PtrToIntInst::PtrToIntInst(Value *S, Type *Ty, const Twine &Name, assert(castIsValid(getOpcode(), S, Ty) && "Illegal PtrToInt"); } +PtrToAddrInst::PtrToAddrInst(Value *S, Type *Ty, const Twine &Name, + InsertPosition InsertBefore) + : CastInst(Ty, PtrToAddr, S, Name, InsertBefore) { + assert(castIsValid(getOpcode(), S, Ty) && "Illegal PtrToAddr"); +} + IntToPtrInst::IntToPtrInst(Value *S, Type *Ty, const Twine &Name, InsertPosition InsertBefore) : CastInst(Ty, IntToPtr, S, Name, InsertBefore) { @@ -4427,6 +4439,10 @@ PtrToIntInst *PtrToIntInst::cloneImpl() const { return new PtrToIntInst(getOperand(0), getType()); } +PtrToAddrInst *PtrToAddrInst::cloneImpl() const { + return new PtrToAddrInst(getOperand(0), getType()); +} + IntToPtrInst *IntToPtrInst::cloneImpl() const { return new IntToPtrInst(getOperand(0), getType()); } diff --git a/llvm/lib/IR/LLVMContextImpl.h b/llvm/lib/IR/LLVMContextImpl.h index aa2a60e..e03f993 100644 --- a/llvm/lib/IR/LLVMContextImpl.h +++ b/llvm/lib/IR/LLVMContextImpl.h @@ -312,10 +312,8 @@ template <> struct MDNodeKeyImpl<MDTuple> : MDNodeOpsKey { template <> struct MDNodeKeyImpl<DILocation> { Metadata *Scope; Metadata *InlinedAt; -#ifdef EXPERIMENTAL_KEY_INSTRUCTIONS uint64_t AtomGroup : 61; uint64_t AtomRank : 3; -#endif unsigned Line; uint16_t Column; bool ImplicitCode; @@ -323,36 +321,24 @@ template <> struct MDNodeKeyImpl<DILocation> { MDNodeKeyImpl(unsigned Line, uint16_t Column, Metadata *Scope, Metadata *InlinedAt, bool ImplicitCode, uint64_t AtomGroup, uint8_t AtomRank) - : Scope(Scope), InlinedAt(InlinedAt), -#ifdef EXPERIMENTAL_KEY_INSTRUCTIONS - AtomGroup(AtomGroup), AtomRank(AtomRank), -#endif - Line(Line), Column(Column), ImplicitCode(ImplicitCode) { - } + : Scope(Scope), InlinedAt(InlinedAt), AtomGroup(AtomGroup), + AtomRank(AtomRank), Line(Line), Column(Column), + ImplicitCode(ImplicitCode) {} MDNodeKeyImpl(const DILocation *L) : Scope(L->getRawScope()), InlinedAt(L->getRawInlinedAt()), -#ifdef EXPERIMENTAL_KEY_INSTRUCTIONS AtomGroup(L->getAtomGroup()), AtomRank(L->getAtomRank()), -#endif Line(L->getLine()), Column(L->getColumn()), - ImplicitCode(L->isImplicitCode()) { - } + ImplicitCode(L->isImplicitCode()) {} bool isKeyOf(const DILocation *RHS) const { return Line == RHS->getLine() && Column == RHS->getColumn() && Scope == RHS->getRawScope() && InlinedAt == RHS->getRawInlinedAt() && - ImplicitCode == RHS->isImplicitCode() -#ifdef EXPERIMENTAL_KEY_INSTRUCTIONS - && AtomGroup == RHS->getAtomGroup() && - AtomRank == RHS->getAtomRank(); -#else - ; -#endif + ImplicitCode == RHS->isImplicitCode() && + AtomGroup == RHS->getAtomGroup() && AtomRank == RHS->getAtomRank(); } unsigned getHashValue() const { -#ifdef 
EXPERIMENTAL_KEY_INSTRUCTIONS // Hashing AtomGroup and AtomRank substantially impacts performance whether // Key Instructions is enabled or not. We can't detect whether it's enabled // here cheaply; avoiding hashing zero values is a good approximation. This @@ -363,7 +349,6 @@ template <> struct MDNodeKeyImpl<DILocation> { if (AtomGroup || AtomRank) return hash_combine(Line, Column, Scope, InlinedAt, ImplicitCode, AtomGroup, (uint8_t)AtomRank); -#endif return hash_combine(Line, Column, Scope, InlinedAt, ImplicitCode); } }; diff --git a/llvm/lib/IR/ProfDataUtils.cpp b/llvm/lib/IR/ProfDataUtils.cpp index b1b5f67..d24263f 100644 --- a/llvm/lib/IR/ProfDataUtils.cpp +++ b/llvm/lib/IR/ProfDataUtils.cpp @@ -270,6 +270,18 @@ void setBranchWeights(Instruction &I, ArrayRef<uint32_t> Weights, I.setMetadata(LLVMContext::MD_prof, BranchWeights); } +SmallVector<uint32_t> downscaleWeights(ArrayRef<uint64_t> Weights, + std::optional<uint64_t> KnownMaxCount) { + uint64_t MaxCount = KnownMaxCount.has_value() ? KnownMaxCount.value() + : *llvm::max_element(Weights); + assert(MaxCount > 0 && "Bad max count"); + uint64_t Scale = calculateCountScale(MaxCount); + SmallVector<uint32_t> DownscaledWeights; + for (const auto &ECI : Weights) + DownscaledWeights.push_back(scaleBranchCount(ECI, Scale)); + return DownscaledWeights; +} + void scaleProfData(Instruction &I, uint64_t S, uint64_t T) { assert(T != 0 && "Caller should guarantee"); auto *ProfileData = I.getMetadata(LLVMContext::MD_prof); diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp index a8e6c79..3c324f2 100644 --- a/llvm/lib/IR/RuntimeLibcalls.cpp +++ b/llvm/lib/IR/RuntimeLibcalls.cpp @@ -9,6 +9,8 @@ #include "llvm/IR/RuntimeLibcalls.h" #include "llvm/ADT/StringTable.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/xxhash.h" +#include "llvm/TargetParser/ARMTargetParser.h" #define DEBUG_TYPE "runtime-libcalls-info" @@ -17,51 +19,11 @@ using namespace RTLIB; #define GET_INIT_RUNTIME_LIBCALL_NAMES #define GET_SET_TARGET_RUNTIME_LIBCALL_SETS +#define DEFINE_GET_LOOKUP_LIBCALL_IMPL_NAME #include "llvm/IR/RuntimeLibcalls.inc" #undef GET_INIT_RUNTIME_LIBCALL_NAMES #undef GET_SET_TARGET_RUNTIME_LIBCALL_SETS - -static void setARMLibcallNames(RuntimeLibcallsInfo &Info, const Triple &TT, - FloatABI::ABIType FloatABIType, - EABI EABIVersion) { - static const RTLIB::LibcallImpl AAPCS_Libcalls[] = { - RTLIB::__aeabi_dadd, RTLIB::__aeabi_ddiv, - RTLIB::__aeabi_dmul, RTLIB::__aeabi_dsub, - RTLIB::__aeabi_dcmpeq__oeq, RTLIB::__aeabi_dcmpeq__une, - RTLIB::__aeabi_dcmplt, RTLIB::__aeabi_dcmple, - RTLIB::__aeabi_dcmpge, RTLIB::__aeabi_dcmpgt, - RTLIB::__aeabi_dcmpun, RTLIB::__aeabi_fadd, - RTLIB::__aeabi_fdiv, RTLIB::__aeabi_fmul, - RTLIB::__aeabi_fsub, RTLIB::__aeabi_fcmpeq__oeq, - RTLIB::__aeabi_fcmpeq__une, RTLIB::__aeabi_fcmplt, - RTLIB::__aeabi_fcmple, RTLIB::__aeabi_fcmpge, - RTLIB::__aeabi_fcmpgt, RTLIB::__aeabi_fcmpun, - RTLIB::__aeabi_d2iz, RTLIB::__aeabi_d2uiz, - RTLIB::__aeabi_d2lz, RTLIB::__aeabi_d2ulz, - RTLIB::__aeabi_f2iz, RTLIB::__aeabi_f2uiz, - RTLIB::__aeabi_f2lz, RTLIB::__aeabi_f2ulz, - RTLIB::__aeabi_d2f, RTLIB::__aeabi_d2h, - RTLIB::__aeabi_f2d, RTLIB::__aeabi_i2d, - RTLIB::__aeabi_ui2d, RTLIB::__aeabi_l2d, - RTLIB::__aeabi_ul2d, RTLIB::__aeabi_i2f, - RTLIB::__aeabi_ui2f, RTLIB::__aeabi_l2f, - RTLIB::__aeabi_ul2f, RTLIB::__aeabi_lmul, - RTLIB::__aeabi_llsl, RTLIB::__aeabi_llsr, - RTLIB::__aeabi_lasr, RTLIB::__aeabi_idiv, - RTLIB::__aeabi_idivmod, RTLIB::__aeabi_uidivmod, - RTLIB::__aeabi_ldivmod, 
RTLIB::__aeabi_uidiv, - RTLIB::__aeabi_uldivmod, RTLIB::__aeabi_f2h, - RTLIB::__aeabi_d2h, RTLIB::__aeabi_h2f, - RTLIB::__aeabi_memcpy, RTLIB::__aeabi_memmove, - RTLIB::__aeabi_memset, RTLIB::__aeabi_memcpy4, - RTLIB::__aeabi_memcpy8, RTLIB::__aeabi_memmove4, - RTLIB::__aeabi_memmove8, RTLIB::__aeabi_memset4, - RTLIB::__aeabi_memset8, RTLIB::__aeabi_memclr, - RTLIB::__aeabi_memclr4, RTLIB::__aeabi_memclr8}; - - for (RTLIB::LibcallImpl Impl : AAPCS_Libcalls) - Info.setLibcallImplCallingConv(Impl, CallingConv::ARM_AAPCS); -} +#undef DEFINE_GET_LOOKUP_LIBCALL_IMPL_NAME /// Set default libcall names. If a target wants to opt-out of a libcall it /// should be placed here. @@ -69,65 +31,51 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT, ExceptionHandling ExceptionModel, FloatABI::ABIType FloatABI, EABI EABIVersion, StringRef ABIName) { - setTargetRuntimeLibcallSets(TT, FloatABI); - - if (ExceptionModel == ExceptionHandling::SjLj) - setLibcallImpl(RTLIB::UNWIND_RESUME, RTLIB::_Unwind_SjLj_Resume); + setTargetRuntimeLibcallSets(TT, ExceptionModel, FloatABI, EABIVersion, + ABIName); if (TT.isARM() || TT.isThumb()) { - setARMLibcallNames(*this, TT, FloatABI, EABIVersion); - return; - } + // The half <-> float conversion functions are always soft-float on + // non-watchos platforms, but are needed for some targets which use a + // hard-float calling convention by default. + if (!TT.isWatchABI()) { + if (isAAPCS_ABI(TT, ABIName)) { + setLibcallImplCallingConv(RTLIB::__truncsfhf2, CallingConv::ARM_AAPCS); + setLibcallImplCallingConv(RTLIB::__truncdfhf2, CallingConv::ARM_AAPCS); + setLibcallImplCallingConv(RTLIB::__extendhfsf2, CallingConv::ARM_AAPCS); + } else { + setLibcallImplCallingConv(RTLIB::__truncsfhf2, CallingConv::ARM_APCS); + setLibcallImplCallingConv(RTLIB::__truncdfhf2, CallingConv::ARM_APCS); + setLibcallImplCallingConv(RTLIB::__extendhfsf2, CallingConv::ARM_APCS); + } + } - if (TT.getArch() == Triple::ArchType::msp430) { - setLibcallImplCallingConv(RTLIB::__mspabi_mpyll, - CallingConv::MSP430_BUILTIN); + return; } } -RTLIB::LibcallImpl -RuntimeLibcallsInfo::getSupportedLibcallImpl(StringRef FuncName) const { - const ArrayRef<uint16_t> RuntimeLibcallNameOffsets( - RuntimeLibcallNameOffsetTable); - - iterator_range<ArrayRef<uint16_t>::const_iterator> Range = - getRecognizedLibcallImpls(FuncName); - - for (auto I = Range.begin(); I != Range.end(); ++I) { - RTLIB::LibcallImpl Impl = - static_cast<RTLIB::LibcallImpl>(I - RuntimeLibcallNameOffsets.begin()); - - // FIXME: This should not depend on looking up ImplToLibcall, only the list - // of libcalls for the module. 
- RTLIB::LibcallImpl Recognized = LibcallImpls[ImplToLibcall[Impl]]; - if (Recognized != RTLIB::Unsupported) - return Recognized; +LLVM_ATTRIBUTE_ALWAYS_INLINE +iota_range<RTLIB::LibcallImpl> +RuntimeLibcallsInfo::libcallImplNameHit(uint16_t NameOffsetEntry, + uint16_t StrOffset) { + int NumAliases = 1; + for (uint16_t Entry : ArrayRef(RuntimeLibcallNameOffsetTable) + .drop_front(NameOffsetEntry + 1)) { + if (Entry != StrOffset) + break; + ++NumAliases; } - return RTLIB::Unsupported; + RTLIB::LibcallImpl ImplStart = static_cast<RTLIB::LibcallImpl>( + &RuntimeLibcallNameOffsetTable[NameOffsetEntry] - + &RuntimeLibcallNameOffsetTable[0]); + return enum_seq(ImplStart, + static_cast<RTLIB::LibcallImpl>(ImplStart + NumAliases)); } -iterator_range<ArrayRef<uint16_t>::const_iterator> -RuntimeLibcallsInfo::getRecognizedLibcallImpls(StringRef FuncName) { - StringTable::Iterator It = lower_bound(RuntimeLibcallImplNameTable, FuncName); - if (It == RuntimeLibcallImplNameTable.end() || *It != FuncName) - return iterator_range(ArrayRef<uint16_t>()); - - uint16_t IndexVal = It.offset().value(); - const ArrayRef<uint16_t> TableRef(RuntimeLibcallNameOffsetTable); - - ArrayRef<uint16_t>::const_iterator E = TableRef.end(); - ArrayRef<uint16_t>::const_iterator EntriesBegin = - std::lower_bound(TableRef.begin(), E, IndexVal); - ArrayRef<uint16_t>::const_iterator EntriesEnd = EntriesBegin; - - while (EntriesEnd != E && *EntriesEnd == IndexVal) - ++EntriesEnd; - - assert(EntriesBegin != E && - "libcall found in name table but not offset table"); - - return make_range(EntriesBegin, EntriesEnd); +bool RuntimeLibcallsInfo::isAAPCS_ABI(const Triple &TT, StringRef ABIName) { + const ARM::ARMABI TargetABI = ARM::computeTargetABI(TT, ABIName); + return TargetABI == ARM::ARM_ABI_AAPCS || TargetABI == ARM::ARM_ABI_AAPCS16; } bool RuntimeLibcallsInfo::darwinHasExp10(const Triple &TT) { diff --git a/llvm/lib/IR/Value.cpp b/llvm/lib/IR/Value.cpp index 129ca4a..5928c89 100644 --- a/llvm/lib/IR/Value.cpp +++ b/llvm/lib/IR/Value.cpp @@ -747,34 +747,28 @@ const Value *Value::stripAndAccumulateConstantOffsets( // means when we construct GEPOffset, we need to use the size // of GEP's pointer type rather than the size of the original // pointer type. - unsigned CurBitWidth = DL.getIndexTypeSizeInBits(V->getType()); - if (CurBitWidth == BitWidth) { - if (!GEP->accumulateConstantOffset(DL, Offset, ExternalAnalysis)) - return V; - } else { - APInt GEPOffset(CurBitWidth, 0); - if (!GEP->accumulateConstantOffset(DL, GEPOffset, ExternalAnalysis)) - return V; + APInt GEPOffset(DL.getIndexTypeSizeInBits(V->getType()), 0); + if (!GEP->accumulateConstantOffset(DL, GEPOffset, ExternalAnalysis)) + return V; - // Stop traversal if the pointer offset wouldn't fit in the bit-width - // provided by the Offset argument. This can happen due to AddrSpaceCast - // stripping. - if (GEPOffset.getSignificantBits() > BitWidth) - return V; + // Stop traversal if the pointer offset wouldn't fit in the bit-width + // provided by the Offset argument. This can happen due to AddrSpaceCast + // stripping. + if (GEPOffset.getSignificantBits() > BitWidth) + return V; - // External Analysis can return a result higher/lower than the value - // represents. We need to detect overflow/underflow. 
- APInt GEPOffsetST = GEPOffset.sextOrTrunc(BitWidth); - if (!ExternalAnalysis) { - Offset += GEPOffsetST; - } else { - bool Overflow = false; - APInt OldOffset = Offset; - Offset = Offset.sadd_ov(GEPOffsetST, Overflow); - if (Overflow) { - Offset = OldOffset; - return V; - } + // External Analysis can return a result higher/lower than the value + // represents. We need to detect overflow/underflow. + APInt GEPOffsetST = GEPOffset.sextOrTrunc(BitWidth); + if (!ExternalAnalysis) { + Offset += GEPOffsetST; + } else { + bool Overflow = false; + APInt OldOffset = Offset; + Offset = Offset.sadd_ov(GEPOffsetST, Overflow); + if (Overflow) { + Offset = OldOffset; + return V; } } V = GEP->getPointerOperand(); diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index ca3f148..9d9b51d 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -566,6 +566,8 @@ private: void visitUIToFPInst(UIToFPInst &I); void visitSIToFPInst(SIToFPInst &I); void visitIntToPtrInst(IntToPtrInst &I); + void checkPtrToAddr(Type *SrcTy, Type *DestTy, const Value &V); + void visitPtrToAddrInst(PtrToAddrInst &I); void visitPtrToIntInst(PtrToIntInst &I); void visitBitCastInst(BitCastInst &I); void visitAddrSpaceCastInst(AddrSpaceCastInst &I); @@ -834,6 +836,7 @@ void Verifier::visitGlobalVariable(const GlobalVariable &GV) { &GV); Check(GV.getInitializer()->getType()->isSized(), "Global variable initializer must be sized", &GV); + visitConstantExprsRecursively(GV.getInitializer()); // If the global has common linkage, it must have a zero initializer and // cannot be constant. if (GV.hasCommonLinkage()) { @@ -2610,6 +2613,8 @@ void Verifier::visitConstantExpr(const ConstantExpr *CE) { Check(CastInst::castIsValid(Instruction::BitCast, CE->getOperand(0), CE->getType()), "Invalid bitcast", CE); + else if (CE->getOpcode() == Instruction::PtrToAddr) + checkPtrToAddr(CE->getOperand(0)->getType(), CE->getType(), *CE); } void Verifier::visitConstantPtrAuth(const ConstantPtrAuth *CPA) { @@ -3532,6 +3537,28 @@ void Verifier::visitFPToSIInst(FPToSIInst &I) { visitInstruction(I); } +void Verifier::checkPtrToAddr(Type *SrcTy, Type *DestTy, const Value &V) { + Check(SrcTy->isPtrOrPtrVectorTy(), "PtrToAddr source must be pointer", V); + Check(DestTy->isIntOrIntVectorTy(), "PtrToAddr result must be integral", V); + Check(SrcTy->isVectorTy() == DestTy->isVectorTy(), "PtrToAddr type mismatch", + V); + + if (SrcTy->isVectorTy()) { + auto *VSrc = cast<VectorType>(SrcTy); + auto *VDest = cast<VectorType>(DestTy); + Check(VSrc->getElementCount() == VDest->getElementCount(), + "PtrToAddr vector length mismatch", V); + } + + Type *AddrTy = DL.getAddressType(SrcTy); + Check(AddrTy == DestTy, "PtrToAddr result must be address width", V); +} + +void Verifier::visitPtrToAddrInst(PtrToAddrInst &I) { + checkPtrToAddr(I.getOperand(0)->getType(), I.getType(), I); + visitInstruction(I); +} + void Verifier::visitPtrToIntInst(PtrToIntInst &I) { // Get the source and destination types Type *SrcTy = I.getOperand(0)->getType(); @@ -3547,7 +3574,7 @@ void Verifier::visitPtrToIntInst(PtrToIntInst &I) { auto *VSrc = cast<VectorType>(SrcTy); auto *VDest = cast<VectorType>(DestTy); Check(VSrc->getElementCount() == VDest->getElementCount(), - "PtrToInt Vector width mismatch", &I); + "PtrToInt Vector length mismatch", &I); } visitInstruction(I); @@ -3567,7 +3594,7 @@ void Verifier::visitIntToPtrInst(IntToPtrInst &I) { auto *VSrc = cast<VectorType>(SrcTy); auto *VDest = cast<VectorType>(DestTy); Check(VSrc->getElementCount() == 
VDest->getElementCount(), - "IntToPtr Vector width mismatch", &I); + "IntToPtr Vector length mismatch", &I); } visitInstruction(I); } @@ -4609,7 +4636,7 @@ void Verifier::visitEHPadPredecessors(Instruction &I) { } // The edge may exit from zero or more nested pads. - SmallSet<Value *, 8> Seen; + SmallPtrSet<Value *, 8> Seen; for (;; FromPad = getParentPad(FromPad)) { Check(FromPad != ToPad, "EH pad cannot handle exceptions raised within it", FromPad, TI); @@ -4737,7 +4764,7 @@ void Verifier::visitFuncletPadInst(FuncletPadInst &FPI) { User *FirstUser = nullptr; Value *FirstUnwindPad = nullptr; SmallVector<FuncletPadInst *, 8> Worklist({&FPI}); - SmallSet<FuncletPadInst *, 8> Seen; + SmallPtrSet<FuncletPadInst *, 8> Seen; while (!Worklist.empty()) { FuncletPadInst *CurrentPad = Worklist.pop_back_val(); @@ -6612,6 +6639,36 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { "Value for inactive lanes must be a VGPR function argument", &Call); break; } + case Intrinsic::amdgcn_call_whole_wave: { + auto F = dyn_cast<Function>(Call.getArgOperand(0)); + Check(F, "Indirect whole wave calls are not allowed", &Call); + + CallingConv::ID CC = F->getCallingConv(); + Check(CC == CallingConv::AMDGPU_Gfx_WholeWave, + "Callee must have the amdgpu_gfx_whole_wave calling convention", + &Call); + + Check(!F->isVarArg(), "Variadic whole wave calls are not allowed", &Call); + + Check(Call.arg_size() == F->arg_size(), + "Call argument count must match callee argument count", &Call); + + // The first argument of the call is the callee, and the first argument of + // the callee is the active mask. The rest of the arguments must match. + Check(F->arg_begin()->getType()->isIntegerTy(1), + "Callee must have i1 as its first argument", &Call); + for (auto [CallArg, FuncArg] : + drop_begin(zip_equal(Call.args(), F->args()))) { + Check(CallArg->getType() == FuncArg.getType(), + "Argument types must match", &Call); + + // Check that inreg attributes match between call site and function + Check(Call.paramHasAttr(FuncArg.getArgNo(), Attribute::InReg) == + FuncArg.hasInRegAttr(), + "Argument inreg attributes must match", &Call); + } + break; + } case Intrinsic::amdgcn_s_prefetch_data: { Check( AMDGPU::isFlatGlobalAddrSpace( @@ -6770,7 +6827,7 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { } case Intrinsic::lifetime_start: case Intrinsic::lifetime_end: { - Value *Ptr = Call.getArgOperand(1); + Value *Ptr = Call.getArgOperand(0); Check(isa<AllocaInst>(Ptr) || isa<PoisonValue>(Ptr), "llvm.lifetime.start/end can only be used on alloca or poison", &Call); diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index 0323b4d..35d24c1 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -1422,7 +1422,7 @@ SmallVector<const char *> LTO::getRuntimeLibcallSymbols(const Triple &TT) { for (RTLIB::LibcallImpl Impl : LibcallImpls) { if (Impl != RTLIB::Unsupported) - LibcallSymbols.push_back(Libcalls.getLibcallImplName(Impl)); + LibcallSymbols.push_back(Libcalls.getLibcallImplName(Impl).data()); } return LibcallSymbols; diff --git a/llvm/lib/Linker/IRMover.cpp b/llvm/lib/Linker/IRMover.cpp index a466ce5..d6c15de 100644 --- a/llvm/lib/Linker/IRMover.cpp +++ b/llvm/lib/Linker/IRMover.cpp @@ -1133,8 +1133,11 @@ void IRLinker::linkNamedMDNodes() { NamedMDNode *DestNMD = DstM.getOrInsertNamedMetadata(NMD.getName()); // Add Src elements into Dest node. 
- for (const MDNode *Op : NMD.operands()) - DestNMD->addOperand(Mapper.mapMDNode(*Op)); + for (const MDNode *Op : NMD.operands()) { + MDNode *MD = Mapper.mapMDNode(*Op); + if (!is_contained(DestNMD->operands(), MD)) + DestNMD->addOperand(MD); + } } } diff --git a/llvm/lib/MC/CMakeLists.txt b/llvm/lib/MC/CMakeLists.txt index 18a85b3..1e1d0a6 100644 --- a/llvm/lib/MC/CMakeLists.txt +++ b/llvm/lib/MC/CMakeLists.txt @@ -45,6 +45,7 @@ add_llvm_component_library(LLVMMC MCSection.cpp MCSectionMachO.cpp MCStreamer.cpp + MCSFrame.cpp MCSPIRVStreamer.cpp MCSubtargetInfo.cpp MCSymbol.cpp diff --git a/llvm/lib/MC/ELFObjectWriter.cpp b/llvm/lib/MC/ELFObjectWriter.cpp index 8f3814a..759d3e0 100644 --- a/llvm/lib/MC/ELFObjectWriter.cpp +++ b/llvm/lib/MC/ELFObjectWriter.cpp @@ -541,12 +541,12 @@ void ELFWriter::computeSymbolTable(const RevGroupMapTy &RevGroupMap) { if (Symbol.isAbsolute()) { MSD.SectionIndex = ELF::SHN_ABS; } else if (Symbol.isCommon()) { - if (Symbol.isTargetCommon()) { - MSD.SectionIndex = Symbol.getIndex(); - } else { + auto Shndx = Symbol.getIndex(); + if (!Shndx) { assert(!Local); - MSD.SectionIndex = ELF::SHN_COMMON; + Shndx = ELF::SHN_COMMON; } + MSD.SectionIndex = Shndx; } else if (Symbol.isUndefined()) { if (Symbol.isSignature() && !Symbol.isUsedInReloc()) { MSD.SectionIndex = RevGroupMap.lookup(&Symbol); diff --git a/llvm/lib/MC/GOFFObjectWriter.cpp b/llvm/lib/MC/GOFFObjectWriter.cpp index 3b629cd..d68f4af 100644 --- a/llvm/lib/MC/GOFFObjectWriter.cpp +++ b/llvm/lib/MC/GOFFObjectWriter.cpp @@ -17,7 +17,6 @@ #include "llvm/MC/MCSectionGOFF.h" #include "llvm/MC/MCSymbolGOFF.h" #include "llvm/MC/MCValue.h" -#include "llvm/Support/Casting.h" #include "llvm/Support/ConvertEBCDIC.h" #include "llvm/Support/Debug.h" #include "llvm/Support/Endian.h" diff --git a/llvm/lib/MC/MCAsmInfoGOFF.cpp b/llvm/lib/MC/MCAsmInfoGOFF.cpp index 0a5d1927..092736b 100644 --- a/llvm/lib/MC/MCAsmInfoGOFF.cpp +++ b/llvm/lib/MC/MCAsmInfoGOFF.cpp @@ -62,6 +62,8 @@ static void emitCATTR(raw_ostream &OS, StringRef Name, GOFF::ESDRmode Rmode, OS << ','; OS << "RMODE("; switch (Rmode) { + case GOFF::ESD_RMODE_None: + llvm_unreachable(""); case GOFF::ESD_RMODE_24: OS << "24"; break; @@ -71,8 +73,6 @@ static void emitCATTR(raw_ostream &OS, StringRef Name, GOFF::ESDRmode Rmode, case GOFF::ESD_RMODE_64: OS << "64"; break; - case GOFF::ESD_RMODE_None: - break; } OS << ')'; } diff --git a/llvm/lib/MC/MCAsmStreamer.cpp b/llvm/lib/MC/MCAsmStreamer.cpp index 9a5e070..89e541a 100644 --- a/llvm/lib/MC/MCAsmStreamer.cpp +++ b/llvm/lib/MC/MCAsmStreamer.cpp @@ -28,7 +28,6 @@ #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbolXCOFF.h" #include "llvm/MC/TargetRegistry.h" -#include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Format.h" #include "llvm/Support/FormattedStream.h" diff --git a/llvm/lib/MC/MCContext.cpp b/llvm/lib/MC/MCContext.cpp index 5e364e9..1d211a1 100644 --- a/llvm/lib/MC/MCContext.cpp +++ b/llvm/lib/MC/MCContext.cpp @@ -42,7 +42,6 @@ #include "llvm/MC/MCSymbolXCOFF.h" #include "llvm/MC/MCTargetOptions.h" #include "llvm/MC/SectionKind.h" -#include "llvm/Support/Casting.h" #include "llvm/Support/EndianStream.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MemoryBuffer.h" diff --git a/llvm/lib/MC/MCELFStreamer.cpp b/llvm/lib/MC/MCELFStreamer.cpp index 275e76e..2881d7c 100644 --- a/llvm/lib/MC/MCELFStreamer.cpp +++ b/llvm/lib/MC/MCELFStreamer.cpp @@ -29,7 +29,6 @@ #include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCSymbolELF.h" #include 
"llvm/MC/TargetRegistry.h" -#include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/LEB128.h" #include <cassert> diff --git a/llvm/lib/MC/MCFragment.cpp b/llvm/lib/MC/MCFragment.cpp index 6cbdf74..21da79b 100644 --- a/llvm/lib/MC/MCFragment.cpp +++ b/llvm/lib/MC/MCFragment.cpp @@ -68,6 +68,8 @@ LLVM_DUMP_METHOD void MCFragment::dump() const { OS << "\n Fixup @" << F.getOffset() << " Value:"; F.getValue()->print(OS, nullptr); OS << " Kind:" << F.getKind(); + if (F.isLinkerRelaxable()) + OS << " LinkerRelaxable"; } }; diff --git a/llvm/lib/MC/MCObjectFileInfo.cpp b/llvm/lib/MC/MCObjectFileInfo.cpp index 4ac73ab..d505ac6 100644 --- a/llvm/lib/MC/MCObjectFileInfo.cpp +++ b/llvm/lib/MC/MCObjectFileInfo.cpp @@ -10,6 +10,7 @@ #include "llvm/ADT/StringExtras.h" #include "llvm/BinaryFormat/COFF.h" #include "llvm/BinaryFormat/ELF.h" +#include "llvm/BinaryFormat/SFrame.h" #include "llvm/BinaryFormat/Wasm.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" @@ -23,7 +24,6 @@ #include "llvm/MC/MCSectionSPIRV.h" #include "llvm/MC/MCSectionWasm.h" #include "llvm/MC/MCSectionXCOFF.h" -#include "llvm/Support/Casting.h" #include "llvm/TargetParser/Triple.h" using namespace llvm; @@ -380,6 +380,19 @@ void MCObjectFileInfo::initELFMCObjectFileInfo(const Triple &T, bool Large) { unsigned EHSectionType = T.getArch() == Triple::x86_64 ? ELF::SHT_X86_64_UNWIND : ELF::SHT_PROGBITS; + switch (T.getArch()) { + case Triple::x86_64: + SFrameABIArch = sframe::ABI::AMD64EndianLittle; + break; + case Triple::aarch64: + SFrameABIArch = sframe::ABI::AArch64EndianLittle; + break; + case Triple::aarch64_be: + SFrameABIArch = sframe::ABI::AArch64EndianBig; + break; + default: + break; + } // Solaris requires different flags for .eh_frame to seemingly every other // platform. @@ -537,6 +550,9 @@ void MCObjectFileInfo::initELFMCObjectFileInfo(const Triple &T, bool Large) { EHFrameSection = Ctx->getELFSection(".eh_frame", EHSectionType, EHSectionFlags); + SFrameSection = + Ctx->getELFSection(".sframe", ELF::SHT_GNU_SFRAME, ELF::SHF_ALLOC); + CallGraphSection = Ctx->getELFSection(".callgraph", ELF::SHT_PROGBITS, 0); StackSizesSection = Ctx->getELFSection(".stack_sizes", ELF::SHT_PROGBITS, 0); @@ -1064,6 +1080,7 @@ void MCObjectFileInfo::initMCObjectFileInfo(MCContext &MCCtx, bool PIC, CompactUnwindDwarfEHFrameOnly = 0; EHFrameSection = nullptr; // Created on demand. + SFrameSection = nullptr; // Created on demand. CompactUnwindSection = nullptr; // Used only by selected targets. DwarfAccelNamesSection = nullptr; // Used only by selected targets. DwarfAccelObjCSection = nullptr; // Used only by selected targets. 
diff --git a/llvm/lib/MC/MCObjectStreamer.cpp b/llvm/lib/MC/MCObjectStreamer.cpp index d0c6144..59265bc 100644 --- a/llvm/lib/MC/MCObjectStreamer.cpp +++ b/llvm/lib/MC/MCObjectStreamer.cpp @@ -17,6 +17,7 @@ #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCObjectFileInfo.h" #include "llvm/MC/MCObjectWriter.h" +#include "llvm/MC/MCSFrame.h" #include "llvm/MC/MCSection.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Support/ErrorHandling.h" @@ -30,7 +31,7 @@ MCObjectStreamer::MCObjectStreamer(MCContext &Context, : MCStreamer(Context), Assembler(std::make_unique<MCAssembler>( Context, std::move(TAB), std::move(Emitter), std::move(OW))), - EmitEHFrame(true), EmitDebugFrame(false) { + EmitEHFrame(true), EmitDebugFrame(false), EmitSFrame(false) { assert(Assembler->getBackendPtr() && Assembler->getEmitterPtr()); IsObj = true; setAllowAutoPadding(Assembler->getBackend().allowAutoPadding()); @@ -186,6 +187,10 @@ void MCObjectStreamer::emitFrames(MCAsmBackend *MAB) { if (EmitDebugFrame) MCDwarfFrameEmitter::Emit(*this, MAB, false); + + if (EmitSFrame || (getContext().getTargetOptions() && + getContext().getTargetOptions()->EmitSFrameUnwind)) + MCSFrameEmitter::emit(*this); } void MCObjectStreamer::visitUsedSymbol(const MCSymbol &Sym) { @@ -461,11 +466,23 @@ void MCObjectStreamer::emitInstToFragment(const MCInst &Inst, getAssembler().getEmitter().encodeInstruction(Inst, Data, Fixups, STI); F->Kind = MCFragment::FT_Relaxable; - F->STI = &STI; - F->HasInstructions = true; + F->setHasInstructions(STI); + F->setVarContents(Data); - F->setVarFixups(Fixups); F->setInst(Inst); + + bool MarkedLinkerRelaxable = false; + for (auto &Fixup : Fixups) { + if (!Fixup.isLinkerRelaxable() || MarkedLinkerRelaxable) + continue; + MarkedLinkerRelaxable = true; + auto *Sec = F->getParent(); + if (!Sec->isLinkerRelaxable()) + Sec->setFirstLinkerRelaxable(F->getLayoutOrder()); + F->setLinkerRelaxable(); + } + F->setVarFixups(Fixups); + newFragment(); } diff --git a/llvm/lib/MC/MCParser/COFFMasmParser.cpp b/llvm/lib/MC/MCParser/COFFMasmParser.cpp index 229b0b8..1bb617b 100644 --- a/llvm/lib/MC/MCParser/COFFMasmParser.cpp +++ b/llvm/lib/MC/MCParser/COFFMasmParser.cpp @@ -18,7 +18,6 @@ #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbolCOFF.h" #include "llvm/MC/SectionKind.h" -#include "llvm/Support/Casting.h" #include "llvm/Support/SMLoc.h" #include <cstdint> #include <utility> diff --git a/llvm/lib/MC/MCParser/ELFAsmParser.cpp b/llvm/lib/MC/MCParser/ELFAsmParser.cpp index 6782c4b..513f3b3 100644 --- a/llvm/lib/MC/MCParser/ELFAsmParser.cpp +++ b/llvm/lib/MC/MCParser/ELFAsmParser.cpp @@ -22,7 +22,6 @@ #include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCSymbolELF.h" #include "llvm/MC/SectionKind.h" -#include "llvm/Support/Casting.h" #include "llvm/Support/SMLoc.h" #include <cassert> #include <cstdint> diff --git a/llvm/lib/MC/MCParser/WasmAsmParser.cpp b/llvm/lib/MC/MCParser/WasmAsmParser.cpp index 6c2d241..ddfe1e1 100644 --- a/llvm/lib/MC/MCParser/WasmAsmParser.cpp +++ b/llvm/lib/MC/MCParser/WasmAsmParser.cpp @@ -26,7 +26,6 @@ #include "llvm/MC/MCSectionWasm.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbolWasm.h" -#include "llvm/Support/Casting.h" #include <optional> using namespace llvm; diff --git a/llvm/lib/MC/MCSFrame.cpp b/llvm/lib/MC/MCSFrame.cpp new file mode 100644 index 0000000..447f22e --- /dev/null +++ b/llvm/lib/MC/MCSFrame.cpp @@ -0,0 +1,98 @@ +//===- lib/MC/MCSFrame.cpp - MCSFrame implementation ----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/MC/MCSFrame.h" +#include "llvm/BinaryFormat/SFrame.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCObjectFileInfo.h" +#include "llvm/MC/MCObjectStreamer.h" +#include "llvm/MC/MCSection.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/EndianStream.h" + +using namespace llvm; +using namespace sframe; + +namespace { + +// Emitting these field-by-field, instead of constructing the actual structures +// lets Streamer do target endian-fixups for free. + +class SFrameEmitterImpl { + MCObjectStreamer &Streamer; + ABI SFrameABI; + MCSymbol *FDESubSectionStart; + MCSymbol *FRESubSectionStart; + MCSymbol *FRESubSectionEnd; + +public: + SFrameEmitterImpl(MCObjectStreamer &Streamer) : Streamer(Streamer) { + assert(Streamer.getContext() + .getObjectFileInfo() + ->getSFrameABIArch() + .has_value()); + SFrameABI = *Streamer.getContext().getObjectFileInfo()->getSFrameABIArch(); + FDESubSectionStart = Streamer.getContext().createTempSymbol(); + FRESubSectionStart = Streamer.getContext().createTempSymbol(); + FRESubSectionEnd = Streamer.getContext().createTempSymbol(); + } + + void emitPreamble() { + Streamer.emitInt16(Magic); + Streamer.emitInt8(static_cast<uint8_t>(Version::V2)); + Streamer.emitInt8(0); + } + + void emitHeader() { + emitPreamble(); + // sfh_abi_arch + Streamer.emitInt8(static_cast<uint8_t>(SFrameABI)); + // sfh_cfa_fixed_fp_offset + Streamer.emitInt8(0); + // sfh_cfa_fixed_ra_offset + Streamer.emitInt8(0); + // sfh_auxhdr_len + Streamer.emitInt8(0); + // shf_num_fdes + Streamer.emitInt32(0); + // shf_num_fres + Streamer.emitInt32(0); + // shf_fre_len + Streamer.emitAbsoluteSymbolDiff(FRESubSectionEnd, FRESubSectionStart, + sizeof(int32_t)); + // shf_fdeoff. With no sfh_auxhdr, these immediately follow this header. + Streamer.emitInt32(0); + // shf_freoff + Streamer.emitAbsoluteSymbolDiff(FRESubSectionStart, FDESubSectionStart, + sizeof(uint32_t)); + } + + void emitFDEs() { Streamer.emitLabel(FDESubSectionStart); } + + void emitFREs() { + Streamer.emitLabel(FRESubSectionStart); + Streamer.emitLabel(FRESubSectionEnd); + } +}; + +} // end anonymous namespace + +void MCSFrameEmitter::emit(MCObjectStreamer &Streamer) { + MCContext &Context = Streamer.getContext(); + SFrameEmitterImpl Emitter(Streamer); + + MCSection *Section = Context.getObjectFileInfo()->getSFrameSection(); + // Not strictly necessary, but gas always aligns to 8, so match that. 
+ Section->ensureMinAlignment(Align(8)); + Streamer.switchSection(Section); + MCSymbol *SectionStart = Context.createTempSymbol(); + Streamer.emitLabel(SectionStart); + Emitter.emitHeader(); + Emitter.emitFDEs(); + Emitter.emitFREs(); +} diff --git a/llvm/lib/MC/MCSection.cpp b/llvm/lib/MC/MCSection.cpp index 9ed6fd1..a668e79 100644 --- a/llvm/lib/MC/MCSection.cpp +++ b/llvm/lib/MC/MCSection.cpp @@ -39,6 +39,8 @@ LLVM_DUMP_METHOD void MCSection::dump( raw_ostream &OS = errs(); OS << "MCSection Name:" << getName(); + if (isLinkerRelaxable()) + OS << " FirstLinkerRelaxable:" << firstLinkerRelaxable(); for (auto &F : *this) { OS << '\n'; F.dump(); diff --git a/llvm/lib/MC/MCSymbol.cpp b/llvm/lib/MC/MCSymbol.cpp index 8192896..b868738 100644 --- a/llvm/lib/MC/MCSymbol.cpp +++ b/llvm/lib/MC/MCSymbol.cpp @@ -20,6 +20,10 @@ using namespace llvm; +// There are numerous MCSymbol objects, so keeping sizeof(MCSymbol) small is +// crucial for minimizing peak memory usage. +static_assert(sizeof(MCSymbol) <= 24, "Keep the base symbol small"); + // Only the address of this fragment is ever actually used. static MCFragment SentinelFragment; @@ -44,13 +48,12 @@ void *MCSymbol::operator new(size_t s, const MCSymbolTableEntry *Name, } void MCSymbol::setVariableValue(const MCExpr *Value) { - assert(Value && "Invalid variable value!"); - assert((SymbolContents == SymContentsUnset || - SymbolContents == SymContentsVariable) && - "Cannot give common/offset symbol a variable value"); + assert(Value && "Invalid equated expression"); + assert((kind == Kind::Regular || kind == Kind::Equated) && + "Cannot equate a common symbol"); this->Value = Value; - SymbolContents = SymContentsVariable; - setUndefined(); + kind = Kind::Equated; + Fragment = nullptr; } void MCSymbol::print(raw_ostream &OS, const MCAsmInfo *MAI) const { diff --git a/llvm/lib/MC/MCWasmStreamer.cpp b/llvm/lib/MC/MCWasmStreamer.cpp index 9c8b224..070b3d9 100644 --- a/llvm/lib/MC/MCWasmStreamer.cpp +++ b/llvm/lib/MC/MCWasmStreamer.cpp @@ -22,7 +22,6 @@ #include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCSymbolWasm.h" #include "llvm/MC/TargetRegistry.h" -#include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" namespace llvm { diff --git a/llvm/lib/MC/MCWinCOFFStreamer.cpp b/llvm/lib/MC/MCWinCOFFStreamer.cpp index a45936b..2e632de 100644 --- a/llvm/lib/MC/MCWinCOFFStreamer.cpp +++ b/llvm/lib/MC/MCWinCOFFStreamer.cpp @@ -30,7 +30,6 @@ #include "llvm/MC/MCTargetOptions.h" #include "llvm/MC/MCValue.h" #include "llvm/MC/MCWinCOFFObjectWriter.h" -#include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/SMLoc.h" diff --git a/llvm/lib/MC/MCXCOFFStreamer.cpp b/llvm/lib/MC/MCXCOFFStreamer.cpp index a0e3dba..684e05a 100644 --- a/llvm/lib/MC/MCXCOFFStreamer.cpp +++ b/llvm/lib/MC/MCXCOFFStreamer.cpp @@ -21,7 +21,6 @@ #include "llvm/MC/MCSymbolXCOFF.h" #include "llvm/MC/MCXCOFFObjectWriter.h" #include "llvm/MC/TargetRegistry.h" -#include "llvm/Support/Casting.h" using namespace llvm; diff --git a/llvm/lib/MC/XCOFFObjectWriter.cpp b/llvm/lib/MC/XCOFFObjectWriter.cpp index 13917ba..fce6b2a 100644 --- a/llvm/lib/MC/XCOFFObjectWriter.cpp +++ b/llvm/lib/MC/XCOFFObjectWriter.cpp @@ -20,7 +20,6 @@ #include "llvm/MC/MCValue.h" #include "llvm/MC/MCXCOFFObjectWriter.h" #include "llvm/MC/StringTableBuilder.h" -#include "llvm/Support/Casting.h" #include "llvm/Support/EndianStream.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" diff --git 
a/llvm/lib/MCA/Instruction.cpp b/llvm/lib/MCA/Instruction.cpp index d4adfce..7966708 100644 --- a/llvm/lib/MCA/Instruction.cpp +++ b/llvm/lib/MCA/Instruction.cpp @@ -128,6 +128,13 @@ void WriteState::dump() const { } #endif +#ifndef NDEBUG +void ReadState::dump() const { + dbgs() << "{ OpIdx=" << RD->OpIndex << ", RegID " << getRegisterID() + << ", Cycles Left=" << CyclesLeft << " }"; +} +#endif + const CriticalDependency &Instruction::computeCriticalRegDep() { if (CriticalRegDep.Cycles) return CriticalRegDep; diff --git a/llvm/lib/Object/IRSymtab.cpp b/llvm/lib/Object/IRSymtab.cpp index 0f19495..0043f02 100644 --- a/llvm/lib/Object/IRSymtab.cpp +++ b/llvm/lib/Object/IRSymtab.cpp @@ -46,7 +46,7 @@ static cl::opt<bool> DisableBitcodeVersionUpgrade( "disable-bitcode-version-upgrade", cl::Hidden, cl::desc("Disable automatic bitcode upgrade for version mismatch")); -static const char *PreservedSymbols[] = { +static constexpr StringLiteral PreservedSymbols[] = { // There are global variables, so put it here instead of in // RuntimeLibcalls.td. // TODO: Are there similar such variables? @@ -54,6 +54,10 @@ static const char *PreservedSymbols[] = { "__stack_chk_guard", }; +static bool isPreservedGlobalVarName(StringRef Name) { + return PreservedSymbols[0] == Name || PreservedSymbols[1] == Name; +} + namespace { const char *getExpectedProducerName() { @@ -81,12 +85,16 @@ struct Builder { // The StringTableBuilder does not create a copy of any strings added to it, // so this provides somewhere to store any strings that we create. Builder(SmallVector<char, 0> &Symtab, StringTableBuilder &StrtabBuilder, - BumpPtrAllocator &Alloc) - : Symtab(Symtab), StrtabBuilder(StrtabBuilder), Saver(Alloc) {} + BumpPtrAllocator &Alloc, const Triple &TT) + : Symtab(Symtab), StrtabBuilder(StrtabBuilder), Saver(Alloc), TT(TT), + Libcalls(TT) {} DenseMap<const Comdat *, int> ComdatMap; Mangler Mang; - Triple TT; + const Triple &TT; + + // FIXME: This shouldn't be here. + RTLIB::RuntimeLibcallsInfo Libcalls; std::vector<storage::Comdat> Comdats; std::vector<storage::Module> Mods; @@ -98,6 +106,10 @@ struct Builder { std::vector<storage::Str> DependentLibraries; + bool isPreservedLibFuncName(StringRef Name) { + return Libcalls.getSupportedLibcallImpl(Name) != RTLIB::Unsupported; + } + void setStr(storage::Str &S, StringRef Value) { S.Offset = StrtabBuilder.add(Value); S.Size = Value.size(); @@ -213,19 +225,6 @@ Expected<int> Builder::getComdatIndex(const Comdat *C, const Module *M) { return P.first->second; } -static StringSet<> buildPreservedSymbolsSet(const Triple &TT) { - StringSet<> PreservedSymbolSet; - PreservedSymbolSet.insert(std::begin(PreservedSymbols), - std::end(PreservedSymbols)); - // FIXME: Do we need to pass in ABI fields from TargetOptions? 
- RTLIB::RuntimeLibcallsInfo Libcalls(TT); - for (RTLIB::LibcallImpl Impl : Libcalls.getLibcallImpls()) { - if (Impl != RTLIB::Unsupported) - PreservedSymbolSet.insert(Libcalls.getLibcallImplName(Impl)); - } - return PreservedSymbolSet; -} - Error Builder::addSymbol(const ModuleSymbolTable &Msymtab, const SmallPtrSet<GlobalValue *, 4> &Used, ModuleSymbolTable::Symbol Msym) { @@ -279,13 +278,11 @@ Error Builder::addSymbol(const ModuleSymbolTable &Msymtab, return Error::success(); } - setStr(Sym.IRName, GV->getName()); - - static const StringSet<> PreservedSymbolsSet = - buildPreservedSymbolsSet(GV->getParent()->getTargetTriple()); - bool IsPreservedSymbol = PreservedSymbolsSet.contains(GV->getName()); + StringRef GVName = GV->getName(); + setStr(Sym.IRName, GVName); - if (Used.count(GV) || IsPreservedSymbol) + if (Used.count(GV) || isPreservedLibFuncName(GVName) || + isPreservedGlobalVarName(GVName)) Sym.Flags |= 1 << storage::Symbol::FB_used; if (GV->isThreadLocal()) Sym.Flags |= 1 << storage::Symbol::FB_tls; @@ -352,7 +349,6 @@ Error Builder::build(ArrayRef<Module *> IRMods) { setStr(Hdr.Producer, kExpectedProducerName); setStr(Hdr.TargetTriple, IRMods[0]->getTargetTriple().str()); setStr(Hdr.SourceFileName, IRMods[0]->getSourceFileName()); - TT = IRMods[0]->getTargetTriple(); for (auto *M : IRMods) if (Error Err = addModule(M)) @@ -378,7 +374,8 @@ Error Builder::build(ArrayRef<Module *> IRMods) { Error irsymtab::build(ArrayRef<Module *> Mods, SmallVector<char, 0> &Symtab, StringTableBuilder &StrtabBuilder, BumpPtrAllocator &Alloc) { - return Builder(Symtab, StrtabBuilder, Alloc).build(Mods); + const Triple &TT = Mods[0]->getTargetTriple(); + return Builder(Symtab, StrtabBuilder, Alloc, TT).build(Mods); } // Upgrade a vector of bitcode modules created by an old version of LLVM by diff --git a/llvm/lib/Object/SFrameParser.cpp b/llvm/lib/Object/SFrameParser.cpp index 5863490..759b579 100644 --- a/llvm/lib/Object/SFrameParser.cpp +++ b/llvm/lib/Object/SFrameParser.cpp @@ -32,14 +32,25 @@ getDataSlice(ArrayRef<uint8_t> Data, uint64_t Offset, uint64_t Size) { } template <typename T> -static Expected<const T &> getDataSliceAs(ArrayRef<uint8_t> Data, - uint64_t Offset) { +static Expected<ArrayRef<T>> +getDataSliceAsArrayOf(ArrayRef<uint8_t> Data, uint64_t Offset, uint64_t Count) { static_assert(std::is_trivial_v<T>); - Expected<ArrayRef<uint8_t>> Slice = getDataSlice(Data, Offset, sizeof(T)); + Expected<ArrayRef<uint8_t>> Slice = + getDataSlice(Data, Offset, sizeof(T) * Count); if (!Slice) return Slice.takeError(); - return *reinterpret_cast<const T *>(Slice->data()); + return ArrayRef(reinterpret_cast<const T *>(Slice->data()), Count); +} + +template <typename T> +static Expected<const T &> getDataSliceAs(ArrayRef<uint8_t> Data, + uint64_t Offset) { + Expected<ArrayRef<T>> Array = getDataSliceAsArrayOf<T>(Data, Offset, 1); + if (!Array) + return Array.takeError(); + + return Array->front(); } template <endianness E> @@ -87,17 +98,134 @@ uint64_t SFrameParser<E>::getAbsoluteStartAddress( uint64_t Result = SectionAddress + FDE->StartAddress; if ((getPreamble().Flags.value() & sframe::Flags::FDEFuncStartPCRel) == - sframe::Flags::FDEFuncStartPCRel) { - uintptr_t DataPtr = reinterpret_cast<uintptr_t>(Data.data()); - uintptr_t FDEPtr = reinterpret_cast<uintptr_t>(&*FDE); + sframe::Flags::FDEFuncStartPCRel) + Result += offsetOf(FDE); + + return Result; +} - assert(DataPtr <= FDEPtr && FDEPtr < DataPtr + Data.size() && - "Iterator does not belong to this object!"); +template <endianness E> +uint64_t 
SFrameParser<E>::offsetOf(typename FDERange::iterator FDE) const { + uintptr_t DataPtr = reinterpret_cast<uintptr_t>(Data.data()); + uintptr_t FDEPtr = reinterpret_cast<uintptr_t>(&*FDE); + + assert(DataPtr <= FDEPtr && FDEPtr < DataPtr + Data.size() && + "Iterator does not belong to this object!"); + return FDEPtr - DataPtr; +} - Result += FDEPtr - DataPtr; +template <typename EndianT> +static Error readArray(ArrayRef<uint8_t> Data, uint64_t Count, uint64_t &Offset, + SmallVectorImpl<int32_t> &Vec) { + Expected<ArrayRef<EndianT>> RawArray = + getDataSliceAsArrayOf<EndianT>(Data, Offset, Count); + if (!RawArray) + return RawArray.takeError(); + Offset += Count * sizeof(EndianT); + Vec.resize(Count); + llvm::copy(*RawArray, Vec.begin()); + return Error::success(); +} + +template <typename T, endianness E> +static Error readFRE(ArrayRef<uint8_t> Data, uint64_t &Offset, + typename SFrameParser<E>::FrameRowEntry &FRE) { + Expected<sframe::FrameRowEntry<T, E>> RawFRE = + getDataSliceAs<sframe::FrameRowEntry<T, E>>(Data, Offset); + if (!RawFRE) + return RawFRE.takeError(); + + Offset += sizeof(*RawFRE); + FRE.StartAddress = RawFRE->StartAddress; + FRE.Info.Info = RawFRE->Info.Info; + + switch (FRE.Info.getOffsetSize()) { + case sframe::FREOffset::B1: + return readArray<sframe::detail::packed<int8_t, E>>( + Data, FRE.Info.getOffsetCount(), Offset, FRE.Offsets); + case sframe::FREOffset::B2: + return readArray<sframe::detail::packed<int16_t, E>>( + Data, FRE.Info.getOffsetCount(), Offset, FRE.Offsets); + case sframe::FREOffset::B4: + return readArray<sframe::detail::packed<int32_t, E>>( + Data, FRE.Info.getOffsetCount(), Offset, FRE.Offsets); } + return createError(formatv("unsupported FRE offset size {0} at offset {1:x+}", + static_cast<unsigned>(FRE.Info.getOffsetSize()), + Offset)); +} - return Result; +template <endianness E> Error SFrameParser<E>::FallibleFREIterator::inc() { + if (++Idx == Size) + return Error::success(); + + switch (FREType) { + case sframe::FREType::Addr1: + return readFRE<uint8_t, E>(Data, Offset, FRE); + case sframe::FREType::Addr2: + return readFRE<uint16_t, E>(Data, Offset, FRE); + case sframe::FREType::Addr4: + return readFRE<uint32_t, E>(Data, Offset, FRE); + } + return createError(formatv("unsupported FRE type {0} at offset {1:x+}", + static_cast<unsigned>(FREType), Offset)); +} + +template <endianness E> +iterator_range<typename SFrameParser<E>::fre_iterator> +SFrameParser<E>::fres(const sframe::FuncDescEntry<E> &FDE, Error &Err) const { + uint64_t Offset = getFREBase() + FDE.StartFREOff; + fre_iterator BeforeBegin = make_fallible_itr( + FallibleFREIterator(Data, FDE.Info.getFREType(), -1, FDE.NumFREs, Offset), + Err); + fre_iterator End = make_fallible_end( + FallibleFREIterator(Data, FDE.Info.getFREType(), FDE.NumFREs, FDE.NumFREs, + /*Offset=*/0)); + return {++BeforeBegin, End}; +} + +static std::optional<int32_t> getOffset(ArrayRef<int32_t> Offsets, size_t Idx) { + if (Offsets.size() > Idx) + return Offsets[Idx]; + return std::nullopt; +} + +// The interpretation of offsets is ABI-specific. The implementation of this and +// the following functions may need to be adjusted when adding support for a new +// ABI. 
+template <endianness E> +std::optional<int32_t> +SFrameParser<E>::getCFAOffset(const FrameRowEntry &FRE) const { + return getOffset(FRE.Offsets, 0); +} + +template <endianness E> +std::optional<int32_t> +SFrameParser<E>::getRAOffset(const FrameRowEntry &FRE) const { + if (usesFixedRAOffset()) + return Header.CFAFixedRAOffset; + return getOffset(FRE.Offsets, 1); +} + +template <endianness E> +std::optional<int32_t> +SFrameParser<E>::getFPOffset(const FrameRowEntry &FRE) const { + if (usesFixedFPOffset()) + return Header.CFAFixedFPOffset; + return getOffset(FRE.Offsets, usesFixedRAOffset() ? 1 : 2); +} + +template <endianness E> +ArrayRef<int32_t> +SFrameParser<E>::getExtraOffsets(const FrameRowEntry &FRE) const { + size_t UsedOffsets = 1; // CFA + if (!usesFixedRAOffset()) + ++UsedOffsets; + if (!usesFixedFPOffset()) + ++UsedOffsets; + if (FRE.Offsets.size() > UsedOffsets) + return ArrayRef<int32_t>(FRE.Offsets).drop_front(UsedOffsets); + return {}; } template class LLVM_EXPORT_TEMPLATE llvm::object::SFrameParser<endianness::big>; diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index f810368..b7edeea 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -124,7 +124,6 @@ #include "llvm/CodeGen/MachineCopyPropagation.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunctionAnalysis.h" -#include "llvm/CodeGen/MachineInstrBundle.h" #include "llvm/CodeGen/MachineLICM.h" #include "llvm/CodeGen/MachineLateInstrsCleanup.h" #include "llvm/CodeGen/MachinePassManager.h" diff --git a/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp b/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp index cdf4412..fc2577e 100644 --- a/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp +++ b/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp @@ -519,7 +519,7 @@ Error InstrProfSymtab::create(SectionRef &Section) { return Error::success(); } -StringRef InstrProfSymtab::getFuncName(uint64_t Pointer, size_t Size) { +StringRef InstrProfSymtab::getFuncName(uint64_t Pointer, size_t Size) const { if (Pointer < Address) return StringRef(); auto Offset = Pointer - Address; diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp index 5425729..7885e12 100644 --- a/llvm/lib/ProfileData/InstrProf.cpp +++ b/llvm/lib/ProfileData/InstrProf.cpp @@ -684,13 +684,13 @@ Error InstrProfSymtab::addFuncWithName(Function &F, StringRef PGOFuncName, return Error::success(); } -uint64_t InstrProfSymtab::getVTableHashFromAddress(uint64_t Address) { +uint64_t InstrProfSymtab::getVTableHashFromAddress(uint64_t Address) const { // Given a runtime address, look up the hash value in the interval map, and // fallback to value 0 if a hash value is not found. 
return VTableAddrMap.lookup(Address, 0); } -uint64_t InstrProfSymtab::getFunctionHashFromAddress(uint64_t Address) { +uint64_t InstrProfSymtab::getFunctionHashFromAddress(uint64_t Address) const { finalizeSymtab(); auto It = partition_point(AddrToMD5Map, [=](std::pair<uint64_t, uint64_t> A) { return A.first < Address; diff --git a/llvm/lib/ProfileData/InstrProfWriter.cpp b/llvm/lib/ProfileData/InstrProfWriter.cpp index 7ca26aa..a347351 100644 --- a/llvm/lib/ProfileData/InstrProfWriter.cpp +++ b/llvm/lib/ProfileData/InstrProfWriter.cpp @@ -13,7 +13,6 @@ #include "llvm/ProfileData/InstrProfWriter.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SetVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/IR/ProfileSummary.h" #include "llvm/ProfileData/DataAccessProf.h" @@ -331,61 +330,34 @@ void InstrProfWriter::addDataAccessProfData( DataAccessProfileData = std::move(DataAccessProfDataIn); } -void InstrProfWriter::addTemporalProfileTrace(TemporalProfTraceTy Trace) { - assert(Trace.FunctionNameRefs.size() <= MaxTemporalProfTraceLength); - assert(!Trace.FunctionNameRefs.empty()); - if (TemporalProfTraceStreamSize < TemporalProfTraceReservoirSize) { - // Simply append the trace if we have not yet hit our reservoir size limit. - TemporalProfTraces.push_back(std::move(Trace)); - } else { - // Otherwise, replace a random trace in the stream. - std::uniform_int_distribution<uint64_t> Distribution( - 0, TemporalProfTraceStreamSize); - uint64_t RandomIndex = Distribution(RNG); - if (RandomIndex < TemporalProfTraces.size()) - TemporalProfTraces[RandomIndex] = std::move(Trace); - } - ++TemporalProfTraceStreamSize; -} - void InstrProfWriter::addTemporalProfileTraces( SmallVectorImpl<TemporalProfTraceTy> &SrcTraces, uint64_t SrcStreamSize) { + if (TemporalProfTraces.size() > TemporalProfTraceReservoirSize) + TemporalProfTraces.truncate(TemporalProfTraceReservoirSize); for (auto &Trace : SrcTraces) if (Trace.FunctionNameRefs.size() > MaxTemporalProfTraceLength) Trace.FunctionNameRefs.resize(MaxTemporalProfTraceLength); llvm::erase_if(SrcTraces, [](auto &T) { return T.FunctionNameRefs.empty(); }); - // Assume that the source has the same reservoir size as the destination to - // avoid needing to record it in the indexed profile format. - bool IsDestSampled = - (TemporalProfTraceStreamSize > TemporalProfTraceReservoirSize); - bool IsSrcSampled = (SrcStreamSize > TemporalProfTraceReservoirSize); - if (!IsDestSampled && IsSrcSampled) { - // If one of the traces are sampled, ensure that it belongs to Dest. - std::swap(TemporalProfTraces, SrcTraces); - std::swap(TemporalProfTraceStreamSize, SrcStreamSize); - std::swap(IsDestSampled, IsSrcSampled); - } - if (!IsSrcSampled) { - // If the source stream is not sampled, we add each source trace normally. - for (auto &Trace : SrcTraces) - addTemporalProfileTrace(std::move(Trace)); + // If there are no source traces, it is probably because + // --temporal-profile-max-trace-length=0 was set to deliberately remove all + // traces. In that case, we do not want to increase the stream size + if (SrcTraces.empty()) return; - } - // Otherwise, we find the traces that would have been removed if we added - // the whole source stream. 
- SmallSetVector<uint64_t, 8> IndicesToReplace; - for (uint64_t I = 0; I < SrcStreamSize; I++) { - std::uniform_int_distribution<uint64_t> Distribution( - 0, TemporalProfTraceStreamSize); + // Add traces until our reservoir is full or we run out of source traces + auto SrcTraceIt = SrcTraces.begin(); + while (TemporalProfTraces.size() < TemporalProfTraceReservoirSize && + SrcTraceIt < SrcTraces.end()) + TemporalProfTraces.push_back(*SrcTraceIt++); + // Our reservoir is full, we need to sample the source stream + llvm::shuffle(SrcTraceIt, SrcTraces.end(), RNG); + for (uint64_t I = TemporalProfTraces.size(); + I < SrcStreamSize && SrcTraceIt < SrcTraces.end(); I++) { + std::uniform_int_distribution<uint64_t> Distribution(0, I); uint64_t RandomIndex = Distribution(RNG); if (RandomIndex < TemporalProfTraces.size()) - IndicesToReplace.insert(RandomIndex); - ++TemporalProfTraceStreamSize; + TemporalProfTraces[RandomIndex] = *SrcTraceIt++; } - // Then we insert a random sample of the source traces. - llvm::shuffle(SrcTraces.begin(), SrcTraces.end(), RNG); - for (const auto &[Index, Trace] : llvm::zip(IndicesToReplace, SrcTraces)) - TemporalProfTraces[Index] = std::move(Trace); + TemporalProfTraceStreamSize += SrcStreamSize; } void InstrProfWriter::mergeRecordsFromWriter(InstrProfWriter &&IPW, diff --git a/llvm/lib/SandboxIR/Context.cpp b/llvm/lib/SandboxIR/Context.cpp index fe34037..70ac68a 100644 --- a/llvm/lib/SandboxIR/Context.cpp +++ b/llvm/lib/SandboxIR/Context.cpp @@ -256,6 +256,7 @@ Value *Context::getOrCreateValueInternal(llvm::Value *LLVMV, llvm::User *U) { case llvm::Instruction::FPToUI: case llvm::Instruction::FPToSI: case llvm::Instruction::FPExt: + case llvm::Instruction::PtrToAddr: case llvm::Instruction::PtrToInt: case llvm::Instruction::IntToPtr: case llvm::Instruction::SIToFP: diff --git a/llvm/lib/SandboxIR/Instruction.cpp b/llvm/lib/SandboxIR/Instruction.cpp index 956047c..1a81d18 100644 --- a/llvm/lib/SandboxIR/Instruction.cpp +++ b/llvm/lib/SandboxIR/Instruction.cpp @@ -1007,6 +1007,9 @@ static llvm::Instruction::CastOps getLLVMCastOp(Instruction::Opcode Opc) { return static_cast<llvm::Instruction::CastOps>(llvm::Instruction::FPToSI); case Instruction::Opcode::FPExt: return static_cast<llvm::Instruction::CastOps>(llvm::Instruction::FPExt); + case Instruction::Opcode::PtrToAddr: + return static_cast<llvm::Instruction::CastOps>( + llvm::Instruction::PtrToAddr); case Instruction::Opcode::PtrToInt: return static_cast<llvm::Instruction::CastOps>(llvm::Instruction::PtrToInt); case Instruction::Opcode::IntToPtr: diff --git a/llvm/lib/Support/APFloat.cpp b/llvm/lib/Support/APFloat.cpp index 3d688a1..d2a417f 100644 --- a/llvm/lib/Support/APFloat.cpp +++ b/llvm/lib/Support/APFloat.cpp @@ -5519,13 +5519,129 @@ APFloat::opStatus DoubleAPFloat::next(bool nextDown) { return opOK; } +APFloat::opStatus DoubleAPFloat::convertToSignExtendedInteger( + MutableArrayRef<integerPart> Input, unsigned int Width, bool IsSigned, + roundingMode RM, bool *IsExact) const { + assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics"); + + // If Hi is not finite, or Lo is zero, the value is entirely represented + // by Hi. Delegate to the simpler single-APFloat conversion. + if (!getFirst().isFiniteNonZero() || getSecond().isZero()) + return getFirst().convertToInteger(Input, Width, IsSigned, RM, IsExact); + + // First, round the full double-double value to an integral value. This + // simplifies the rest of the function, as we no longer need to consider + // fractional parts. 
+ *IsExact = false; + DoubleAPFloat Integral = *this; + const opStatus RoundStatus = Integral.roundToIntegral(RM); + if (RoundStatus == opInvalidOp) + return opInvalidOp; + const APFloat &IntegralHi = Integral.getFirst(); + const APFloat &IntegralLo = Integral.getSecond(); + + // If rounding results in either component being zero, the sum is trivial. + // Delegate to the simpler single-APFloat conversion. + bool HiIsExact; + if (IntegralHi.isZero() || IntegralLo.isZero()) { + const opStatus HiStatus = + IntegralHi.convertToInteger(Input, Width, IsSigned, RM, &HiIsExact); + // The conversion from an integer-valued float to an APInt may fail if the + // result would be out of range. Regardless, taking this path is only + // possible if rounding occurred during the initial `roundToIntegral`. + return HiStatus == opOK ? opInexact : HiStatus; + } + + // A negative number cannot be represented by an unsigned integer. + // Since a double-double is canonical, if Hi is negative, the sum is negative. + if (!IsSigned && IntegralHi.isNegative()) + return opInvalidOp; + + // Handle the special boundary case where |Hi| is exactly the power of two + // that marks the edge of the integer's range (e.g., 2^63 for int64_t). In + // this situation, Hi itself won't fit, but the sum Hi + Lo might. + // `PositiveOverflowWidth` is the bit number for this boundary (N-1 for + // signed, N for unsigned). + bool LoIsExact; + const int HiExactLog2 = IntegralHi.getExactLog2Abs(); + const unsigned PositiveOverflowWidth = IsSigned ? Width - 1 : Width; + if (HiExactLog2 >= 0 && + static_cast<unsigned>(HiExactLog2) == PositiveOverflowWidth) { + // If Hi and Lo have the same sign, |Hi + Lo| > |Hi|, so the sum is + // guaranteed to overflow. E.g., for uint128_t, (2^128, 1) overflows. + if (IntegralHi.isNegative() == IntegralLo.isNegative()) + return opInvalidOp; + + // If the signs differ, the sum will fit. We can compute the result using + // properties of two's complement arithmetic without a wide intermediate + // integer. E.g., for uint128_t, (2^128, -1) should be 2^128 - 1. + const opStatus LoStatus = IntegralLo.convertToInteger( + Input, Width, /*IsSigned=*/true, RM, &LoIsExact); + if (LoStatus == opInvalidOp) + return opInvalidOp; + + // Adjust the bit pattern of Lo to account for Hi's value: + // - For unsigned (Hi=2^Width): `2^Width + Lo` in `Width`-bit + // arithmetic is equivalent to just `Lo`. The conversion of `Lo` above + // already produced the correct final bit pattern. + // - For signed (Hi=2^(Width-1)): The sum `2^(Width-1) + Lo` (where Lo<0) + // can be computed by taking the two's complement pattern for `Lo` and + // clearing the sign bit. + if (IsSigned && !IntegralHi.isNegative()) + APInt::tcClearBit(Input.data(), PositiveOverflowWidth); + *IsExact = RoundStatus == opOK; + return RoundStatus; + } + + // Convert Hi into an integer. This may not fit but that is OK: we know that + // Hi + Lo would not fit either in this situation. + const opStatus HiStatus = IntegralHi.convertToInteger( + Input, Width, IsSigned, rmTowardZero, &HiIsExact); + if (HiStatus == opInvalidOp) + return HiStatus; + + // Convert Lo into a temporary integer of the same width. + APSInt LoResult{Width, /*isUnsigned=*/!IsSigned}; + const opStatus LoStatus = + IntegralLo.convertToInteger(LoResult, rmTowardZero, &LoIsExact); + if (LoStatus == opInvalidOp) + return LoStatus; + + // Add Lo to Hi. This addition is guaranteed not to overflow because of the + // double-double canonicalization rule (`|Lo| <= ulp(Hi)/2`). 
The only case + // where the sum could cross the integer type's boundary is when Hi is a + // power of two, which is handled by the special case block above. + APInt::tcAdd(Input.data(), LoResult.getRawData(), /*carry=*/0, Input.size()); + + *IsExact = RoundStatus == opOK; + return RoundStatus; +} + APFloat::opStatus DoubleAPFloat::convertToInteger(MutableArrayRef<integerPart> Input, unsigned int Width, bool IsSigned, roundingMode RM, bool *IsExact) const { - assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics"); - return APFloat(semPPCDoubleDoubleLegacy, bitcastToAPInt()) - .convertToInteger(Input, Width, IsSigned, RM, IsExact); + opStatus FS = + convertToSignExtendedInteger(Input, Width, IsSigned, RM, IsExact); + + if (FS == opInvalidOp) { + const unsigned DstPartsCount = partCountForBits(Width); + assert(DstPartsCount <= Input.size() && "Integer too big"); + + unsigned Bits; + if (getCategory() == fcNaN) + Bits = 0; + else if (isNegative()) + Bits = IsSigned; + else + Bits = Width - IsSigned; + + tcSetLeastSignificantBits(Input.data(), DstPartsCount, Bits); + if (isNegative() && IsSigned) + APInt::tcShiftLeft(Input.data(), DstPartsCount, Width - 1); + } + + return FS; } APFloat::opStatus DoubleAPFloat::convertFromAPInt(const APInt &Input, @@ -5626,14 +5742,30 @@ bool DoubleAPFloat::getExactInverse(APFloat *inv) const { return Ret; } -int DoubleAPFloat::getExactLog2() const { - // TODO: Implement me - return INT_MIN; -} - int DoubleAPFloat::getExactLog2Abs() const { - // TODO: Implement me - return INT_MIN; + // In order for Hi + Lo to be a power of two, the following must be true: + // 1. Hi must be a power of two. + // 2. Lo must be zero. + if (getSecond().isNonZero()) + return INT_MIN; + return getFirst().getExactLog2Abs(); +} + +int ilogb(const DoubleAPFloat &Arg) { + const APFloat &Hi = Arg.getFirst(); + const APFloat &Lo = Arg.getSecond(); + int IlogbResult = ilogb(Hi); + // Zero and non-finite values can delegate to ilogb(Hi). + if (Arg.getCategory() != fcNormal) + return IlogbResult; + // If Lo can't change the binade, we can delegate to ilogb(Hi). + if (Lo.isZero() || Hi.isNegative() == Lo.isNegative()) + return IlogbResult; + if (Hi.getExactLog2Abs() == INT_MIN) + return IlogbResult; + // Numbers of the form 2^a - 2^b or -2^a + 2^b are almost powers of two but + // get nudged out of the binade by the low component. + return IlogbResult - 1; } DoubleAPFloat scalbn(const DoubleAPFloat &Arg, int Exp, @@ -5646,10 +5778,101 @@ DoubleAPFloat scalbn(const DoubleAPFloat &Arg, int Exp, DoubleAPFloat frexp(const DoubleAPFloat &Arg, int &Exp, APFloat::roundingMode RM) { assert(Arg.Semantics == &semPPCDoubleDouble && "Unexpected Semantics"); - APFloat First = frexp(Arg.Floats[0], Exp, RM); - APFloat Second = Arg.Floats[1]; - if (Arg.getCategory() == APFloat::fcNormal) - Second = scalbn(Second, -Exp, RM); + + // Get the unbiased exponent e of the number, where |Arg| = m * 2^e for m in + // [1.0, 2.0). + Exp = ilogb(Arg); + + // For NaNs, quiet any signaling NaN and return the result, as per standard + // practice. + if (Exp == APFloat::IEK_NaN) { + DoubleAPFloat Quiet{Arg}; + Quiet.getFirst().makeQuiet(); + return Quiet; + } + + // For infinity, return it unchanged. The exponent remains IEK_Inf. + if (Exp == APFloat::IEK_Inf) + return Arg; + + // For zero, the fraction is zero and the standard requires the exponent be 0. 
+ if (Exp == APFloat::IEK_Zero) { + Exp = 0; + return Arg; + } + + const APFloat &Hi = Arg.getFirst(); + const APFloat &Lo = Arg.getSecond(); + + // frexp requires the fraction's absolute value to be in [0.5, 1.0). + // ilogb provides an exponent for an absolute value in [1.0, 2.0). + // Increment the exponent to ensure the fraction is in the correct range. + ++Exp; + + const bool SignsDisagree = Hi.isNegative() != Lo.isNegative(); + APFloat Second = Lo; + if (Arg.getCategory() == APFloat::fcNormal && Lo.isFiniteNonZero()) { + roundingMode LoRoundingMode; + // The interpretation of rmTowardZero depends on the sign of the combined + // Arg rather than the sign of the component. + if (RM == rmTowardZero) + LoRoundingMode = Arg.isNegative() ? rmTowardPositive : rmTowardNegative; + // For rmNearestTiesToAway, we face a similar problem. If signs disagree, + // Lo is a correction *toward* zero relative to Hi. Rounding Lo + // "away from zero" based on its own sign would move the value in the + // wrong direction. As a safe proxy, we use rmNearestTiesToEven, which is + // direction-agnostic. We only need to bother with this if Lo is scaled + // down. + else if (RM == rmNearestTiesToAway && SignsDisagree && Exp > 0) + LoRoundingMode = rmNearestTiesToEven; + else + LoRoundingMode = RM; + Second = scalbn(Lo, -Exp, LoRoundingMode); + // The rmNearestTiesToEven proxy is correct most of the time, but it + // differs from rmNearestTiesToAway when the scaled value of Lo is an + // exact midpoint. + // NOTE: This is morally equivalent to roundTiesTowardZero. + if (RM == rmNearestTiesToAway && LoRoundingMode == rmNearestTiesToEven) { + // Re-scale the result back to check if rounding occurred. + const APFloat RecomposedLo = scalbn(Second, Exp, rmNearestTiesToEven); + if (RecomposedLo != Lo) { + // RoundingError tells us which direction we rounded: + // - RoundingError > 0: we rounded up. + // - RoundingError < 0: we rounded down. + const APFloat RoundingError = RecomposedLo - Lo; + // Determine if scalbn(Lo, -Exp) landed exactly on a midpoint. + // We do this by checking if the absolute rounding error is exactly + // half a ULP of the result. + const APFloat UlpOfSecond = harrisonUlp(Second); + const APFloat ScaledUlpOfSecond = + scalbn(UlpOfSecond, Exp - 1, rmNearestTiesToEven); + const bool IsMidpoint = abs(RoundingError) == ScaledUlpOfSecond; + const bool RoundedLoAway = + Second.isNegative() == RoundingError.isNegative(); + // The signs of Hi and Lo disagree and we rounded Lo away: we must + // decrease the magnitude of Second to increase the magnitude of + // First+Second. + if (IsMidpoint && RoundedLoAway) + Second.next(/*nextDown=*/!Second.isNegative()); + } + } + // Handle a tricky edge case where Arg is slightly less than a power of two + // (e.g., Arg = 2^k - epsilon). In this situation: + // 1. Hi is 2^k, and Lo is a small negative value -epsilon. + // 2. ilogb(Arg) correctly returns k-1. + // 3. Our initial Exp becomes (k-1) + 1 = k. + // 4. Scaling Hi (2^k) by 2^-k would yield a magnitude of 1.0 and + // scaling Lo by 2^-k would yield zero. This would make the result 1.0 + // which is an invalid fraction, as the required interval is [0.5, 1.0). + // We detect this specific case by checking if Hi is a power of two and if + // the scaled Lo underflowed to zero. The fix: Increment Exp to k+1. This + // adjusts the scale factor, causing Hi to be scaled to 0.5, which is a + valid fraction.
+ if (Second.isZero() && SignsDisagree && Hi.getExactLog2Abs() != INT_MIN) + ++Exp; + } + + APFloat First = scalbn(Hi, -Exp, RM); return DoubleAPFloat(semPPCDoubleDouble, std::move(First), std::move(Second)); } @@ -5749,10 +5972,6 @@ void APFloat::Profile(FoldingSetNodeID &NID) const { NID.Add(bitcastToAPInt()); } -/* Same as convertToInteger(integerPart*, ...), except the result is returned in - an APSInt, whose initial bit-width and signed-ness are used to determine the - precision of the conversion. - */ APFloat::opStatus APFloat::convertToInteger(APSInt &result, roundingMode rounding_mode, bool *isExact) const { diff --git a/llvm/lib/Support/APInt.cpp b/llvm/lib/Support/APInt.cpp index 954af7f..0c0e1d0 100644 --- a/llvm/lib/Support/APInt.cpp +++ b/llvm/lib/Support/APInt.cpp @@ -1377,7 +1377,7 @@ static void KnuthDiv(uint32_t *u, uint32_t *v, uint32_t *q, uint32_t* r, // the true value, and a "borrow" to the left should be remembered. int64_t borrow = 0; for (unsigned i = 0; i < n; ++i) { - uint64_t p = uint64_t(qp) * uint64_t(v[i]); + uint64_t p = qp * uint64_t(v[i]); int64_t subres = int64_t(u[j+i]) - borrow - Lo_32(p); u[j+i] = Lo_32(subres); borrow = Hi_32(p) - Hi_32(subres); @@ -3136,6 +3136,22 @@ APInt APIntOps::mulhu(const APInt &C1, const APInt &C2) { return (C1Ext * C2Ext).extractBits(C1.getBitWidth(), C1.getBitWidth()); } +APInt APIntOps::mulsExtended(const APInt &C1, const APInt &C2) { + assert(C1.getBitWidth() == C2.getBitWidth() && "Unequal bitwidths"); + unsigned FullWidth = C1.getBitWidth() * 2; + APInt C1Ext = C1.sext(FullWidth); + APInt C2Ext = C2.sext(FullWidth); + return C1Ext * C2Ext; +} + +APInt APIntOps::muluExtended(const APInt &C1, const APInt &C2) { + assert(C1.getBitWidth() == C2.getBitWidth() && "Unequal bitwidths"); + unsigned FullWidth = C1.getBitWidth() * 2; + APInt C1Ext = C1.zext(FullWidth); + APInt C2Ext = C2.zext(FullWidth); + return C1Ext * C2Ext; +} + APInt APIntOps::pow(const APInt &X, int64_t N) { assert(N >= 0 && "negative exponents not supported."); APInt Acc = APInt(X.getBitWidth(), 1); diff --git a/llvm/lib/Support/CMakeLists.txt b/llvm/lib/Support/CMakeLists.txt index 10b6101..b7578dd 100644 --- a/llvm/lib/Support/CMakeLists.txt +++ b/llvm/lib/Support/CMakeLists.txt @@ -182,6 +182,7 @@ add_llvm_component_library(LLVMSupport DivisionByConstantInfo.cpp DAGDeltaAlgorithm.cpp DJB.cpp + DXILABI.cpp DynamicAPInt.cpp ELFAttributes.cpp ELFAttrParserCompact.cpp diff --git a/llvm/lib/Support/DXILABI.cpp b/llvm/lib/Support/DXILABI.cpp new file mode 100644 index 0000000..082e320 --- /dev/null +++ b/llvm/lib/Support/DXILABI.cpp @@ -0,0 +1,33 @@ +//===-- DXILABI.cpp - ABI Sensitive Values for DXIL -----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains definitions of various constants and enums that are
+// required to remain stable as per the DXIL format's requirements.
+//
+// Documentation for DXIL can be found in
+// https://github.com/Microsoft/DirectXShaderCompiler/blob/main/docs/DXIL.rst.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/DXILABI.h"
+#include "llvm/Support/ErrorHandling.h"
+using namespace llvm;
+
+StringRef dxil::getResourceClassName(dxil::ResourceClass RC) {
+ switch (RC) {
+ case dxil::ResourceClass::SRV:
+ return "SRV";
+ case dxil::ResourceClass::UAV:
+ return "UAV";
+ case dxil::ResourceClass::CBuffer:
+ return "CBV";
+ case dxil::ResourceClass::Sampler:
+ return "Sampler";
+ }
+ llvm_unreachable("Invalid ResourceClass enum value");
+}
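The new getResourceClassName above uses an exhaustive switch with no default case, followed by llvm_unreachable, so -Wswitch can still flag any ResourceClass enumerator added later without a name, while the trailing unreachable catches out-of-range values at run time. Below is a minimal standalone illustration of that same pattern; the types and the assert/abort fallback are stand-ins, not the LLVM helper or llvm_unreachable itself.

#include <cassert>
#include <cstdio>
#include <cstdlib>

enum class ResourceClass { SRV, UAV, CBuffer, Sampler };

static const char *resourceClassName(ResourceClass RC) {
  // No default case: the compiler can warn if an enumerator is unhandled.
  switch (RC) {
  case ResourceClass::SRV:
    return "SRV";
  case ResourceClass::UAV:
    return "UAV";
  case ResourceClass::CBuffer:
    return "CBV";
  case ResourceClass::Sampler:
    return "Sampler";
  }
  // Reached only for a value outside the enumeration, mirroring the
  // llvm_unreachable after the fully covered switch above.
  assert(false && "invalid ResourceClass enum value");
  std::abort();
}

int main() {
  std::printf("%s %s\n", resourceClassName(ResourceClass::UAV),
              resourceClassName(ResourceClass::CBuffer));
  return 0;
}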
diff --git a/llvm/lib/Support/KnownBits.cpp b/llvm/lib/Support/KnownBits.cpp index 94a04ab..bd08365 100644 --- a/llvm/lib/Support/KnownBits.cpp +++ b/llvm/lib/Support/KnownBits.cpp @@ -888,11 +888,19 @@ KnownBits KnownBits::mul(const KnownBits &LHS, const KnownBits &RHS, Res.Zero |= (~BottomKnown).getLoBits(ResultBitsKnown); Res.One = BottomKnown.getLoBits(ResultBitsKnown); - // If we're self-multiplying then bit[1] is guaranteed to be zero. - if (NoUndefSelfMultiply && BitWidth > 1) { - assert(Res.One[1] == 0 && - "Self-multiplication failed Quadratic Reciprocity!"); - Res.Zero.setBit(1); + if (NoUndefSelfMultiply) { + // If X has at least TZ trailing zeroes, then bit (2 * TZ + 1) must be zero. + unsigned TwoTZP1 = 2 * TrailZero0 + 1; + if (TwoTZP1 < BitWidth) + Res.Zero.setBit(TwoTZP1); + + // If X has exactly TZ trailing zeros, then bit (2 * TZ + 2) must also be + // zero. + if (TrailZero0 < BitWidth && LHS.One[TrailZero0]) { + unsigned TwoTZP2 = TwoTZP1 + 1; + if (TwoTZP2 < BitWidth) + Res.Zero.setBit(TwoTZP2); + } } return Res; diff --git a/llvm/lib/Support/MemoryBuffer.cpp b/llvm/lib/Support/MemoryBuffer.cpp index 601f11f..1c4645a 100644 --- a/llvm/lib/Support/MemoryBuffer.cpp +++ b/llvm/lib/Support/MemoryBuffer.cpp @@ -501,8 +501,14 @@ getOpenFileImpl(sys::fs::file_t FD, const Twine &Filename, uint64_t FileSize, std::unique_ptr<MB> Result( new (NamedBufferAlloc(Filename)) MemoryBufferMMapFile<MB>( RequiresNullTerminator, FD, MapSize, Offset, EC)); - if (!EC) - return std::move(Result); + if (!EC) { + // On at least Linux, and possibly on other systems, mmap may return pages + // from the page cache that are not properly filled with trailing zeroes, + // if some prior user of the page wrote non-zero bytes. Detect this and + // don't use mmap in that case. + if (!RequiresNullTerminator || *Result->getBufferEnd() == '\0') + return std::move(Result); + } } #ifdef __MVS__ diff --git a/llvm/lib/Support/SmallPtrSet.cpp b/llvm/lib/Support/SmallPtrSet.cpp index 83143a7..a602165 100644 --- a/llvm/lib/Support/SmallPtrSet.cpp +++ b/llvm/lib/Support/SmallPtrSet.cpp @@ -13,6 +13,7 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/DenseMapInfo.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/MemAlloc.h" #include <algorithm> @@ -28,7 +29,7 @@ void SmallPtrSetImplBase::shrink_and_clear() { // Reduce the number of buckets. unsigned Size = size(); CurArraySize = Size > 16 ? 1 << (Log2_32_Ceil(Size) + 1) : 32; - NumNonEmpty = NumTombstones = 0; + NumEntries = NumTombstones = 0; // Install the new array. Clear all the buckets to empty. CurArray = (const void**)safe_malloc(sizeof(void*) * CurArraySize); @@ -41,7 +42,8 @@ SmallPtrSetImplBase::insert_imp_big(const void *Ptr) { if (LLVM_UNLIKELY(size() * 4 >= CurArraySize * 3)) { // If more than 3/4 of the array is full, grow. Grow(CurArraySize < 64 ? 128 : CurArraySize * 2); - } else if (LLVM_UNLIKELY(CurArraySize - NumNonEmpty < CurArraySize / 8)) { + } else if (LLVM_UNLIKELY(CurArraySize - NumEntries - NumTombstones < + CurArraySize / 8)) { // If fewer of 1/8 of the array is empty (meaning that many are filled with // tombstones), rehash. Grow(CurArraySize); @@ -55,8 +57,7 @@ SmallPtrSetImplBase::insert_imp_big(const void *Ptr) { // Otherwise, insert it! if (*Bucket == getTombstoneMarker()) --NumTombstones; - else - ++NumNonEmpty; // Track density. 
+ ++NumEntries; *Bucket = Ptr; incrementEpoch(); return std::make_pair(Bucket, true); @@ -110,8 +111,7 @@ const void *const *SmallPtrSetImplBase::FindBucketFor(const void *Ptr) const { /// Grow - Allocate a larger backing store for the buckets and move it over. /// void SmallPtrSetImplBase::Grow(unsigned NewSize) { - const void **OldBuckets = CurArray; - const void **OldEnd = EndPointer(); + auto OldBuckets = buckets(); bool WasSmall = isSmall(); // Install the new array. Clear all the buckets to empty. @@ -123,16 +123,14 @@ void SmallPtrSetImplBase::Grow(unsigned NewSize) { memset(CurArray, -1, NewSize*sizeof(void*)); // Copy over all valid entries. - for (const void **BucketPtr = OldBuckets; BucketPtr != OldEnd; ++BucketPtr) { + for (const void *&Bucket : OldBuckets) { // Copy over the element if it is valid. - const void *Elt = *BucketPtr; - if (Elt != getTombstoneMarker() && Elt != getEmptyMarker()) - *const_cast<void**>(FindBucketFor(Elt)) = const_cast<void*>(Elt); + if (Bucket != getTombstoneMarker() && Bucket != getEmptyMarker()) + *const_cast<void **>(FindBucketFor(Bucket)) = const_cast<void *>(Bucket); } if (!WasSmall) - free(OldBuckets); - NumNonEmpty -= NumTombstones; + free(OldBuckets.begin()); NumTombstones = 0; IsSmall = false; } @@ -193,9 +191,9 @@ void SmallPtrSetImplBase::copyHelper(const SmallPtrSetImplBase &RHS) { CurArraySize = RHS.CurArraySize; // Copy over the contents from the other set - std::copy(RHS.CurArray, RHS.EndPointer(), CurArray); + llvm::copy(RHS.buckets(), CurArray); - NumNonEmpty = RHS.NumNonEmpty; + NumEntries = RHS.NumEntries; NumTombstones = RHS.NumTombstones; } @@ -217,7 +215,7 @@ void SmallPtrSetImplBase::moveHelper(const void **SmallStorage, if (RHS.isSmall()) { // Copy a small RHS rather than moving. CurArray = SmallStorage; - std::copy(RHS.CurArray, RHS.CurArray + RHS.NumNonEmpty, CurArray); + llvm::copy(RHS.small_buckets(), CurArray); } else { CurArray = RHS.CurArray; RHS.CurArray = RHSSmallStorage; @@ -225,13 +223,13 @@ void SmallPtrSetImplBase::moveHelper(const void **SmallStorage, // Copy the rest of the trivial members. CurArraySize = RHS.CurArraySize; - NumNonEmpty = RHS.NumNonEmpty; + NumEntries = RHS.NumEntries; NumTombstones = RHS.NumTombstones; IsSmall = RHS.IsSmall; // Make the RHS small and empty. RHS.CurArraySize = SmallSize; - RHS.NumNonEmpty = 0; + RHS.NumEntries = 0; RHS.NumTombstones = 0; RHS.IsSmall = true; } @@ -245,54 +243,42 @@ void SmallPtrSetImplBase::swap(const void **SmallStorage, if (!this->isSmall() && !RHS.isSmall()) { std::swap(this->CurArray, RHS.CurArray); std::swap(this->CurArraySize, RHS.CurArraySize); - std::swap(this->NumNonEmpty, RHS.NumNonEmpty); + std::swap(this->NumEntries, RHS.NumEntries); std::swap(this->NumTombstones, RHS.NumTombstones); return; } // FIXME: From here on we assume that both sets have the same small size. - // If only RHS is small, copy the small elements into LHS and move the pointer - // from LHS to RHS. - if (!this->isSmall() && RHS.isSmall()) { - std::copy(RHS.CurArray, RHS.CurArray + RHS.NumNonEmpty, SmallStorage); - std::swap(RHS.CurArraySize, this->CurArraySize); - std::swap(this->NumNonEmpty, RHS.NumNonEmpty); + // Both a small, just swap the small elements. 
+ if (this->isSmall() && RHS.isSmall()) { + unsigned MinEntries = std::min(this->NumEntries, RHS.NumEntries); + std::swap_ranges(this->CurArray, this->CurArray + MinEntries, RHS.CurArray); + if (this->NumEntries > MinEntries) { + std::copy(this->CurArray + MinEntries, this->CurArray + this->NumEntries, + RHS.CurArray + MinEntries); + } else { + std::copy(RHS.CurArray + MinEntries, RHS.CurArray + RHS.NumEntries, + this->CurArray + MinEntries); + } + assert(this->CurArraySize == RHS.CurArraySize); + std::swap(this->NumEntries, RHS.NumEntries); std::swap(this->NumTombstones, RHS.NumTombstones); - RHS.CurArray = this->CurArray; - RHS.IsSmall = false; - this->CurArray = SmallStorage; - this->IsSmall = true; return; } - // If only LHS is small, copy the small elements into RHS and move the pointer - // from RHS to LHS. - if (this->isSmall() && !RHS.isSmall()) { - std::copy(this->CurArray, this->CurArray + this->NumNonEmpty, - RHSSmallStorage); - std::swap(RHS.CurArraySize, this->CurArraySize); - std::swap(RHS.NumNonEmpty, this->NumNonEmpty); - std::swap(RHS.NumTombstones, this->NumTombstones); - this->CurArray = RHS.CurArray; - this->IsSmall = false; - RHS.CurArray = RHSSmallStorage; - RHS.IsSmall = true; - return; - } - - // Both a small, just swap the small elements. - assert(this->isSmall() && RHS.isSmall()); - unsigned MinNonEmpty = std::min(this->NumNonEmpty, RHS.NumNonEmpty); - std::swap_ranges(this->CurArray, this->CurArray + MinNonEmpty, RHS.CurArray); - if (this->NumNonEmpty > MinNonEmpty) { - std::copy(this->CurArray + MinNonEmpty, this->CurArray + this->NumNonEmpty, - RHS.CurArray + MinNonEmpty); - } else { - std::copy(RHS.CurArray + MinNonEmpty, RHS.CurArray + RHS.NumNonEmpty, - this->CurArray + MinNonEmpty); - } - assert(this->CurArraySize == RHS.CurArraySize); - std::swap(this->NumNonEmpty, RHS.NumNonEmpty); - std::swap(this->NumTombstones, RHS.NumTombstones); + // If only one side is small, copy the small elements into the large side and + // move the pointer from the large side to the small side. + SmallPtrSetImplBase &SmallSide = this->isSmall() ? *this : RHS; + SmallPtrSetImplBase &LargeSide = this->isSmall() ? RHS : *this; + const void **LargeSideInlineStorage = + this->isSmall() ? 
RHSSmallStorage : SmallStorage; + llvm::copy(SmallSide.small_buckets(), LargeSideInlineStorage); + std::swap(LargeSide.CurArraySize, SmallSide.CurArraySize); + std::swap(LargeSide.NumEntries, SmallSide.NumEntries); + std::swap(LargeSide.NumTombstones, SmallSide.NumTombstones); + SmallSide.CurArray = LargeSide.CurArray; + SmallSide.IsSmall = false; + LargeSide.CurArray = LargeSideInlineStorage; + LargeSide.IsSmall = true; } diff --git a/llvm/lib/Support/Unix/Path.inc b/llvm/lib/Support/Unix/Path.inc index cc02cae..31fb1e8 100644 --- a/llvm/lib/Support/Unix/Path.inc +++ b/llvm/lib/Support/Unix/Path.inc @@ -876,6 +876,12 @@ void mapped_file_region::unmapImpl() { ::munmap(Mapping, Size); } +std::error_code mapped_file_region::sync() const { + if (int Res = ::msync(Mapping, Size, MS_SYNC)) + return std::error_code(Res, std::generic_category()); + return std::error_code(); +} + void mapped_file_region::dontNeedImpl() { assert(Mode == mapped_file_region::readonly); if (!Mapping) diff --git a/llvm/lib/Support/Windows/Path.inc b/llvm/lib/Support/Windows/Path.inc index fdf9d54..9001c19 100644 --- a/llvm/lib/Support/Windows/Path.inc +++ b/llvm/lib/Support/Windows/Path.inc @@ -1006,6 +1006,14 @@ void mapped_file_region::unmapImpl() { void mapped_file_region::dontNeedImpl() {} +std::error_code mapped_file_region::sync() const { + if (!::FlushViewOfFile(Mapping, Size)) + return mapWindowsError(GetLastError()); + if (!::FlushFileBuffers(FileHandle)) + return mapWindowsError(GetLastError()); + return std::error_code(); +} + int mapped_file_region::alignment() { SYSTEM_INFO SysInfo; ::GetSystemInfo(&SysInfo); diff --git a/llvm/lib/TableGen/Record.cpp b/llvm/lib/TableGen/Record.cpp index 3f318e2..67622a9 100644 --- a/llvm/lib/TableGen/Record.cpp +++ b/llvm/lib/TableGen/Record.cpp @@ -3064,11 +3064,11 @@ const Init *Record::getValueInit(StringRef FieldName) const { } StringRef Record::getValueAsString(StringRef FieldName) const { - std::optional<StringRef> S = getValueAsOptionalString(FieldName); - if (!S) - PrintFatalError(getLoc(), "Record `" + getName() + - "' does not have a field named `" + FieldName + "'!\n"); - return *S; + const Init *I = getValueInit(FieldName); + if (const auto *SI = dyn_cast<StringInit>(I)) + return SI->getValue(); + PrintFatalError(getLoc(), "Record `" + getName() + "', field `" + FieldName + + "' exists but does not have a string value"); } std::optional<StringRef> @@ -3088,24 +3088,16 @@ Record::getValueAsOptionalString(StringRef FieldName) const { } const BitsInit *Record::getValueAsBitsInit(StringRef FieldName) const { - const RecordVal *R = getValue(FieldName); - if (!R || !R->getValue()) - PrintFatalError(getLoc(), "Record `" + getName() + - "' does not have a field named `" + FieldName + "'!\n"); - - if (const auto *BI = dyn_cast<BitsInit>(R->getValue())) + const Init *I = getValueInit(FieldName); + if (const auto *BI = dyn_cast<BitsInit>(I)) return BI; PrintFatalError(getLoc(), "Record `" + getName() + "', field `" + FieldName + "' exists but does not have a bits value"); } const ListInit *Record::getValueAsListInit(StringRef FieldName) const { - const RecordVal *R = getValue(FieldName); - if (!R || !R->getValue()) - PrintFatalError(getLoc(), "Record `" + getName() + - "' does not have a field named `" + FieldName + "'!\n"); - - if (const auto *LI = dyn_cast<ListInit>(R->getValue())) + const Init *I = getValueInit(FieldName); + if (const auto *LI = dyn_cast<ListInit>(I)) return LI; PrintFatalError(getLoc(), "Record `" + getName() + "', field `" + FieldName + "' exists but 
does not have a list value"); @@ -3127,17 +3119,13 @@ Record::getValueAsListOfDefs(StringRef FieldName) const { } int64_t Record::getValueAsInt(StringRef FieldName) const { - const RecordVal *R = getValue(FieldName); - if (!R || !R->getValue()) - PrintFatalError(getLoc(), "Record `" + getName() + - "' does not have a field named `" + FieldName + "'!\n"); - - if (const auto *II = dyn_cast<IntInit>(R->getValue())) + const Init *I = getValueInit(FieldName); + if (const auto *II = dyn_cast<IntInit>(I)) return II->getValue(); - PrintFatalError(getLoc(), Twine("Record `") + getName() + "', field `" + - FieldName + - "' exists but does not have an int value: " + - R->getValue()->getAsString()); + PrintFatalError( + getLoc(), + Twine("Record `") + getName() + "', field `" + FieldName + + "' exists but does not have an int value: " + I->getAsString()); } std::vector<int64_t> @@ -3173,67 +3161,47 @@ Record::getValueAsListOfStrings(StringRef FieldName) const { } const Record *Record::getValueAsDef(StringRef FieldName) const { - const RecordVal *R = getValue(FieldName); - if (!R || !R->getValue()) - PrintFatalError(getLoc(), "Record `" + getName() + - "' does not have a field named `" + FieldName + "'!\n"); - - if (const auto *DI = dyn_cast<DefInit>(R->getValue())) + const Init *I = getValueInit(FieldName); + if (const auto *DI = dyn_cast<DefInit>(I)) return DI->getDef(); PrintFatalError(getLoc(), "Record `" + getName() + "', field `" + FieldName + "' does not have a def initializer!"); } const Record *Record::getValueAsOptionalDef(StringRef FieldName) const { - const RecordVal *R = getValue(FieldName); - if (!R || !R->getValue()) - PrintFatalError(getLoc(), "Record `" + getName() + - "' does not have a field named `" + FieldName + "'!\n"); - - if (const auto *DI = dyn_cast<DefInit>(R->getValue())) + const Init *I = getValueInit(FieldName); + if (const auto *DI = dyn_cast<DefInit>(I)) return DI->getDef(); - if (isa<UnsetInit>(R->getValue())) + if (isa<UnsetInit>(I)) return nullptr; PrintFatalError(getLoc(), "Record `" + getName() + "', field `" + FieldName + "' does not have either a def initializer or '?'!"); } bool Record::getValueAsBit(StringRef FieldName) const { - const RecordVal *R = getValue(FieldName); - if (!R || !R->getValue()) - PrintFatalError(getLoc(), "Record `" + getName() + - "' does not have a field named `" + FieldName + "'!\n"); - - if (const auto *BI = dyn_cast<BitInit>(R->getValue())) + const Init *I = getValueInit(FieldName); + if (const auto *BI = dyn_cast<BitInit>(I)) return BI->getValue(); PrintFatalError(getLoc(), "Record `" + getName() + "', field `" + FieldName + "' does not have a bit initializer!"); } bool Record::getValueAsBitOrUnset(StringRef FieldName, bool &Unset) const { - const RecordVal *R = getValue(FieldName); - if (!R || !R->getValue()) - PrintFatalError(getLoc(), "Record `" + getName() + - "' does not have a field named `" + FieldName.str() + "'!\n"); - - if (isa<UnsetInit>(R->getValue())) { + const Init *I = getValueInit(FieldName); + if (isa<UnsetInit>(I)) { Unset = true; return false; } Unset = false; - if (const auto *BI = dyn_cast<BitInit>(R->getValue())) + if (const auto *BI = dyn_cast<BitInit>(I)) return BI->getValue(); PrintFatalError(getLoc(), "Record `" + getName() + "', field `" + FieldName + "' does not have a bit initializer!"); } const DagInit *Record::getValueAsDag(StringRef FieldName) const { - const RecordVal *R = getValue(FieldName); - if (!R || !R->getValue()) - PrintFatalError(getLoc(), "Record `" + getName() + - "' does not have a field 
named `" + FieldName + "'!\n"); - - if (const auto *DI = dyn_cast<DagInit>(R->getValue())) + const Init *I = getValueInit(FieldName); + if (const auto *DI = dyn_cast<DagInit>(I)) return DI; PrintFatalError(getLoc(), "Record `" + getName() + "', field `" + FieldName + "' does not have a dag initializer!"); diff --git a/llvm/lib/Target/AArch64/AArch64.h b/llvm/lib/Target/AArch64/AArch64.h index 5496ebd..8d0ff41 100644 --- a/llvm/lib/Target/AArch64/AArch64.h +++ b/llvm/lib/Target/AArch64/AArch64.h @@ -60,6 +60,7 @@ FunctionPass *createAArch64CleanupLocalDynamicTLSPass(); FunctionPass *createAArch64CollectLOHPass(); FunctionPass *createSMEABIPass(); FunctionPass *createSMEPeepholeOptPass(); +FunctionPass *createMachineSMEABIPass(); ModulePass *createSVEIntrinsicOptsPass(); InstructionSelector * createAArch64InstructionSelector(const AArch64TargetMachine &, @@ -111,6 +112,7 @@ void initializeFalkorMarkStridedAccessesLegacyPass(PassRegistry&); void initializeLDTLSCleanupPass(PassRegistry&); void initializeSMEABIPass(PassRegistry &); void initializeSMEPeepholeOptPass(PassRegistry &); +void initializeMachineSMEABIPass(PassRegistry &); void initializeSVEIntrinsicOptsPass(PassRegistry &); void initializeAArch64Arm64ECCallLoweringPass(PassRegistry &); } // end namespace llvm diff --git a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp index ad8368e..1169f26 100644 --- a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp @@ -316,6 +316,12 @@ ThunkArgInfo AArch64Arm64ECCallLowering::canonicalizeThunkType( ThunkArgTranslation::PointerIndirection}; }; + if (T->isHalfTy()) { + // Prefix with `llvm` since MSVC doesn't specify `_Float16` + Out << "__llvm_h__"; + return direct(T); + } + if (T->isFloatTy()) { Out << "f"; return direct(T); @@ -327,8 +333,8 @@ ThunkArgInfo AArch64Arm64ECCallLowering::canonicalizeThunkType( } if (T->isFloatingPointTy()) { - report_fatal_error( - "Only 32 and 64 bit floating points are supported for ARM64EC thunks"); + report_fatal_error("Only 16, 32, and 64 bit floating points are supported " + "for ARM64EC thunks"); } auto &DL = M->getDataLayout(); @@ -342,8 +348,16 @@ ThunkArgInfo AArch64Arm64ECCallLowering::canonicalizeThunkType( uint64_t ElementCnt = T->getArrayNumElements(); uint64_t ElementSizePerBytes = DL.getTypeSizeInBits(ElementTy) / 8; uint64_t TotalSizeBytes = ElementCnt * ElementSizePerBytes; - if (ElementTy->isFloatTy() || ElementTy->isDoubleTy()) { - Out << (ElementTy->isFloatTy() ? 
"F" : "D") << TotalSizeBytes; + if (ElementTy->isHalfTy() || ElementTy->isFloatTy() || + ElementTy->isDoubleTy()) { + if (ElementTy->isHalfTy()) + // Prefix with `llvm` since MSVC doesn't specify `_Float16` + Out << "__llvm_H__"; + else if (ElementTy->isFloatTy()) + Out << "F"; + else if (ElementTy->isDoubleTy()) + Out << "D"; + Out << TotalSizeBytes; if (Alignment.value() >= 16 && !Ret) Out << "a" << Alignment.value(); if (TotalSizeBytes <= 8) { @@ -355,8 +369,9 @@ ThunkArgInfo AArch64Arm64ECCallLowering::canonicalizeThunkType( return pointerIndirection(T); } } else if (T->isFloatingPointTy()) { - report_fatal_error("Only 32 and 64 bit floating points are supported for " - "ARM64EC thunks"); + report_fatal_error( + "Only 16, 32, and 64 bit floating points are supported " + "for ARM64EC thunks"); } } diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp b/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp index 787a1a8..cc46159 100644 --- a/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp +++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp @@ -75,8 +75,10 @@ static bool finishStackBlock(SmallVectorImpl<CCValAssign> &PendingMembers, auto &It = PendingMembers[0]; CCAssignFn *AssignFn = TLI->CCAssignFnForCall(State.getCallingConv(), /*IsVarArg=*/false); + // FIXME: Get the correct original type. + Type *OrigTy = EVT(It.getValVT()).getTypeForEVT(State.getContext()); if (AssignFn(It.getValNo(), It.getValVT(), It.getValVT(), CCValAssign::Full, - ArgFlags, State)) + ArgFlags, OrigTy, State)) llvm_unreachable("Call operand has unhandled type"); // Return the flags to how they were before. diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.h b/llvm/lib/Target/AArch64/AArch64CallingConvention.h index 63185a9..7105fa6 100644 --- a/llvm/lib/Target/AArch64/AArch64CallingConvention.h +++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.h @@ -18,52 +18,63 @@ namespace llvm { bool CC_AArch64_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &State); + Type *OrigTy, CCState &State); bool CC_AArch64_Arm64EC_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State); + ISD::ArgFlagsTy ArgFlags, Type *OrigTy, + CCState &State); bool CC_AArch64_Arm64EC_Thunk(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State); + ISD::ArgFlagsTy ArgFlags, Type *OrigTy, + CCState &State); bool CC_AArch64_Arm64EC_Thunk_Native(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State); + ISD::ArgFlagsTy ArgFlags, Type *OrigTy, + CCState &State); bool CC_AArch64_DarwinPCS_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State); + ISD::ArgFlagsTy ArgFlags, Type *OrigTy, + CCState &State); bool CC_AArch64_DarwinPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State); + ISD::ArgFlagsTy ArgFlags, Type *OrigTy, + CCState &State); bool CC_AArch64_DarwinPCS_ILP32_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, - CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State); + CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, Type *OrigTy, + CCState &State); bool CC_AArch64_Win64PCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &State); + Type *OrigTy, CCState &State); bool 
CC_AArch64_Win64_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State); + ISD::ArgFlagsTy ArgFlags, Type *OrigTy, + CCState &State); bool CC_AArch64_Win64_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State); + ISD::ArgFlagsTy ArgFlags, Type *OrigTy, + CCState &State); bool CC_AArch64_Arm64EC_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State); + ISD::ArgFlagsTy ArgFlags, Type *OrigTy, + CCState &State); bool CC_AArch64_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &State); + Type *OrigTy, CCState &State); bool CC_AArch64_Preserve_None(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State); + ISD::ArgFlagsTy ArgFlags, Type *OrigTy, + CCState &State); bool RetCC_AArch64_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &State); + Type *OrigTy, CCState &State); bool RetCC_AArch64_Arm64EC_Thunk(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State); + ISD::ArgFlagsTy ArgFlags, Type *OrigTy, + CCState &State); bool RetCC_AArch64_Arm64EC_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, + ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State); } // namespace llvm diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index 201bfe0..57dcd68 100644 --- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -92,8 +92,9 @@ private: bool expandCALL_BTI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI); bool expandStoreSwiftAsyncContext(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI); - MachineBasicBlock *expandRestoreZA(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI); + MachineBasicBlock * + expandCommitOrRestoreZASave(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI); MachineBasicBlock *expandCondSMToggle(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI); }; @@ -528,6 +529,11 @@ bool AArch64ExpandPseudo::expand_DestructiveOp( UseRev = true; } break; + case AArch64::Destructive2xRegImmUnpred: + // EXT_ZZI_CONSTRUCTIVE Zd, Zs, Imm + // ==> MOVPRFX Zd Zs; EXT_ZZI Zd, Zd, Zs, Imm + std::tie(DOPIdx, SrcIdx, Src2Idx) = std::make_tuple(1, 1, 2); + break; default: llvm_unreachable("Unsupported Destructive Operand type"); } @@ -548,6 +554,7 @@ bool AArch64ExpandPseudo::expand_DestructiveOp( break; case AArch64::DestructiveUnaryPassthru: case AArch64::DestructiveBinaryImm: + case AArch64::Destructive2xRegImmUnpred: DOPRegIsUnique = true; break; case AArch64::DestructiveTernaryCommWithRev: @@ -674,6 +681,11 @@ bool AArch64ExpandPseudo::expand_DestructiveOp( .add(MI.getOperand(SrcIdx)) .add(MI.getOperand(Src2Idx)); break; + case AArch64::Destructive2xRegImmUnpred: + DOP.addReg(MI.getOperand(DOPIdx).getReg(), DOPRegState) + .add(MI.getOperand(SrcIdx)) + .add(MI.getOperand(Src2Idx)); + break; } if (PRFX) { @@ -979,10 +991,15 @@ bool AArch64ExpandPseudo::expandStoreSwiftAsyncContext( return true; } -MachineBasicBlock * -AArch64ExpandPseudo::expandRestoreZA(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI) { +static constexpr 
unsigned ZERO_ALL_ZA_MASK = 0b11111111; + +MachineBasicBlock *AArch64ExpandPseudo::expandCommitOrRestoreZASave( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) { MachineInstr &MI = *MBBI; + bool IsRestoreZA = MI.getOpcode() == AArch64::RestoreZAPseudo; + assert((MI.getOpcode() == AArch64::RestoreZAPseudo || + MI.getOpcode() == AArch64::CommitZASavePseudo) && + "Expected ZA commit or restore"); assert((std::next(MBBI) != MBB.end() || MI.getParent()->successors().begin() != MI.getParent()->successors().end()) && @@ -990,21 +1007,23 @@ AArch64ExpandPseudo::expandRestoreZA(MachineBasicBlock &MBB, // Compare TPIDR2_EL0 value against 0. DebugLoc DL = MI.getDebugLoc(); - MachineInstrBuilder Cbz = BuildMI(MBB, MBBI, DL, TII->get(AArch64::CBZX)) - .add(MI.getOperand(0)); + MachineInstrBuilder Branch = + BuildMI(MBB, MBBI, DL, + TII->get(IsRestoreZA ? AArch64::CBZX : AArch64::CBNZX)) + .add(MI.getOperand(0)); // Split MBB and create two new blocks: // - MBB now contains all instructions before RestoreZAPseudo. - // - SMBB contains the RestoreZAPseudo instruction only. - // - EndBB contains all instructions after RestoreZAPseudo. + // - SMBB contains the [Commit|RestoreZA]Pseudo instruction only. + // - EndBB contains all instructions after [Commit|RestoreZA]Pseudo. MachineInstr &PrevMI = *std::prev(MBBI); MachineBasicBlock *SMBB = MBB.splitAt(PrevMI, /*UpdateLiveIns*/ true); MachineBasicBlock *EndBB = std::next(MI.getIterator()) == SMBB->end() ? *SMBB->successors().begin() : SMBB->splitAt(MI, /*UpdateLiveIns*/ true); - // Add the SMBB label to the TB[N]Z instruction & create a branch to EndBB. - Cbz.addMBB(SMBB); + // Add the SMBB label to the CB[N]Z instruction & create a branch to EndBB. + Branch.addMBB(SMBB); BuildMI(&MBB, DL, TII->get(AArch64::B)) .addMBB(EndBB); MBB.addSuccessor(EndBB); @@ -1012,11 +1031,30 @@ AArch64ExpandPseudo::expandRestoreZA(MachineBasicBlock &MBB, // Replace the pseudo with a call (BL). MachineInstrBuilder MIB = BuildMI(*SMBB, SMBB->end(), DL, TII->get(AArch64::BL)); - MIB.addReg(MI.getOperand(1).getReg(), RegState::Implicit); + // Copy operands (mainly the regmask) from the pseudo. for (unsigned I = 2; I < MI.getNumOperands(); ++I) MIB.add(MI.getOperand(I)); - BuildMI(SMBB, DL, TII->get(AArch64::B)).addMBB(EndBB); + if (IsRestoreZA) { + // Mark the TPIDR2 block pointer (X0) as an implicit use. + MIB.addReg(MI.getOperand(1).getReg(), RegState::Implicit); + } else /*CommitZA*/ { + [[maybe_unused]] auto *TRI = + MBB.getParent()->getSubtarget().getRegisterInfo(); + // Clear TPIDR2_EL0. + BuildMI(*SMBB, SMBB->end(), DL, TII->get(AArch64::MSR)) + .addImm(AArch64SysReg::TPIDR2_EL0) + .addReg(AArch64::XZR); + bool ZeroZA = MI.getOperand(1).getImm() != 0; + if (ZeroZA) { + assert(MI.definesRegister(AArch64::ZAB0, TRI) && "should define ZA!"); + BuildMI(*SMBB, SMBB->end(), DL, TII->get(AArch64::ZERO_M)) + .addImm(ZERO_ALL_ZA_MASK) + .addDef(AArch64::ZAB0, RegState::ImplicitDefine); + } + } + + BuildMI(SMBB, DL, TII->get(AArch64::B)).addMBB(EndBB); MI.eraseFromParent(); return EndBB; } @@ -1236,14 +1274,20 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, .add(MI.getOperand(3)); transferImpOps(MI, I, I); } else { + unsigned RegState = + getRenamableRegState(MI.getOperand(1).isRenamable()) | + getKillRegState( + MI.getOperand(1).isKill() && + MI.getOperand(1).getReg() != MI.getOperand(2).getReg() && + MI.getOperand(1).getReg() != MI.getOperand(3).getReg()); BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opcode == AArch64::BSPv8i8 ? 
AArch64::ORRv8i8 : AArch64::ORRv16i8)) .addReg(DstReg, RegState::Define | getRenamableRegState(MI.getOperand(0).isRenamable())) - .add(MI.getOperand(1)) - .add(MI.getOperand(1)); + .addReg(MI.getOperand(1).getReg(), RegState) + .addReg(MI.getOperand(1).getReg(), RegState); auto I2 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BSLv8i8 @@ -1629,8 +1673,9 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, return expandCALL_BTI(MBB, MBBI); case AArch64::StoreSwiftAsyncContext: return expandStoreSwiftAsyncContext(MBB, MBBI); + case AArch64::CommitZASavePseudo: case AArch64::RestoreZAPseudo: { - auto *NewMBB = expandRestoreZA(MBB, MBBI); + auto *NewMBB = expandCommitOrRestoreZASave(MBB, MBBI); if (NewMBB != &MBB) NextMBBI = MBB.end(); // The NextMBBI iterator is invalidated. return true; @@ -1641,6 +1686,8 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, NextMBBI = MBB.end(); // The NextMBBI iterator is invalidated. return true; } + case AArch64::InOutZAUsePseudo: + case AArch64::RequiresZASavePseudo: case AArch64::COALESCER_BARRIER_FPR16: case AArch64::COALESCER_BARRIER_FPR32: case AArch64::COALESCER_BARRIER_FPR64: diff --git a/llvm/lib/Target/AArch64/AArch64FastISel.cpp b/llvm/lib/Target/AArch64/AArch64FastISel.cpp index 9d74bb5..cf34498 100644 --- a/llvm/lib/Target/AArch64/AArch64FastISel.cpp +++ b/llvm/lib/Target/AArch64/AArch64FastISel.cpp @@ -267,7 +267,7 @@ private: private: CCAssignFn *CCAssignFnForCall(CallingConv::ID CC) const; bool processCallArgs(CallLoweringInfo &CLI, SmallVectorImpl<MVT> &ArgVTs, - unsigned &NumBytes); + SmallVectorImpl<Type *> &OrigTys, unsigned &NumBytes); bool finishCall(CallLoweringInfo &CLI, unsigned NumBytes); public: @@ -3011,11 +3011,13 @@ bool AArch64FastISel::fastLowerArguments() { bool AArch64FastISel::processCallArgs(CallLoweringInfo &CLI, SmallVectorImpl<MVT> &OutVTs, + SmallVectorImpl<Type *> &OrigTys, unsigned &NumBytes) { CallingConv::ID CC = CLI.CallConv; SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CC, false, *FuncInfo.MF, ArgLocs, *Context); - CCInfo.AnalyzeCallOperands(OutVTs, CLI.OutFlags, CCAssignFnForCall(CC)); + CCInfo.AnalyzeCallOperands(OutVTs, CLI.OutFlags, OrigTys, + CCAssignFnForCall(CC)); // Get a count of how many bytes are to be pushed on the stack. NumBytes = CCInfo.getStackSize(); @@ -3194,6 +3196,7 @@ bool AArch64FastISel::fastLowerCall(CallLoweringInfo &CLI) { // Set up the argument vectors. SmallVector<MVT, 16> OutVTs; + SmallVector<Type *, 16> OrigTys; OutVTs.reserve(CLI.OutVals.size()); for (auto *Val : CLI.OutVals) { @@ -3207,6 +3210,7 @@ bool AArch64FastISel::fastLowerCall(CallLoweringInfo &CLI) { return false; OutVTs.push_back(VT); + OrigTys.push_back(Val->getType()); } Address Addr; @@ -3222,7 +3226,7 @@ bool AArch64FastISel::fastLowerCall(CallLoweringInfo &CLI) { // Handle the arguments now that we've gotten them. unsigned NumBytes; - if (!processCallArgs(CLI, OutVTs, NumBytes)) + if (!processCallArgs(CLI, OutVTs, OrigTys, NumBytes)) return false; const AArch64RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); @@ -3574,12 +3578,8 @@ bool AArch64FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { Args.reserve(II->arg_size()); // Populate the argument list. 
- for (auto &Arg : II->args()) { - ArgListEntry Entry; - Entry.Val = Arg; - Entry.Ty = Arg->getType(); - Args.push_back(Entry); - } + for (auto &Arg : II->args()) + Args.emplace_back(Arg); CallLoweringInfo CLI; MCContext &Ctx = MF->getContext(); @@ -4870,12 +4870,8 @@ bool AArch64FastISel::selectFRem(const Instruction *I) { Args.reserve(I->getNumOperands()); // Populate the argument list. - for (auto &Arg : I->operands()) { - ArgListEntry Entry; - Entry.Val = Arg; - Entry.Ty = Arg->getType(); - Args.push_back(Entry); - } + for (auto &Arg : I->operands()) + Args.emplace_back(Arg); CallLoweringInfo CLI; MCContext &Ctx = MF->getContext(); diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index 885f2a9..fddde66 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -1475,24 +1475,26 @@ static bool requiresSaveVG(const MachineFunction &MF) { return true; } -bool isVGInstruction(MachineBasicBlock::iterator MBBI) { +static bool matchLibcall(const TargetLowering &TLI, const MachineOperand &MO, + RTLIB::Libcall LC) { + return MO.isSymbol() && + StringRef(TLI.getLibcallName(LC)) == MO.getSymbolName(); +} + +bool isVGInstruction(MachineBasicBlock::iterator MBBI, + const TargetLowering &TLI) { unsigned Opc = MBBI->getOpcode(); if (Opc == AArch64::CNTD_XPiI || Opc == AArch64::RDSVLI_XI || Opc == AArch64::UBFMXri) return true; - if (requiresGetVGCall(*MBBI->getMF())) { - if (Opc == AArch64::ORRXrr) - return true; + if (!requiresGetVGCall(*MBBI->getMF())) + return false; - if (Opc == AArch64::BL) { - auto Op1 = MBBI->getOperand(0); - return Op1.isSymbol() && - (StringRef(Op1.getSymbolName()) == "__arm_get_current_vg"); - } - } + if (Opc == AArch64::BL) + return matchLibcall(TLI, MBBI->getOperand(0), RTLIB::SMEABI_GET_CURRENT_VG); - return false; + return Opc == AArch64::ORRXrr; } // Convert callee-save register save/restore instruction to do stack pointer @@ -1511,9 +1513,11 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec( // functions, we need to do this for both the streaming and non-streaming // vector length. Move past these instructions if necessary. MachineFunction &MF = *MBB.getParent(); - if (requiresSaveVG(MF)) - while (isVGInstruction(MBBI)) + if (requiresSaveVG(MF)) { + auto &TLI = *MF.getSubtarget().getTargetLowering(); + while (isVGInstruction(MBBI, TLI)) ++MBBI; + } switch (MBBI->getOpcode()) { default: @@ -2097,11 +2101,12 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, // Move past the saves of the callee-saved registers, fixing up the offsets // and pre-inc if we decided to combine the callee-save and local stack // pointer bump above. + auto &TLI = *MF.getSubtarget().getTargetLowering(); while (MBBI != End && MBBI->getFlag(MachineInstr::FrameSetup) && !IsSVECalleeSave(MBBI)) { if (CombineSPBump && // Only fix-up frame-setup load/store instructions. 
- (!requiresSaveVG(MF) || !isVGInstruction(MBBI))) + (!requiresSaveVG(MF) || !isVGInstruction(MBBI, TLI))) fixupCalleeSaveRestoreStackOffset(*MBBI, AFI->getLocalStackSize(), NeedsWinCFI, &HasWinCFI); ++MBBI; @@ -3468,6 +3473,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const { MachineFunction &MF = *MBB.getParent(); + auto &TLI = *MF.getSubtarget<AArch64Subtarget>().getTargetLowering(); const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); bool NeedsWinCFI = needsWinCFI(MF); @@ -3581,11 +3587,11 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( .addReg(AArch64::X0, RegState::Implicit) .setMIFlag(MachineInstr::FrameSetup); - const uint32_t *RegMask = TRI->getCallPreservedMask( - MF, - CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1); + RTLIB::Libcall LC = RTLIB::SMEABI_GET_CURRENT_VG; + const uint32_t *RegMask = + TRI->getCallPreservedMask(MF, TLI.getLibcallCallingConv(LC)); BuildMI(MBB, MI, DL, TII.get(AArch64::BL)) - .addExternalSymbol("__arm_get_current_vg") + .addExternalSymbol(TLI.getLibcallName(LC)) .addRegMask(RegMask) .addReg(AArch64::X0, RegState::ImplicitDefine) .setMIFlag(MachineInstr::FrameSetup); diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index ad42f4b..bc786f4 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -7617,16 +7617,29 @@ bool AArch64DAGToDAGISel::SelectAnyPredicate(SDValue N) { bool AArch64DAGToDAGISel::SelectSMETileSlice(SDValue N, unsigned MaxSize, SDValue &Base, SDValue &Offset, unsigned Scale) { - // Try to untangle an ADD node into a 'reg + offset' - if (CurDAG->isBaseWithConstantOffset(N)) - if (auto C = dyn_cast<ConstantSDNode>(N.getOperand(1))) { + auto MatchConstantOffset = [&](SDValue CN) -> SDValue { + if (auto *C = dyn_cast<ConstantSDNode>(CN)) { int64_t ImmOff = C->getSExtValue(); - if ((ImmOff > 0 && ImmOff <= MaxSize && (ImmOff % Scale == 0))) { - Base = N.getOperand(0); - Offset = CurDAG->getTargetConstant(ImmOff / Scale, SDLoc(N), MVT::i64); - return true; - } + if ((ImmOff > 0 && ImmOff <= MaxSize && (ImmOff % Scale == 0))) + return CurDAG->getTargetConstant(ImmOff / Scale, SDLoc(N), MVT::i64); } + return SDValue(); + }; + + if (SDValue C = MatchConstantOffset(N)) { + Base = CurDAG->getConstant(0, SDLoc(N), MVT::i32); + Offset = C; + return true; + } + + // Try to untangle an ADD node into a 'reg + offset' + if (CurDAG->isBaseWithConstantOffset(N)) { + if (SDValue C = MatchConstantOffset(N.getOperand(1))) { + Base = N.getOperand(0); + Offset = C; + return true; + } + } // By default, just match reg + 0. 
Base = N; diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 3c06c6a..e896370 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -17,6 +17,7 @@ #include "AArch64PerfectShuffle.h" #include "AArch64RegisterInfo.h" #include "AArch64Subtarget.h" +#include "AArch64TargetMachine.h" #include "MCTargetDesc/AArch64AddressingModes.h" #include "Utils/AArch64BaseInfo.h" #include "Utils/AArch64SMEAttributes.h" @@ -1998,6 +1999,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(Op, MVT::f16, Promote); } +const AArch64TargetMachine &AArch64TargetLowering::getTM() const { + return static_cast<const AArch64TargetMachine &>(getTargetMachine()); +} + void AArch64TargetLowering::addTypeForNEON(MVT VT) { assert(VT.isVector() && "VT should be a vector type"); @@ -3083,13 +3088,12 @@ AArch64TargetLowering::EmitGetSMESaveSize(MachineInstr &MI, AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>(); const TargetInstrInfo *TII = Subtarget->getInstrInfo(); if (FuncInfo->isSMESaveBufferUsed()) { + RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE_SIZE; const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::BL)) - .addExternalSymbol("__arm_sme_state_size") + .addExternalSymbol(getLibcallName(LC)) .addReg(AArch64::X0, RegState::ImplicitDefine) - .addRegMask(TRI->getCallPreservedMask( - *MF, CallingConv:: - AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1)); + .addRegMask(TRI->getCallPreservedMask(*MF, getLibcallCallingConv(LC))); BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg()) .addReg(AArch64::X0); @@ -3101,6 +3105,30 @@ AArch64TargetLowering::EmitGetSMESaveSize(MachineInstr &MI, return BB; } +MachineBasicBlock * +AArch64TargetLowering::EmitEntryPStateSM(MachineInstr &MI, + MachineBasicBlock *BB) const { + MachineFunction *MF = BB->getParent(); + AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>(); + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + Register ResultReg = MI.getOperand(0).getReg(); + if (FuncInfo->isPStateSMRegUsed()) { + RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE; + const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); + BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::BL)) + .addExternalSymbol(getLibcallName(LC)) + .addReg(AArch64::X0, RegState::ImplicitDefine) + .addRegMask(TRI->getCallPreservedMask(*MF, getLibcallCallingConv(LC))); + BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), ResultReg) + .addReg(AArch64::X0); + } else { + assert(MI.getMF()->getRegInfo().use_empty(ResultReg) && + "Expected no users of the entry pstate.sm!"); + } + MI.eraseFromParent(); + return BB; +} + // Helper function to find the instruction that defined a virtual register. // If unable to find such instruction, returns nullptr. 
static const MachineInstr *stripVRegCopies(const MachineRegisterInfo &MRI, @@ -3216,6 +3244,8 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter( return EmitAllocateSMESaveBuffer(MI, BB); case AArch64::GetSMESaveSize: return EmitGetSMESaveSize(MI, BB); + case AArch64::EntryPStateSM: + return EmitEntryPStateSM(MI, BB); case AArch64::F128CSEL: return EmitF128CSEL(MI, BB); case TargetOpcode::STATEPOINT: @@ -3320,7 +3350,8 @@ static bool isZerosVector(const SDNode *N) { /// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64 /// CC -static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) { +static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC, + SDValue RHS = {}) { switch (CC) { default: llvm_unreachable("Unknown condition code!"); @@ -3331,9 +3362,9 @@ static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) { case ISD::SETGT: return AArch64CC::GT; case ISD::SETGE: - return AArch64CC::GE; + return (RHS && isNullConstant(RHS)) ? AArch64CC::PL : AArch64CC::GE; case ISD::SETLT: - return AArch64CC::LT; + return (RHS && isNullConstant(RHS)) ? AArch64CC::MI : AArch64CC::LT; case ISD::SETLE: return AArch64CC::LE; case ISD::SETUGT: @@ -3492,6 +3523,13 @@ bool isLegalCmpImmed(APInt C) { return isLegalArithImmed(C.abs().getZExtValue()); } +unsigned numberOfInstrToLoadImm(APInt C) { + uint64_t Imm = C.getZExtValue(); + SmallVector<AArch64_IMM::ImmInsnModel> Insn; + AArch64_IMM::expandMOVImm(Imm, 32, Insn); + return Insn.size(); +} + static bool isSafeSignedCMN(SDValue Op, SelectionDAG &DAG) { // 0 - INT_MIN sign wraps, so no signed wrap means cmn is safe. if (Op->getFlags().hasNoSignedWrap()) @@ -3782,7 +3820,7 @@ static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val, SDLoc DL(Val); // Determine OutCC and handle FP special case. if (isInteger) { - OutCC = changeIntCCToAArch64CC(CC); + OutCC = changeIntCCToAArch64CC(CC, RHS); } else { assert(LHS.getValueType().isFloatingPoint()); AArch64CC::CondCode ExtraCC; @@ -3961,6 +3999,7 @@ static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, // CC has already been adjusted. RHS = DAG.getConstant(0, DL, VT); } else if (!isLegalCmpImmed(C)) { + unsigned NumImmForC = numberOfInstrToLoadImm(C); // Constant does not fit, try adjusting it by one? switch (CC) { default: @@ -3969,43 +4008,49 @@ static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, case ISD::SETGE: if (!C.isMinSignedValue()) { APInt CMinusOne = C - 1; - if (isLegalCmpImmed(CMinusOne)) { + if (isLegalCmpImmed(CMinusOne) || + (NumImmForC > numberOfInstrToLoadImm(CMinusOne))) { CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT; RHS = DAG.getConstant(CMinusOne, DL, VT); } } break; case ISD::SETULT: - case ISD::SETUGE: - if (!C.isZero()) { - APInt CMinusOne = C - 1; - if (isLegalCmpImmed(CMinusOne)) { - CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT; - RHS = DAG.getConstant(CMinusOne, DL, VT); - } + case ISD::SETUGE: { + // C is not 0 because it is a legal immediate. + assert(!C.isZero() && "C should not be zero here"); + APInt CMinusOne = C - 1; + if (isLegalCmpImmed(CMinusOne) || + (NumImmForC > numberOfInstrToLoadImm(CMinusOne))) { + CC = (CC == ISD::SETULT) ? 
ISD::SETULE : ISD::SETUGT; + RHS = DAG.getConstant(CMinusOne, DL, VT); } break; + } case ISD::SETLE: case ISD::SETGT: if (!C.isMaxSignedValue()) { APInt CPlusOne = C + 1; - if (isLegalCmpImmed(CPlusOne)) { + if (isLegalCmpImmed(CPlusOne) || + (NumImmForC > numberOfInstrToLoadImm(CPlusOne))) { CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE; RHS = DAG.getConstant(CPlusOne, DL, VT); } } break; case ISD::SETULE: - case ISD::SETUGT: + case ISD::SETUGT: { if (!C.isAllOnes()) { APInt CPlusOne = C + 1; - if (isLegalCmpImmed(CPlusOne)) { + if (isLegalCmpImmed(CPlusOne) || + (NumImmForC > numberOfInstrToLoadImm(CPlusOne))) { CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE; RHS = DAG.getConstant(CPlusOne, DL, VT); } } break; } + } } } @@ -4079,7 +4124,7 @@ static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, if (!Cmp) { Cmp = emitComparison(LHS, RHS, CC, DL, DAG); - AArch64CC = changeIntCCToAArch64CC(CC); + AArch64CC = changeIntCCToAArch64CC(CC, RHS); } AArch64cc = getCondCode(DAG, AArch64CC); return Cmp; @@ -5174,13 +5219,7 @@ SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op, Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); ArgListTy Args; - ArgListEntry Entry; - - Entry.Node = Arg; - Entry.Ty = ArgTy; - Entry.IsSExt = false; - Entry.IsZExt = false; - Args.push_back(Entry); + Args.emplace_back(Arg, ArgTy); RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32; @@ -5711,15 +5750,15 @@ static SDValue getSVEPredicateBitCast(EVT VT, SDValue Op, SelectionDAG &DAG) { SDValue AArch64TargetLowering::getRuntimePStateSM(SelectionDAG &DAG, SDValue Chain, SDLoc DL, EVT VT) const { - SDValue Callee = DAG.getExternalSymbol("__arm_sme_state", + RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE; + SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), getPointerTy(DAG.getDataLayout())); Type *Int64Ty = Type::getInt64Ty(*DAG.getContext()); Type *RetTy = StructType::get(Int64Ty, Int64Ty); TargetLowering::CallLoweringInfo CLI(DAG); ArgListTy Args; CLI.setDebugLoc(DL).setChain(Chain).setLibCallee( - CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2, - RetTy, Callee, std::move(Args)); + getLibcallCallingConv(LC), RetTy, Callee, std::move(Args)); std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); SDValue Mask = DAG.getConstant(/*PSTATE.SM*/ 1, DL, MVT::i64); return DAG.getNode(ISD::AND, DL, MVT::i64, CallResult.first.getOperand(0), @@ -7886,8 +7925,8 @@ SDValue AArch64TargetLowering::LowerFormalArguments( else if (ActualMVT == MVT::i16) ValVT = MVT::i16; } - bool Res = - AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo); + bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, + Ins[i].OrigTy, CCInfo); assert(!Res && "Call operand has unhandled type"); (void)Res; } @@ -8132,19 +8171,26 @@ SDValue AArch64TargetLowering::LowerFormalArguments( } assert((ArgLocs.size() + ExtraArgLocs) == Ins.size()); + if (Attrs.hasStreamingCompatibleInterface()) { + SDValue EntryPStateSM = + DAG.getNode(AArch64ISD::ENTRY_PSTATE_SM, DL, + DAG.getVTList(MVT::i64, MVT::Other), {Chain}); + + // Copy the value to a virtual register, and save that in FuncInfo. + Register EntryPStateSMReg = + MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass); + Chain = DAG.getCopyToReg(EntryPStateSM.getValue(1), DL, EntryPStateSMReg, + EntryPStateSM); + FuncInfo->setPStateSMReg(EntryPStateSMReg); + } + // Insert the SMSTART if this is a locally streaming function and // make sure it is Glued to the last CopyFromReg value. 
if (IsLocallyStreaming) { - SDValue PStateSM; - if (Attrs.hasStreamingCompatibleInterface()) { - PStateSM = getRuntimePStateSM(DAG, Chain, DL, MVT::i64); - Register Reg = MF.getRegInfo().createVirtualRegister( - getRegClassFor(PStateSM.getValueType().getSimpleVT())); - FuncInfo->setPStateSMReg(Reg); - Chain = DAG.getCopyToReg(Chain, DL, Reg, PStateSM); + if (Attrs.hasStreamingCompatibleInterface()) Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue, - AArch64SME::IfCallerIsNonStreaming, PStateSM); - } else + AArch64SME::IfCallerIsNonStreaming); + else Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue, AArch64SME::Always); @@ -8244,53 +8290,54 @@ SDValue AArch64TargetLowering::LowerFormalArguments( if (Subtarget->hasCustomCallingConv()) Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF); - // Create a 16 Byte TPIDR2 object. The dynamic buffer - // will be expanded and stored in the static object later using a pseudonode. - if (Attrs.hasZAState()) { - TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj(); - TPIDR2.FrameIndex = MFI.CreateStackObject(16, Align(16), false); - SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64, - DAG.getConstant(1, DL, MVT::i32)); - - SDValue Buffer; - if (!Subtarget->isTargetWindows() && !hasInlineStackProbe(MF)) { - Buffer = DAG.getNode(AArch64ISD::ALLOCATE_ZA_BUFFER, DL, - DAG.getVTList(MVT::i64, MVT::Other), {Chain, SVL}); - } else { - SDValue Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL); - Buffer = DAG.getNode(ISD::DYNAMIC_STACKALLOC, DL, - DAG.getVTList(MVT::i64, MVT::Other), - {Chain, Size, DAG.getConstant(1, DL, MVT::i64)}); - MFI.CreateVariableSizedObject(Align(16), nullptr); - } - Chain = DAG.getNode( - AArch64ISD::INIT_TPIDR2OBJ, DL, DAG.getVTList(MVT::Other), - {/*Chain*/ Buffer.getValue(1), /*Buffer ptr*/ Buffer.getValue(0)}); - } else if (Attrs.hasAgnosticZAInterface()) { - // Call __arm_sme_state_size(). - SDValue BufferSize = - DAG.getNode(AArch64ISD::GET_SME_SAVE_SIZE, DL, - DAG.getVTList(MVT::i64, MVT::Other), Chain); - Chain = BufferSize.getValue(1); - - SDValue Buffer; - if (!Subtarget->isTargetWindows() && !hasInlineStackProbe(MF)) { - Buffer = - DAG.getNode(AArch64ISD::ALLOC_SME_SAVE_BUFFER, DL, - DAG.getVTList(MVT::i64, MVT::Other), {Chain, BufferSize}); - } else { - // Allocate space dynamically. - Buffer = DAG.getNode( - ISD::DYNAMIC_STACKALLOC, DL, DAG.getVTList(MVT::i64, MVT::Other), - {Chain, BufferSize, DAG.getConstant(1, DL, MVT::i64)}); - MFI.CreateVariableSizedObject(Align(16), nullptr); + if (!getTM().useNewSMEABILowering() || Attrs.hasAgnosticZAInterface()) { + // Old SME ABI lowering (deprecated): + // Create a 16 Byte TPIDR2 object. The dynamic buffer + // will be expanded and stored in the static object later using a + // pseudonode. 
+ if (Attrs.hasZAState()) { + TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj(); + TPIDR2.FrameIndex = MFI.CreateStackObject(16, Align(16), false); + SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64, + DAG.getConstant(1, DL, MVT::i32)); + SDValue Buffer; + if (!Subtarget->isTargetWindows() && !hasInlineStackProbe(MF)) { + Buffer = DAG.getNode(AArch64ISD::ALLOCATE_ZA_BUFFER, DL, + DAG.getVTList(MVT::i64, MVT::Other), {Chain, SVL}); + } else { + SDValue Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL); + Buffer = DAG.getNode(ISD::DYNAMIC_STACKALLOC, DL, + DAG.getVTList(MVT::i64, MVT::Other), + {Chain, Size, DAG.getConstant(1, DL, MVT::i64)}); + MFI.CreateVariableSizedObject(Align(16), nullptr); + } + Chain = DAG.getNode( + AArch64ISD::INIT_TPIDR2OBJ, DL, DAG.getVTList(MVT::Other), + {/*Chain*/ Buffer.getValue(1), /*Buffer ptr*/ Buffer.getValue(0)}); + } else if (Attrs.hasAgnosticZAInterface()) { + // Call __arm_sme_state_size(). + SDValue BufferSize = + DAG.getNode(AArch64ISD::GET_SME_SAVE_SIZE, DL, + DAG.getVTList(MVT::i64, MVT::Other), Chain); + Chain = BufferSize.getValue(1); + SDValue Buffer; + if (!Subtarget->isTargetWindows() && !hasInlineStackProbe(MF)) { + Buffer = DAG.getNode(AArch64ISD::ALLOC_SME_SAVE_BUFFER, DL, + DAG.getVTList(MVT::i64, MVT::Other), + {Chain, BufferSize}); + } else { + // Allocate space dynamically. + Buffer = DAG.getNode( + ISD::DYNAMIC_STACKALLOC, DL, DAG.getVTList(MVT::i64, MVT::Other), + {Chain, BufferSize, DAG.getConstant(1, DL, MVT::i64)}); + MFI.CreateVariableSizedObject(Align(16), nullptr); + } + // Copy the value to a virtual register, and save that in FuncInfo. + Register BufferPtr = + MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass); + FuncInfo->setSMESaveBufferAddr(BufferPtr); + Chain = DAG.getCopyToReg(Chain, DL, BufferPtr, Buffer); } - - // Copy the value to a virtual register, and save that in FuncInfo. - Register BufferPtr = - MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass); - FuncInfo->setSMESaveBufferAddr(BufferPtr); - Chain = DAG.getCopyToReg(Chain, DL, BufferPtr, Buffer); } if (CallConv == CallingConv::PreserveNone) { @@ -8307,6 +8354,15 @@ SDValue AArch64TargetLowering::LowerFormalArguments( } } + if (getTM().useNewSMEABILowering()) { + // Clear new ZT0 state. TODO: Move this to the SME ABI pass. + if (Attrs.isNewZT0()) + Chain = DAG.getNode( + ISD::INTRINSIC_VOID, DL, MVT::Other, Chain, + DAG.getConstant(Intrinsic::aarch64_sme_zero_zt, DL, MVT::i32), + DAG.getTargetConstant(0, DL, MVT::i32)); + } + return Chain; } @@ -8557,19 +8613,20 @@ static void analyzeCallOperands(const AArch64TargetLowering &TLI, // FIXME: CCAssignFnForCall should be called once, for the call and not per // argument. This logic should exactly mirror LowerFormalArguments. 
CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CalleeCC, UseVarArgCC); - bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo); + bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, + Outs[i].OrigTy, CCInfo); assert(!Res && "Call operand has unhandled type"); (void)Res; } } static SMECallAttrs -getSMECallAttrs(const Function &Caller, +getSMECallAttrs(const Function &Caller, const AArch64TargetLowering &TLI, const TargetLowering::CallLoweringInfo &CLI) { if (CLI.CB) - return SMECallAttrs(*CLI.CB); + return SMECallAttrs(*CLI.CB, &TLI); if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee)) - return SMECallAttrs(SMEAttrs(Caller), SMEAttrs(ES->getSymbol())); + return SMECallAttrs(SMEAttrs(Caller), SMEAttrs(ES->getSymbol(), TLI)); return SMECallAttrs(SMEAttrs(Caller), SMEAttrs(SMEAttrs::Normal)); } @@ -8591,7 +8648,7 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization( // SME Streaming functions are not eligible for TCO as they may require // the streaming mode or ZA to be restored after returning from the call. - SMECallAttrs CallAttrs = getSMECallAttrs(CallerF, CLI); + SMECallAttrs CallAttrs = getSMECallAttrs(CallerF, *this, CLI); if (CallAttrs.requiresSMChange() || CallAttrs.requiresLazySave() || CallAttrs.requiresPreservingAllZAState() || CallAttrs.caller().hasStreamingBody()) @@ -8834,8 +8891,7 @@ void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, SDValue AArch64TargetLowering::changeStreamingMode(SelectionDAG &DAG, SDLoc DL, bool Enable, SDValue Chain, SDValue InGlue, - unsigned Condition, - SDValue PStateSM) const { + unsigned Condition) const { MachineFunction &MF = DAG.getMachineFunction(); AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); FuncInfo->setHasStreamingModeChanges(true); @@ -8847,9 +8903,16 @@ SDValue AArch64TargetLowering::changeStreamingMode(SelectionDAG &DAG, SDLoc DL, SmallVector<SDValue> Ops = {Chain, MSROp}; unsigned Opcode; if (Condition != AArch64SME::Always) { + FuncInfo->setPStateSMRegUsed(true); + Register PStateReg = FuncInfo->getPStateSMReg(); + assert(PStateReg.isValid() && "PStateSM Register is invalid"); + SDValue PStateSM = + DAG.getCopyFromReg(Chain, DL, PStateReg, MVT::i64, InGlue); + // Use chain and glue from the CopyFromReg. + Ops[0] = PStateSM.getValue(1); + InGlue = PStateSM.getValue(2); SDValue ConditionOp = DAG.getTargetConstant(Condition, DL, MVT::i64); Opcode = Enable ? AArch64ISD::COND_SMSTART : AArch64ISD::COND_SMSTOP; - assert(PStateSM && "PStateSM should be defined"); Ops.push_back(ConditionOp); Ops.push_back(PStateSM); } else { @@ -8871,22 +8934,19 @@ static SDValue emitSMEStateSaveRestore(const AArch64TargetLowering &TLI, MachineFunction &MF = DAG.getMachineFunction(); AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); FuncInfo->setSMESaveBufferUsed(); - TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; - Entry.Ty = PointerType::getUnqual(*DAG.getContext()); - Entry.Node = - DAG.getCopyFromReg(Chain, DL, Info->getSMESaveBufferAddr(), MVT::i64); - Args.push_back(Entry); - - SDValue Callee = - DAG.getExternalSymbol(IsSave ? "__arm_sme_save" : "__arm_sme_restore", - TLI.getPointerTy(DAG.getDataLayout())); + Args.emplace_back( + DAG.getCopyFromReg(Chain, DL, Info->getSMESaveBufferAddr(), MVT::i64), + PointerType::getUnqual(*DAG.getContext())); + + RTLIB::Libcall LC = + IsSave ? 
RTLIB::SMEABI_SME_SAVE : RTLIB::SMEABI_SME_RESTORE; + SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC), + TLI.getPointerTy(DAG.getDataLayout())); auto *RetTy = Type::getVoidTy(*DAG.getContext()); TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(DL).setChain(Chain).setLibCallee( - CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1, RetTy, - Callee, std::move(Args)); + TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args)); return TLI.LowerCallTo(CLI).second; } @@ -9014,14 +9074,28 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, CallConv = CallingConv::AArch64_SVE_VectorCall; } + // Determine whether we need any streaming mode changes. + SMECallAttrs CallAttrs = getSMECallAttrs(MF.getFunction(), *this, CLI); + bool UseNewSMEABILowering = getTM().useNewSMEABILowering(); + bool IsAgnosticZAFunction = CallAttrs.caller().hasAgnosticZAInterface(); + auto ZAMarkerNode = [&]() -> std::optional<unsigned> { + // TODO: Handle agnostic ZA functions. + if (!UseNewSMEABILowering || IsAgnosticZAFunction) + return std::nullopt; + if (!CallAttrs.caller().hasZAState() && !CallAttrs.caller().hasZT0State()) + return std::nullopt; + return CallAttrs.requiresLazySave() ? AArch64ISD::REQUIRES_ZA_SAVE + : AArch64ISD::INOUT_ZA_USE; + }(); + if (IsTailCall) { // Check if it's really possible to do a tail call. IsTailCall = isEligibleForTailCallOptimization(CLI); // A sibling call is one where we're under the usual C ABI and not planning // to change that but can still do a tail call: - if (!TailCallOpt && IsTailCall && CallConv != CallingConv::Tail && - CallConv != CallingConv::SwiftTail) + if (!ZAMarkerNode && !TailCallOpt && IsTailCall && + CallConv != CallingConv::Tail && CallConv != CallingConv::SwiftTail) IsSibCall = true; if (IsTailCall) @@ -9073,9 +9147,6 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, assert(FPDiff % 16 == 0 && "unaligned stack on tail call"); } - // Determine whether we need any streaming mode changes. - SMECallAttrs CallAttrs = getSMECallAttrs(MF.getFunction(), CLI); - auto DescribeCallsite = [&](OptimizationRemarkAnalysis &R) -> OptimizationRemarkAnalysis & { R << "call from '" << ore::NV("Caller", MF.getName()) << "' to '"; @@ -9089,7 +9160,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, return R; }; - bool RequiresLazySave = CallAttrs.requiresLazySave(); + bool RequiresLazySave = !UseNewSMEABILowering && CallAttrs.requiresLazySave(); bool RequiresSaveAllZA = CallAttrs.requiresPreservingAllZAState(); if (RequiresLazySave) { const TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj(); @@ -9124,15 +9195,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, /*IsSave=*/true); } - SDValue PStateSM; bool RequiresSMChange = CallAttrs.requiresSMChange(); if (RequiresSMChange) { - if (CallAttrs.caller().hasStreamingInterfaceOrBody()) - PStateSM = DAG.getConstant(1, DL, MVT::i64); - else if (CallAttrs.caller().hasNonStreamingInterface()) - PStateSM = DAG.getConstant(0, DL, MVT::i64); - else - PStateSM = getRuntimePStateSM(DAG, Chain, DL, MVT::i64); OptimizationRemarkEmitter ORE(&MF.getFunction()); ORE.emit([&]() { auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMETransition", @@ -9171,10 +9235,20 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, AArch64ISD::SMSTOP, DL, DAG.getVTList(MVT::Other, MVT::Glue), Chain, DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32)); - // Adjust the stack pointer for the new arguments... + // Adjust the stack pointer for the new arguments... 
and mark ZA uses. // These operations are automatically eliminated by the prolog/epilog pass - if (!IsSibCall) + assert((!IsSibCall || !ZAMarkerNode) && "ZA markers require CALLSEQ_START"); + if (!IsSibCall) { Chain = DAG.getCALLSEQ_START(Chain, IsTailCall ? 0 : NumBytes, 0, DL); + if (ZAMarkerNode) { + // Note: We need the CALLSEQ_START to glue the ZAMarkerNode to, simply + // using a chain can result in incorrect scheduling. The markers refer to + // the position just before the CALLSEQ_START (though occur after as + // CALLSEQ_START lacks in-glue). + Chain = DAG.getNode(*ZAMarkerNode, DL, DAG.getVTList(MVT::Other), + {Chain, Chain.getValue(1)}); + } + } SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP, getPointerTy(DAG.getDataLayout())); @@ -9447,9 +9521,9 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, InGlue = Chain.getValue(1); } - SDValue NewChain = changeStreamingMode( - DAG, DL, CallAttrs.callee().hasStreamingInterface(), Chain, InGlue, - getSMToggleCondition(CallAttrs), PStateSM); + SDValue NewChain = + changeStreamingMode(DAG, DL, CallAttrs.callee().hasStreamingInterface(), + Chain, InGlue, getSMToggleCondition(CallAttrs)); Chain = NewChain.getValue(0); InGlue = NewChain.getValue(1); } @@ -9633,10 +9707,9 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, InGlue = Result.getValue(Result->getNumValues() - 1); if (RequiresSMChange) { - assert(PStateSM && "Expected a PStateSM to be set"); Result = changeStreamingMode( DAG, DL, !CallAttrs.callee().hasStreamingInterface(), Result, InGlue, - getSMToggleCondition(CallAttrs), PStateSM); + getSMToggleCondition(CallAttrs)); if (!Subtarget->isTargetDarwin() || Subtarget->hasSVE()) { InGlue = Result.getValue(1); @@ -9646,7 +9719,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, } } - if (CallAttrs.requiresEnablingZAAfterCall()) + if (RequiresLazySave || CallAttrs.requiresEnablingZAAfterCall()) // Unconditionally resume ZA. Result = DAG.getNode( AArch64ISD::SMSTART, DL, DAG.getVTList(MVT::Other, MVT::Glue), Result, @@ -9659,15 +9732,15 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, if (RequiresLazySave) { // Conditionally restore the lazy save using a pseudo node. + RTLIB::Libcall LC = RTLIB::SMEABI_TPIDR2_RESTORE; TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj(); SDValue RegMask = DAG.getRegisterMask( - TRI->SMEABISupportRoutinesCallPreservedMaskFromX0()); + TRI->getCallPreservedMask(MF, getLibcallCallingConv(LC))); SDValue RestoreRoutine = DAG.getTargetExternalSymbol( - "__arm_tpidr2_restore", getPointerTy(DAG.getDataLayout())); + getLibcallName(LC), getPointerTy(DAG.getDataLayout())); SDValue TPIDR2_EL0 = DAG.getNode( ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Result, DAG.getConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32)); - // Copy the address of the TPIDR2 block into X0 before 'calling' the // RESTORE_ZA pseudo. SDValue Glue; @@ -9679,7 +9752,6 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other, {Result, TPIDR2_EL0, DAG.getRegister(AArch64::X0, MVT::i64), RestoreRoutine, RegMask, Result.getValue(1)}); - // Finally reset the TPIDR2_EL0 register to 0. 
Result = DAG.getNode( ISD::INTRINSIC_VOID, DL, MVT::Other, Result, @@ -9802,14 +9874,11 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, // Emit SMSTOP before returning from a locally streaming function SMEAttrs FuncAttrs = FuncInfo->getSMEFnAttrs(); if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface()) { - if (FuncAttrs.hasStreamingCompatibleInterface()) { - Register Reg = FuncInfo->getPStateSMReg(); - assert(Reg.isValid() && "PStateSM Register is invalid"); - SDValue PStateSM = DAG.getCopyFromReg(Chain, DL, Reg, MVT::i64); + if (FuncAttrs.hasStreamingCompatibleInterface()) Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain, /*Glue*/ SDValue(), - AArch64SME::IfCallerIsNonStreaming, PStateSM); - } else + AArch64SME::IfCallerIsNonStreaming); + else Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain, /*Glue*/ SDValue(), AArch64SME::Always); Glue = Chain.getValue(1); @@ -17359,7 +17428,7 @@ static Function *getStructuredStoreFunction(Module *M, unsigned Factor, /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1 bool AArch64TargetLowering::lowerInterleavedLoad( Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles, - ArrayRef<unsigned> Indices, unsigned Factor) const { + ArrayRef<unsigned> Indices, unsigned Factor, const APInt &GapMask) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); assert(!Shuffles.empty() && "Empty shufflevector input"); @@ -17369,7 +17438,7 @@ bool AArch64TargetLowering::lowerInterleavedLoad( auto *LI = dyn_cast<LoadInst>(Load); if (!LI) return false; - assert(!Mask && "Unexpected mask on a load"); + assert(!Mask && GapMask.popcount() == Factor && "Unexpected mask on a load"); const DataLayout &DL = LI->getDataLayout(); @@ -28194,6 +28263,7 @@ void AArch64TargetLowering::ReplaceNodeResults( case Intrinsic::aarch64_sme_in_streaming_mode: { SDLoc DL(N); SDValue Chain = DAG.getEntryNode(); + SDValue RuntimePStateSM = getRuntimePStateSM(DAG, Chain, DL, N->getValueType(0)); Results.push_back( @@ -29004,7 +29074,7 @@ bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const { // Checks to allow the use of SME instructions if (auto *Base = dyn_cast<CallBase>(&Inst)) { - auto CallAttrs = SMECallAttrs(*Base); + auto CallAttrs = SMECallAttrs(*Base, this); if (CallAttrs.requiresSMChange() || CallAttrs.requiresLazySave() || CallAttrs.requiresPreservingZT0() || CallAttrs.requiresPreservingAllZAState()) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 8887657..071e96e 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -23,6 +23,8 @@ namespace llvm { +class AArch64TargetMachine; + namespace AArch64 { /// Possible values of current rounding mode, which is specified in bits /// 23:22 of FPCR. @@ -64,6 +66,8 @@ public: explicit AArch64TargetLowering(const TargetMachine &TM, const AArch64Subtarget &STI); + const AArch64TargetMachine &getTM() const; + /// Control the following reassociation of operands: (op (op x, c1), y) -> (op /// (op x, y), c1) where N0 is (op x, c1) and N1 is y. 
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, @@ -173,6 +177,10 @@ public: MachineBasicBlock *EmitZTInstr(MachineInstr &MI, MachineBasicBlock *BB, unsigned Opcode, bool Op0IsDef) const; MachineBasicBlock *EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const; + + // Note: The following group of functions are only used as part of the old SME + // ABI lowering. They will be removed once -aarch64-new-sme-abi=true is the + // default. MachineBasicBlock *EmitInitTPIDR2Object(MachineInstr &MI, MachineBasicBlock *BB) const; MachineBasicBlock *EmitAllocateZABuffer(MachineInstr &MI, @@ -181,6 +189,8 @@ public: MachineBasicBlock *BB) const; MachineBasicBlock *EmitGetSMESaveSize(MachineInstr &MI, MachineBasicBlock *BB) const; + MachineBasicBlock *EmitEntryPStateSM(MachineInstr &MI, + MachineBasicBlock *BB) const; /// Replace (0, vreg) discriminator components with the operands of blend /// or with (immediate, NoRegister) when possible. @@ -220,8 +230,8 @@ public: bool lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles, - ArrayRef<unsigned> Indices, - unsigned Factor) const override; + ArrayRef<unsigned> Indices, unsigned Factor, + const APInt &GapMask) const override; bool lowerInterleavedStore(Instruction *Store, Value *Mask, ShuffleVectorInst *SVI, unsigned Factor) const override; @@ -523,8 +533,8 @@ public: /// node. \p Condition should be one of the enum values from /// AArch64SME::ToggleCondition. SDValue changeStreamingMode(SelectionDAG &DAG, SDLoc DL, bool Enable, - SDValue Chain, SDValue InGlue, unsigned Condition, - SDValue PStateSM = SDValue()) const; + SDValue Chain, SDValue InGlue, + unsigned Condition) const; bool isVScaleKnownToBeAPowerOfTwo() const override { return true; } diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td index d068a12..178dab6 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -36,7 +36,12 @@ def DestructiveBinary : DestructiveInstTypeEnum<5>; def DestructiveBinaryComm : DestructiveInstTypeEnum<6>; def DestructiveBinaryCommWithRev : DestructiveInstTypeEnum<7>; def DestructiveTernaryCommWithRev : DestructiveInstTypeEnum<8>; -def DestructiveUnaryPassthru : DestructiveInstTypeEnum<9>; + +// 3 inputs unpredicated (reg1, reg2, imm). +// Can be MOVPRFX'd iff reg1 == reg2. 
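// For example, SVE EXT in its destructive form has this shape
// (EXT Zdn.B, Zdn.B, Zm.B, #imm). A splice of a register with itself,
// ext(z1, z1, #3), can then plausibly be expanded constructively as
//   movprfx z0, z1
//   ext     z0.b, z0.b, z1.b, #3
// leaving z1 unmodified (an illustrative expansion; the corresponding pseudo
// is the EXT_ZZI_CONSTRUCTIVE definition added to AArch64SVEInstrInfo.td
// later in this patch).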
+def Destructive2xRegImmUnpred : DestructiveInstTypeEnum<9>; + +def DestructiveUnaryPassthru : DestructiveInstTypeEnum<10>; class FalseLanesEnum<bits<2> val> { bits<2> Value = val; @@ -7362,7 +7367,9 @@ multiclass SIMDDifferentThreeVectorBD<bit U, bits<4> opc, string asm, [(set (v8i16 V128:$Rd), (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>; def v16i8 : BaseSIMDDifferentThreeVector<U, 0b001, opc, V128, V128, V128, - asm#"2", ".8h", ".16b", ".16b", []>; + asm#"2", ".8h", ".16b", ".16b", + [(set (v8i16 V128:$Rd), (OpNode (v8i8 (extract_high_v16i8 (v16i8 V128:$Rn))), + (v8i8 (extract_high_v16i8 (v16i8 V128:$Rm)))))]>; let Predicates = [HasAES] in { def v1i64 : BaseSIMDDifferentThreeVector<U, 0b110, opc, V128, V64, V64, @@ -7374,10 +7381,6 @@ multiclass SIMDDifferentThreeVectorBD<bit U, bits<4> opc, string asm, [(set (v16i8 V128:$Rd), (OpNode (extract_high_v2i64 (v2i64 V128:$Rn)), (extract_high_v2i64 (v2i64 V128:$Rm))))]>; } - - def : Pat<(v8i16 (OpNode (v8i8 (extract_high_v16i8 (v16i8 V128:$Rn))), - (v8i8 (extract_high_v16i8 (v16i8 V128:$Rm))))), - (!cast<Instruction>(NAME#"v16i8") V128:$Rn, V128:$Rm)>; } multiclass SIMDLongThreeVectorHS<bit U, bits<4> opc, string asm, @@ -7402,87 +7405,7 @@ multiclass SIMDLongThreeVectorHS<bit U, bits<4> opc, string asm, (extract_high_v4i32 (v4i32 V128:$Rm))))]>; } -multiclass SIMDLongThreeVectorBHSabdl<bit U, bits<4> opc, string asm, - SDPatternOperator OpNode = null_frag> { - def v8i8_v8i16 : BaseSIMDDifferentThreeVector<U, 0b000, opc, - V128, V64, V64, - asm, ".8h", ".8b", ".8b", - [(set (v8i16 V128:$Rd), - (zext (v8i8 (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))))]>; - def v16i8_v8i16 : BaseSIMDDifferentThreeVector<U, 0b001, opc, - V128, V128, V128, - asm#"2", ".8h", ".16b", ".16b", - [(set (v8i16 V128:$Rd), - (zext (v8i8 (OpNode (extract_high_v16i8 (v16i8 V128:$Rn)), - (extract_high_v16i8 (v16i8 V128:$Rm))))))]>; - def v4i16_v4i32 : BaseSIMDDifferentThreeVector<U, 0b010, opc, - V128, V64, V64, - asm, ".4s", ".4h", ".4h", - [(set (v4i32 V128:$Rd), - (zext (v4i16 (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))))]>; - def v8i16_v4i32 : BaseSIMDDifferentThreeVector<U, 0b011, opc, - V128, V128, V128, - asm#"2", ".4s", ".8h", ".8h", - [(set (v4i32 V128:$Rd), - (zext (v4i16 (OpNode (extract_high_v8i16 (v8i16 V128:$Rn)), - (extract_high_v8i16 (v8i16 V128:$Rm))))))]>; - def v2i32_v2i64 : BaseSIMDDifferentThreeVector<U, 0b100, opc, - V128, V64, V64, - asm, ".2d", ".2s", ".2s", - [(set (v2i64 V128:$Rd), - (zext (v2i32 (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))))]>; - def v4i32_v2i64 : BaseSIMDDifferentThreeVector<U, 0b101, opc, - V128, V128, V128, - asm#"2", ".2d", ".4s", ".4s", - [(set (v2i64 V128:$Rd), - (zext (v2i32 (OpNode (extract_high_v4i32 (v4i32 V128:$Rn)), - (extract_high_v4i32 (v4i32 V128:$Rm))))))]>; -} - -multiclass SIMDLongThreeVectorTiedBHSabal<bit U, bits<4> opc, - string asm, - SDPatternOperator OpNode> { - def v8i8_v8i16 : BaseSIMDDifferentThreeVectorTied<U, 0b000, opc, - V128, V64, V64, - asm, ".8h", ".8b", ".8b", - [(set (v8i16 V128:$dst), - (add (v8i16 V128:$Rd), - (zext (v8i8 (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm))))))]>; - def v16i8_v8i16 : BaseSIMDDifferentThreeVectorTied<U, 0b001, opc, - V128, V128, V128, - asm#"2", ".8h", ".16b", ".16b", - [(set (v8i16 V128:$dst), - (add (v8i16 V128:$Rd), - (zext (v8i8 (OpNode (extract_high_v16i8 (v16i8 V128:$Rn)), - (extract_high_v16i8 (v16i8 V128:$Rm)))))))]>; - def v4i16_v4i32 : BaseSIMDDifferentThreeVectorTied<U, 0b010, opc, - V128, V64, V64, - asm, ".4s", ".4h", ".4h", - [(set (v4i32 V128:$dst), - (add (v4i32 
V128:$Rd), - (zext (v4i16 (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm))))))]>; - def v8i16_v4i32 : BaseSIMDDifferentThreeVectorTied<U, 0b011, opc, - V128, V128, V128, - asm#"2", ".4s", ".8h", ".8h", - [(set (v4i32 V128:$dst), - (add (v4i32 V128:$Rd), - (zext (v4i16 (OpNode (extract_high_v8i16 (v8i16 V128:$Rn)), - (extract_high_v8i16 (v8i16 V128:$Rm)))))))]>; - def v2i32_v2i64 : BaseSIMDDifferentThreeVectorTied<U, 0b100, opc, - V128, V64, V64, - asm, ".2d", ".2s", ".2s", - [(set (v2i64 V128:$dst), - (add (v2i64 V128:$Rd), - (zext (v2i32 (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm))))))]>; - def v4i32_v2i64 : BaseSIMDDifferentThreeVectorTied<U, 0b101, opc, - V128, V128, V128, - asm#"2", ".2d", ".4s", ".4s", - [(set (v2i64 V128:$dst), - (add (v2i64 V128:$Rd), - (zext (v2i32 (OpNode (extract_high_v4i32 (v4i32 V128:$Rn)), - (extract_high_v4i32 (v4i32 V128:$Rm)))))))]>; -} - +let isCommutable = 1 in multiclass SIMDLongThreeVectorBHS<bit U, bits<4> opc, string asm, SDPatternOperator OpNode = null_frag> { def v8i8_v8i16 : BaseSIMDDifferentThreeVector<U, 0b000, opc, diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index fb59c9f..d15f90d 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -20,7 +20,9 @@ #include "Utils/AArch64BaseInfo.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/CodeGen/CFIInstBuilder.h" #include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineBasicBlock.h" @@ -83,6 +85,11 @@ static cl::opt<unsigned> BDisplacementBits("aarch64-b-offset-bits", cl::Hidden, cl::init(26), cl::desc("Restrict range of B instructions (DEBUG)")); +static cl::opt<unsigned> GatherOptSearchLimit( + "aarch64-search-limit", cl::Hidden, cl::init(2048), + cl::desc("Restrict range of instructions to search for the " + "machine-combiner gather pattern optimization")); + AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI) : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP, AArch64::CATCHRET), @@ -5078,8 +5085,13 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move. MCRegister DestRegX = TRI->getMatchingSuperReg( DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass); - MCRegister SrcRegX = TRI->getMatchingSuperReg( - SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass); + assert(DestRegX.isValid() && "Destination super-reg not valid"); + MCRegister SrcRegX = + SrcReg == AArch64::WZR + ? AArch64::XZR + : TRI->getMatchingSuperReg(SrcReg, AArch64::sub_32, + &AArch64::GPR64spRegClass); + assert(SrcRegX.isValid() && "Source super-reg not valid"); // This instruction is reading and writing X registers. 
This may upset // the register scavenger and machine verifier, so we need to indicate // that we are reading an undefined value from SrcRegX, but a proper @@ -5920,7 +5932,7 @@ static MCCFIInstruction createDefCFAExpression(const TargetRegisterInfo &TRI, // Build up the expression (Reg + NumBytes + VG * NumVGScaledBytes) SmallString<64> Expr; unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true); - assert(DwarfReg >= 0 && DwarfReg <= 31 && "DwarfReg out of bounds (0..31)"); + assert(DwarfReg <= 31 && "DwarfReg out of bounds (0..31)"); // Reg + NumBytes Expr.push_back(dwarf::DW_OP_breg0 + DwarfReg); appendLEB128<LEB128Sign::Signed>(Expr, NumBytes); @@ -7412,11 +7424,319 @@ static bool getMiscPatterns(MachineInstr &Root, return false; } +/// Check if the given instruction forms a gather load pattern that can be +/// optimized for better Memory-Level Parallelism (MLP). This function +/// identifies chains of NEON lane load instructions that load data from +/// different memory addresses into individual lanes of a 128-bit vector +/// register, then attempts to split the pattern into parallel loads to break +/// the serial dependency between instructions. +/// +/// Pattern Matched: +/// Initial scalar load -> SUBREG_TO_REG (lane 0) -> LD1i* (lane 1) -> +/// LD1i* (lane 2) -> ... -> LD1i* (lane N-1, Root) +/// +/// Transformed Into: +/// Two parallel vector loads using fewer lanes each, followed by ZIP1v2i64 +/// to combine the results, enabling better memory-level parallelism. +/// +/// Supported Element Types: +/// - 32-bit elements (LD1i32, 4 lanes total) +/// - 16-bit elements (LD1i16, 8 lanes total) +/// - 8-bit elements (LD1i8, 16 lanes total) +static bool getGatherLanePattern(MachineInstr &Root, + SmallVectorImpl<unsigned> &Patterns, + unsigned LoadLaneOpCode, unsigned NumLanes) { + const MachineFunction *MF = Root.getMF(); + + // Early exit if optimizing for size. + if (MF->getFunction().hasMinSize()) + return false; + + const MachineRegisterInfo &MRI = MF->getRegInfo(); + const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); + + // The root of the pattern must load into the last lane of the vector. + if (Root.getOperand(2).getImm() != NumLanes - 1) + return false; + + // Check that we have load into all lanes except lane 0. + // For each load we also want to check that: + // 1. It has a single non-debug use (since we will be replacing the virtual + // register) + // 2. That the addressing mode only uses a single pointer operand + auto *CurrInstr = MRI.getUniqueVRegDef(Root.getOperand(1).getReg()); + auto Range = llvm::seq<unsigned>(1, NumLanes - 1); + SmallSet<unsigned, 16> RemainingLanes(Range.begin(), Range.end()); + SmallVector<const MachineInstr *, 16> LoadInstrs; + while (!RemainingLanes.empty() && CurrInstr && + CurrInstr->getOpcode() == LoadLaneOpCode && + MRI.hasOneNonDBGUse(CurrInstr->getOperand(0).getReg()) && + CurrInstr->getNumOperands() == 4) { + RemainingLanes.erase(CurrInstr->getOperand(2).getImm()); + LoadInstrs.push_back(CurrInstr); + CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg()); + } + + // Check that we have found a match for lanes N-1.. 1. + if (!RemainingLanes.empty()) + return false; + + // Match the SUBREG_TO_REG sequence. + if (CurrInstr->getOpcode() != TargetOpcode::SUBREG_TO_REG) + return false; + + // Verify that the subreg to reg loads an integer into the first lane. 
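// Illustrative sketch (registers, pointers and scalar-load opcode are
// schematic) of the 4 x 32-bit case this matcher targets and of the rewrite
// performed by generateGatherLanePattern further below:
//
//   Before: one serial chain of lane inserts
//     %s0:fpr32  = LDRSui %p0, 0
//     %v0:fpr128 = SUBREG_TO_REG 0, %s0, %subreg.ssub
//     %v1:fpr128 = LD1i32 %v0, 1, %p1
//     %v2:fpr128 = LD1i32 %v1, 2, %p2
//     %v3:fpr128 = LD1i32 %v2, 3, %p3        (Root, last lane)
//
//   After: two independent half-chains combined with ZIP1
//     %a1:fpr128 = LD1i32 %v0, 1, %p1        (reuses the lane-0 SUBREG_TO_REG)
//     %s1:fpr32  = LDRSui %p2, 0
//     %b0:fpr128 = SUBREG_TO_REG 0, %s1, %subreg.ssub
//     %b1:fpr128 = LD1i32 %b0, 1, %p3
//     %v3:fpr128 = ZIP1v2i64 %a1, %b1
//
// The two load chains have no dependency on each other, which is the
// memory-level parallelism the combiner is after.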
+ auto Lane0LoadReg = CurrInstr->getOperand(2).getReg(); + unsigned SingleLaneSizeInBits = 128 / NumLanes; + if (TRI->getRegSizeInBits(Lane0LoadReg, MRI) != SingleLaneSizeInBits) + return false; + + // Verify that it also has a single non debug use. + if (!MRI.hasOneNonDBGUse(Lane0LoadReg)) + return false; + + LoadInstrs.push_back(MRI.getUniqueVRegDef(Lane0LoadReg)); + + // If there is any chance of aliasing, do not apply the pattern. + // Walk backward through the MBB starting from Root. + // Exit early if we've encountered all load instructions or hit the search + // limit. + auto MBBItr = Root.getIterator(); + unsigned RemainingSteps = GatherOptSearchLimit; + SmallPtrSet<const MachineInstr *, 16> RemainingLoadInstrs; + RemainingLoadInstrs.insert(LoadInstrs.begin(), LoadInstrs.end()); + const MachineBasicBlock *MBB = Root.getParent(); + + for (; MBBItr != MBB->begin() && RemainingSteps > 0 && + !RemainingLoadInstrs.empty(); + --MBBItr, --RemainingSteps) { + const MachineInstr &CurrInstr = *MBBItr; + + // Remove this instruction from remaining loads if it's one we're tracking. + RemainingLoadInstrs.erase(&CurrInstr); + + // Check for potential aliasing with any of the load instructions to + // optimize. + if (CurrInstr.isLoadFoldBarrier()) + return false; + } + + // If we hit the search limit without finding all load instructions, + // don't match the pattern. + if (RemainingSteps == 0 && !RemainingLoadInstrs.empty()) + return false; + + switch (NumLanes) { + case 4: + Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i32); + break; + case 8: + Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i16); + break; + case 16: + Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i8); + break; + default: + llvm_unreachable("Got bad number of lanes for gather pattern."); + } + + return true; +} + +/// Search for patterns of LD instructions we can optimize. +static bool getLoadPatterns(MachineInstr &Root, + SmallVectorImpl<unsigned> &Patterns) { + + // The pattern searches for loads into single lanes. + switch (Root.getOpcode()) { + case AArch64::LD1i32: + return getGatherLanePattern(Root, Patterns, Root.getOpcode(), 4); + case AArch64::LD1i16: + return getGatherLanePattern(Root, Patterns, Root.getOpcode(), 8); + case AArch64::LD1i8: + return getGatherLanePattern(Root, Patterns, Root.getOpcode(), 16); + default: + return false; + } +} + +/// Generate optimized instruction sequence for gather load patterns to improve +/// Memory-Level Parallelism (MLP). This function transforms a chain of +/// sequential NEON lane loads into parallel vector loads that can execute +/// concurrently. +static void +generateGatherLanePattern(MachineInstr &Root, + SmallVectorImpl<MachineInstr *> &InsInstrs, + SmallVectorImpl<MachineInstr *> &DelInstrs, + DenseMap<Register, unsigned> &InstrIdxForVirtReg, + unsigned Pattern, unsigned NumLanes) { + MachineFunction &MF = *Root.getParent()->getParent(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); + + // Gather the initial load instructions to build the pattern. + SmallVector<MachineInstr *, 16> LoadToLaneInstrs; + MachineInstr *CurrInstr = &Root; + for (unsigned i = 0; i < NumLanes - 1; ++i) { + LoadToLaneInstrs.push_back(CurrInstr); + CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg()); + } + + // Sort the load instructions according to the lane. 
+ llvm::sort(LoadToLaneInstrs, + [](const MachineInstr *A, const MachineInstr *B) { + return A->getOperand(2).getImm() > B->getOperand(2).getImm(); + }); + + MachineInstr *SubregToReg = CurrInstr; + LoadToLaneInstrs.push_back( + MRI.getUniqueVRegDef(SubregToReg->getOperand(2).getReg())); + auto LoadToLaneInstrsAscending = llvm::reverse(LoadToLaneInstrs); + + const TargetRegisterClass *FPR128RegClass = + MRI.getRegClass(Root.getOperand(0).getReg()); + + // Helper lambda to create a LD1 instruction. + auto CreateLD1Instruction = [&](MachineInstr *OriginalInstr, + Register SrcRegister, unsigned Lane, + Register OffsetRegister, + bool OffsetRegisterKillState) { + auto NewRegister = MRI.createVirtualRegister(FPR128RegClass); + MachineInstrBuilder LoadIndexIntoRegister = + BuildMI(MF, MIMetadata(*OriginalInstr), TII->get(Root.getOpcode()), + NewRegister) + .addReg(SrcRegister) + .addImm(Lane) + .addReg(OffsetRegister, getKillRegState(OffsetRegisterKillState)); + InstrIdxForVirtReg.insert(std::make_pair(NewRegister, InsInstrs.size())); + InsInstrs.push_back(LoadIndexIntoRegister); + return NewRegister; + }; + + // Helper to create load instruction based on the NumLanes in the NEON + // register we are rewriting. + auto CreateLDRInstruction = [&](unsigned NumLanes, Register DestReg, + Register OffsetReg, + bool KillState) -> MachineInstrBuilder { + unsigned Opcode; + switch (NumLanes) { + case 4: + Opcode = AArch64::LDRSui; + break; + case 8: + Opcode = AArch64::LDRHui; + break; + case 16: + Opcode = AArch64::LDRBui; + break; + default: + llvm_unreachable( + "Got unsupported number of lanes in machine-combiner gather pattern"); + } + // Immediate offset load + return BuildMI(MF, MIMetadata(Root), TII->get(Opcode), DestReg) + .addReg(OffsetReg) + .addImm(0); + }; + + // Load the remaining lanes into register 0. + auto LanesToLoadToReg0 = + llvm::make_range(LoadToLaneInstrsAscending.begin() + 1, + LoadToLaneInstrsAscending.begin() + NumLanes / 2); + Register PrevReg = SubregToReg->getOperand(0).getReg(); + for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg0)) { + const MachineOperand &OffsetRegOperand = LoadInstr->getOperand(3); + PrevReg = CreateLD1Instruction(LoadInstr, PrevReg, Index + 1, + OffsetRegOperand.getReg(), + OffsetRegOperand.isKill()); + DelInstrs.push_back(LoadInstr); + } + Register LastLoadReg0 = PrevReg; + + // First load into register 1. Perform an integer load to zero out the upper + // lanes in a single instruction. + MachineInstr *Lane0Load = *LoadToLaneInstrsAscending.begin(); + MachineInstr *OriginalSplitLoad = + *std::next(LoadToLaneInstrsAscending.begin(), NumLanes / 2); + Register DestRegForMiddleIndex = MRI.createVirtualRegister( + MRI.getRegClass(Lane0Load->getOperand(0).getReg())); + + const MachineOperand &OriginalSplitToLoadOffsetOperand = + OriginalSplitLoad->getOperand(3); + MachineInstrBuilder MiddleIndexLoadInstr = + CreateLDRInstruction(NumLanes, DestRegForMiddleIndex, + OriginalSplitToLoadOffsetOperand.getReg(), + OriginalSplitToLoadOffsetOperand.isKill()); + + InstrIdxForVirtReg.insert( + std::make_pair(DestRegForMiddleIndex, InsInstrs.size())); + InsInstrs.push_back(MiddleIndexLoadInstr); + DelInstrs.push_back(OriginalSplitLoad); + + // Subreg To Reg instruction for register 1. 
+ Register DestRegForSubregToReg = MRI.createVirtualRegister(FPR128RegClass); + unsigned SubregType; + switch (NumLanes) { + case 4: + SubregType = AArch64::ssub; + break; + case 8: + SubregType = AArch64::hsub; + break; + case 16: + SubregType = AArch64::bsub; + break; + default: + llvm_unreachable( + "Got invalid NumLanes for machine-combiner gather pattern"); + } + + auto SubRegToRegInstr = + BuildMI(MF, MIMetadata(Root), TII->get(SubregToReg->getOpcode()), + DestRegForSubregToReg) + .addImm(0) + .addReg(DestRegForMiddleIndex, getKillRegState(true)) + .addImm(SubregType); + InstrIdxForVirtReg.insert( + std::make_pair(DestRegForSubregToReg, InsInstrs.size())); + InsInstrs.push_back(SubRegToRegInstr); + + // Load remaining lanes into register 1. + auto LanesToLoadToReg1 = + llvm::make_range(LoadToLaneInstrsAscending.begin() + NumLanes / 2 + 1, + LoadToLaneInstrsAscending.end()); + PrevReg = SubRegToRegInstr->getOperand(0).getReg(); + for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg1)) { + const MachineOperand &OffsetRegOperand = LoadInstr->getOperand(3); + PrevReg = CreateLD1Instruction(LoadInstr, PrevReg, Index + 1, + OffsetRegOperand.getReg(), + OffsetRegOperand.isKill()); + + // Do not add the last reg to DelInstrs - it will be removed later. + if (Index == NumLanes / 2 - 2) { + break; + } + DelInstrs.push_back(LoadInstr); + } + Register LastLoadReg1 = PrevReg; + + // Create the final zip instruction to combine the results. + MachineInstrBuilder ZipInstr = + BuildMI(MF, MIMetadata(Root), TII->get(AArch64::ZIP1v2i64), + Root.getOperand(0).getReg()) + .addReg(LastLoadReg0) + .addReg(LastLoadReg1); + InsInstrs.push_back(ZipInstr); +} + CombinerObjective AArch64InstrInfo::getCombinerObjective(unsigned Pattern) const { switch (Pattern) { case AArch64MachineCombinerPattern::SUBADD_OP1: case AArch64MachineCombinerPattern::SUBADD_OP2: + case AArch64MachineCombinerPattern::GATHER_LANE_i32: + case AArch64MachineCombinerPattern::GATHER_LANE_i16: + case AArch64MachineCombinerPattern::GATHER_LANE_i8: return CombinerObjective::MustReduceDepth; default: return TargetInstrInfo::getCombinerObjective(Pattern); @@ -7446,6 +7766,10 @@ bool AArch64InstrInfo::getMachineCombinerPatterns( if (getMiscPatterns(Root, Patterns)) return true; + // Load patterns + if (getLoadPatterns(Root, Patterns)) + return true; + return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns, DoRegPressureReduce); } @@ -8701,6 +9025,21 @@ void AArch64InstrInfo::genAlternativeCodeSequence( MUL = genFNegatedMAD(MF, MRI, TII, Root, InsInstrs); break; } + case AArch64MachineCombinerPattern::GATHER_LANE_i32: { + generateGatherLanePattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg, + Pattern, 4); + break; + } + case AArch64MachineCombinerPattern::GATHER_LANE_i16: { + generateGatherLanePattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg, + Pattern, 8); + break; + } + case AArch64MachineCombinerPattern::GATHER_LANE_i8: { + generateGatherLanePattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg, + Pattern, 16); + break; + } } // end switch (Pattern) // Record MUL and ADD/SUB for deletion diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h index 7c255da..70c814a 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h @@ -172,6 +172,10 @@ enum AArch64MachineCombinerPattern : unsigned { FMULv8i16_indexed_OP2, FNMADD, + + GATHER_LANE_i32, + GATHER_LANE_i16, + GATHER_LANE_i8 }; class AArch64InstrInfo final : public 
AArch64GenInstrInfo { const AArch64RegisterInfo RI; @@ -820,7 +824,8 @@ enum DestructiveInstType { DestructiveBinaryComm = TSFLAG_DESTRUCTIVE_INST_TYPE(0x6), DestructiveBinaryCommWithRev = TSFLAG_DESTRUCTIVE_INST_TYPE(0x7), DestructiveTernaryCommWithRev = TSFLAG_DESTRUCTIVE_INST_TYPE(0x8), - DestructiveUnaryPassthru = TSFLAG_DESTRUCTIVE_INST_TYPE(0x9), + Destructive2xRegImmUnpred = TSFLAG_DESTRUCTIVE_INST_TYPE(0x9), + DestructiveUnaryPassthru = TSFLAG_DESTRUCTIVE_INST_TYPE(0xa), }; enum FalseLaneType { diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index ac31236..4fa91a4 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -5707,27 +5707,6 @@ let Predicates = [HasFullFP16] in { // Advanced SIMD two vector instructions. //===----------------------------------------------------------------------===// -defm UABDL : SIMDLongThreeVectorBHSabdl<1, 0b0111, "uabdl", abdu>; -// Match UABDL in log2-shuffle patterns. -def : Pat<(abs (v8i16 (sub (zext (v8i8 V64:$opA)), - (zext (v8i8 V64:$opB))))), - (UABDLv8i8_v8i16 V64:$opA, V64:$opB)>; -def : Pat<(abs (v8i16 (sub (zext (extract_high_v16i8 (v16i8 V128:$opA))), - (zext (extract_high_v16i8 (v16i8 V128:$opB)))))), - (UABDLv16i8_v8i16 V128:$opA, V128:$opB)>; -def : Pat<(abs (v4i32 (sub (zext (v4i16 V64:$opA)), - (zext (v4i16 V64:$opB))))), - (UABDLv4i16_v4i32 V64:$opA, V64:$opB)>; -def : Pat<(abs (v4i32 (sub (zext (extract_high_v8i16 (v8i16 V128:$opA))), - (zext (extract_high_v8i16 (v8i16 V128:$opB)))))), - (UABDLv8i16_v4i32 V128:$opA, V128:$opB)>; -def : Pat<(abs (v2i64 (sub (zext (v2i32 V64:$opA)), - (zext (v2i32 V64:$opB))))), - (UABDLv2i32_v2i64 V64:$opA, V64:$opB)>; -def : Pat<(abs (v2i64 (sub (zext (extract_high_v4i32 (v4i32 V128:$opA))), - (zext (extract_high_v4i32 (v4i32 V128:$opB)))))), - (UABDLv4i32_v2i64 V128:$opA, V128:$opB)>; - defm ABS : SIMDTwoVectorBHSD<0, 0b01011, "abs", abs>; defm CLS : SIMDTwoVectorBHS<0, 0b00100, "cls", int_aarch64_neon_cls>; defm CLZ : SIMDTwoVectorBHS<1, 0b00100, "clz", ctlz>; @@ -6055,6 +6034,7 @@ defm MLA : SIMDThreeSameVectorBHSTied<0, 0b10010, "mla", null_frag>; defm MLS : SIMDThreeSameVectorBHSTied<1, 0b10010, "mls", null_frag>; defm MUL : SIMDThreeSameVectorBHS<0, 0b10011, "mul", mul>; +let isCommutable = 1 in defm PMUL : SIMDThreeSameVectorB<1, 0b10011, "pmul", int_aarch64_neon_pmul>; defm SABA : SIMDThreeSameVectorBHSTied<0, 0b01111, "saba", TriOpFrag<(add node:$LHS, (abds node:$MHS, node:$RHS))> >; @@ -6802,40 +6782,47 @@ def : Pat <(f64 (uint_to_fp (i32 // Advanced SIMD three different-sized vector instructions. 
//===----------------------------------------------------------------------===// -defm ADDHN : SIMDNarrowThreeVectorBHS<0,0b0100,"addhn", int_aarch64_neon_addhn>; -defm SUBHN : SIMDNarrowThreeVectorBHS<0,0b0110,"subhn", int_aarch64_neon_subhn>; -defm RADDHN : SIMDNarrowThreeVectorBHS<1,0b0100,"raddhn",int_aarch64_neon_raddhn>; -defm RSUBHN : SIMDNarrowThreeVectorBHS<1,0b0110,"rsubhn",int_aarch64_neon_rsubhn>; -defm PMULL : SIMDDifferentThreeVectorBD<0,0b1110,"pmull", AArch64pmull>; -defm SABAL : SIMDLongThreeVectorTiedBHSabal<0,0b0101,"sabal", abds>; -defm SABDL : SIMDLongThreeVectorBHSabdl<0, 0b0111, "sabdl", abds>; +defm ADDHN : SIMDNarrowThreeVectorBHS<0,0b0100,"addhn", int_aarch64_neon_addhn>; +defm SUBHN : SIMDNarrowThreeVectorBHS<0,0b0110,"subhn", int_aarch64_neon_subhn>; +defm RADDHN : SIMDNarrowThreeVectorBHS<1,0b0100,"raddhn",int_aarch64_neon_raddhn>; +defm RSUBHN : SIMDNarrowThreeVectorBHS<1,0b0110,"rsubhn",int_aarch64_neon_rsubhn>; +let isCommutable = 1 in +defm PMULL : SIMDDifferentThreeVectorBD<0,0b1110,"pmull", AArch64pmull>; +defm SABAL : SIMDLongThreeVectorTiedBHS<0,0b0101,"sabal", + TriOpFrag<(add node:$LHS, (zext (abds node:$MHS, node:$RHS)))>>; +defm SABDL : SIMDLongThreeVectorBHS<0, 0b0111, "sabdl", + BinOpFrag<(zext (abds node:$LHS, node:$RHS))>>; defm SADDL : SIMDLongThreeVectorBHS< 0, 0b0000, "saddl", - BinOpFrag<(add (sext node:$LHS), (sext node:$RHS))>>; + BinOpFrag<(add (sext node:$LHS), (sext node:$RHS))>>; defm SADDW : SIMDWideThreeVectorBHS< 0, 0b0001, "saddw", BinOpFrag<(add node:$LHS, (sext node:$RHS))>>; defm SMLAL : SIMDLongThreeVectorTiedBHS<0, 0b1000, "smlal", - TriOpFrag<(add node:$LHS, (AArch64smull node:$MHS, node:$RHS))>>; + TriOpFrag<(add node:$LHS, (AArch64smull node:$MHS, node:$RHS))>>; defm SMLSL : SIMDLongThreeVectorTiedBHS<0, 0b1010, "smlsl", - TriOpFrag<(sub node:$LHS, (AArch64smull node:$MHS, node:$RHS))>>; + TriOpFrag<(sub node:$LHS, (AArch64smull node:$MHS, node:$RHS))>>; defm SMULL : SIMDLongThreeVectorBHS<0, 0b1100, "smull", AArch64smull>; defm SQDMLAL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1001, "sqdmlal", saddsat>; defm SQDMLSL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1011, "sqdmlsl", ssubsat>; -defm SQDMULL : SIMDLongThreeVectorHS<0, 0b1101, "sqdmull", - int_aarch64_neon_sqdmull>; +defm SQDMULL : SIMDLongThreeVectorHS<0, 0b1101, "sqdmull", int_aarch64_neon_sqdmull>; +let isCommutable = 0 in defm SSUBL : SIMDLongThreeVectorBHS<0, 0b0010, "ssubl", BinOpFrag<(sub (sext node:$LHS), (sext node:$RHS))>>; defm SSUBW : SIMDWideThreeVectorBHS<0, 0b0011, "ssubw", BinOpFrag<(sub node:$LHS, (sext node:$RHS))>>; -defm UABAL : SIMDLongThreeVectorTiedBHSabal<1, 0b0101, "uabal", abdu>; +defm UABAL : SIMDLongThreeVectorTiedBHS<1, 0b0101, "uabal", + TriOpFrag<(add node:$LHS, (zext (abdu node:$MHS, node:$RHS)))>>; +defm UABDL : SIMDLongThreeVectorBHS<1, 0b0111, "uabdl", + BinOpFrag<(zext (abdu node:$LHS, node:$RHS))>>; defm UADDL : SIMDLongThreeVectorBHS<1, 0b0000, "uaddl", BinOpFrag<(add (zanyext node:$LHS), (zanyext node:$RHS))>>; defm UADDW : SIMDWideThreeVectorBHS<1, 0b0001, "uaddw", BinOpFrag<(add node:$LHS, (zanyext node:$RHS))>>; defm UMLAL : SIMDLongThreeVectorTiedBHS<1, 0b1000, "umlal", - TriOpFrag<(add node:$LHS, (AArch64umull node:$MHS, node:$RHS))>>; + TriOpFrag<(add node:$LHS, (AArch64umull node:$MHS, node:$RHS))>>; defm UMLSL : SIMDLongThreeVectorTiedBHS<1, 0b1010, "umlsl", - TriOpFrag<(sub node:$LHS, (AArch64umull node:$MHS, node:$RHS))>>; + TriOpFrag<(sub node:$LHS, (AArch64umull node:$MHS, node:$RHS))>>; defm UMULL : 
SIMDLongThreeVectorBHS<1, 0b1100, "umull", AArch64umull>; +let isCommutable = 0 in defm USUBL : SIMDLongThreeVectorBHS<1, 0b0010, "usubl", BinOpFrag<(sub (zanyext node:$LHS), (zanyext node:$RHS))>>; defm USUBW : SIMDWideThreeVectorBHS< 1, 0b0011, "usubw", diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp index 782d62a7..e69fa32 100644 --- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp @@ -1193,7 +1193,8 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, // USE kill %w1 ; need to clear kill flag when moving STRWui downwards // STRW %w0 Register Reg = getLdStRegOp(*I).getReg(); - for (MachineInstr &MI : make_range(std::next(I), Paired)) + for (MachineInstr &MI : + make_range(std::next(I->getIterator()), Paired->getIterator())) MI.clearRegisterKills(Reg, TRI); } } diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h index 800787c..ed3374a 100644 --- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h +++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h @@ -213,9 +213,6 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { /// or return type bool IsSVECC = false; - /// The frame-index for the TPIDR2 object used for lazy saves. - TPIDR2Object TPIDR2; - /// Whether this function changes streaming mode within the function. bool HasStreamingModeChanges = false; @@ -231,13 +228,8 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { // on function entry to record the initial pstate of a function. Register PStateSMReg = MCRegister::NoRegister; - // Holds a pointer to a buffer that is large enough to represent - // all SME ZA state and any additional state required by the - // __arm_sme_save/restore support routines. - Register SMESaveBufferAddr = MCRegister::NoRegister; - - // true if SMESaveBufferAddr is used. - bool SMESaveBufferUsed = false; + // true if PStateSMReg is used. + bool PStateSMRegUsed = false; // Has the PNReg used to build PTRUE instruction. // The PTRUE is used for the LD/ST of ZReg pairs in save and restore. @@ -250,6 +242,16 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { // Holds the SME function attributes (streaming mode, ZA/ZT0 state). SMEAttrs SMEFnAttrs; + // Note: The following properties are only used for the old SME ABI lowering: + /// The frame-index for the TPIDR2 object used for lazy saves. + TPIDR2Object TPIDR2; + // Holds a pointer to a buffer that is large enough to represent + // all SME ZA state and any additional state required by the + // __arm_sme_save/restore support routines. + Register SMESaveBufferAddr = MCRegister::NoRegister; + // true if SMESaveBufferAddr is used. 
+ bool SMESaveBufferUsed = false; + public: AArch64FunctionInfo(const Function &F, const AArch64Subtarget *STI); @@ -258,6 +260,13 @@ public: const DenseMap<MachineBasicBlock *, MachineBasicBlock *> &Src2DstMBB) const override; + // Old SME ABI lowering state getters/setters: + Register getSMESaveBufferAddr() const { return SMESaveBufferAddr; }; + void setSMESaveBufferAddr(Register Reg) { SMESaveBufferAddr = Reg; }; + unsigned isSMESaveBufferUsed() const { return SMESaveBufferUsed; }; + void setSMESaveBufferUsed(bool Used = true) { SMESaveBufferUsed = Used; }; + TPIDR2Object &getTPIDR2Obj() { return TPIDR2; } + void setPredicateRegForFillSpill(unsigned Reg) { PredicateRegForFillSpill = Reg; } @@ -265,15 +274,12 @@ public: return PredicateRegForFillSpill; } - Register getSMESaveBufferAddr() const { return SMESaveBufferAddr; }; - void setSMESaveBufferAddr(Register Reg) { SMESaveBufferAddr = Reg; }; - - unsigned isSMESaveBufferUsed() const { return SMESaveBufferUsed; }; - void setSMESaveBufferUsed(bool Used = true) { SMESaveBufferUsed = Used; }; - Register getPStateSMReg() const { return PStateSMReg; }; void setPStateSMReg(Register Reg) { PStateSMReg = Reg; }; + unsigned isPStateSMRegUsed() const { return PStateSMRegUsed; }; + void setPStateSMRegUsed(bool Used = true) { PStateSMRegUsed = Used; }; + int64_t getVGIdx() const { return VGIdx; }; void setVGIdx(unsigned Idx) { VGIdx = Idx; }; @@ -283,8 +289,6 @@ public: bool isSVECC() const { return IsSVECC; }; void setIsSVECC(bool s) { IsSVECC = s; }; - TPIDR2Object &getTPIDR2Obj() { return TPIDR2; } - void initializeBaseYamlFields(const yaml::AArch64FunctionInfo &YamlMFI); unsigned getBytesInStackArgArea() const { return BytesInStackArgArea; } diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td index 1bc1d98..42eaeca 100644 --- a/llvm/lib/Target/AArch64/AArch64Processors.td +++ b/llvm/lib/Target/AArch64/AArch64Processors.td @@ -321,7 +321,6 @@ def TuneAppleA7 : SubtargetFeature<"apple-a7", "ARMProcFamily", "AppleA7", FeatureFuseAES, FeatureFuseCryptoEOR, FeatureStorePairSuppress, FeatureZCRegMoveGPR64, - FeatureZCRegMoveFPR64, FeatureZCZeroing, FeatureZCZeroingFPWorkaround]>; @@ -335,7 +334,6 @@ def TuneAppleA10 : SubtargetFeature<"apple-a10", "ARMProcFamily", "AppleA10", FeatureFuseCryptoEOR, FeatureStorePairSuppress, FeatureZCRegMoveGPR64, - FeatureZCRegMoveFPR64, FeatureZCZeroing]>; def TuneAppleA11 : SubtargetFeature<"apple-a11", "ARMProcFamily", "AppleA11", @@ -348,7 +346,6 @@ def TuneAppleA11 : SubtargetFeature<"apple-a11", "ARMProcFamily", "AppleA11", FeatureFuseCryptoEOR, FeatureStorePairSuppress, FeatureZCRegMoveGPR64, - FeatureZCRegMoveFPR64, FeatureZCZeroing]>; def TuneAppleA12 : SubtargetFeature<"apple-a12", "ARMProcFamily", "AppleA12", @@ -361,7 +358,6 @@ def TuneAppleA12 : SubtargetFeature<"apple-a12", "ARMProcFamily", "AppleA12", FeatureFuseCryptoEOR, FeatureStorePairSuppress, FeatureZCRegMoveGPR64, - FeatureZCRegMoveFPR64, FeatureZCZeroing]>; def TuneAppleA13 : SubtargetFeature<"apple-a13", "ARMProcFamily", "AppleA13", @@ -374,7 +370,6 @@ def TuneAppleA13 : SubtargetFeature<"apple-a13", "ARMProcFamily", "AppleA13", FeatureFuseCryptoEOR, FeatureStorePairSuppress, FeatureZCRegMoveGPR64, - FeatureZCRegMoveFPR64, FeatureZCZeroing]>; def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14", @@ -392,7 +387,6 @@ def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14", FeatureFuseLiterals, FeatureStorePairSuppress, FeatureZCRegMoveGPR64, - 
FeatureZCRegMoveFPR64, FeatureZCZeroing]>; def TuneAppleA15 : SubtargetFeature<"apple-a15", "ARMProcFamily", "AppleA15", @@ -410,7 +404,6 @@ def TuneAppleA15 : SubtargetFeature<"apple-a15", "ARMProcFamily", "AppleA15", FeatureFuseLiterals, FeatureStorePairSuppress, FeatureZCRegMoveGPR64, - FeatureZCRegMoveFPR64, FeatureZCZeroing]>; def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16", @@ -428,7 +421,6 @@ def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16", FeatureFuseLiterals, FeatureStorePairSuppress, FeatureZCRegMoveGPR64, - FeatureZCRegMoveFPR64, FeatureZCZeroing]>; def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17", @@ -446,7 +438,6 @@ def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17", FeatureFuseLiterals, FeatureStorePairSuppress, FeatureZCRegMoveGPR64, - FeatureZCRegMoveFPR64, FeatureZCZeroing]>; def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4", @@ -463,7 +454,6 @@ def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4", FeatureFuseCryptoEOR, FeatureFuseLiterals, FeatureZCRegMoveGPR64, - FeatureZCRegMoveFPR64, FeatureZCZeroing ]>; diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td index db27ca9..5c4e0c1 100644 --- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td @@ -39,12 +39,25 @@ def AArch64_save_zt : SDNode<"AArch64ISD::SAVE_ZT", SDTypeProfile<0, 2, def AArch64CoalescerBarrier : SDNode<"AArch64ISD::COALESCER_BARRIER", SDTypeProfile<1, 1, []>, [SDNPOptInGlue, SDNPOutGlue]>; +def AArch64EntryPStateSM + : SDNode<"AArch64ISD::ENTRY_PSTATE_SM", SDTypeProfile<1, 0, + [SDTCisInt<0>]>, [SDNPHasChain, SDNPSideEffect]>; + +let usesCustomInserter = 1 in { + def EntryPStateSM : Pseudo<(outs GPR64:$is_streaming), (ins), []>, Sched<[]> {} +} +def : Pat<(i64 (AArch64EntryPStateSM)), (EntryPStateSM)>; + def AArch64VGSave : SDNode<"AArch64ISD::VG_SAVE", SDTypeProfile<0, 0, []>, [SDNPHasChain, SDNPSideEffect, SDNPOptInGlue, SDNPOutGlue]>; def AArch64VGRestore : SDNode<"AArch64ISD::VG_RESTORE", SDTypeProfile<0, 0, []>, [SDNPHasChain, SDNPSideEffect, SDNPOptInGlue, SDNPOutGlue]>; +//===----------------------------------------------------------------------===// +// Old SME ABI lowering ISD nodes/pseudos (deprecated) +//===----------------------------------------------------------------------===// + def AArch64AllocateZABuffer : SDNode<"AArch64ISD::ALLOCATE_ZA_BUFFER", SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisInt<1>]>, [SDNPHasChain, SDNPSideEffect]>; @@ -78,6 +91,30 @@ def : Pat<(i64 (AArch64AllocateSMESaveBuffer GPR64:$size)), (AllocateSMESaveBuffer $size)>; //===----------------------------------------------------------------------===// +// New SME ABI lowering ISD nodes/pseudos (-aarch64-new-sme-abi) +//===----------------------------------------------------------------------===// + +let hasSideEffects = 1, isMeta = 1 in { + def InOutZAUsePseudo : Pseudo<(outs), (ins), []>, Sched<[]>; + def RequiresZASavePseudo : Pseudo<(outs), (ins), []>, Sched<[]>; +} + +def CommitZASavePseudo + : Pseudo<(outs), + (ins GPR64:$tpidr2_el0, i1imm:$zero_za, i64imm:$commit_routine, variable_ops), []>, + Sched<[]>; + +def AArch64_inout_za_use + : SDNode<"AArch64ISD::INOUT_ZA_USE", SDTypeProfile<0, 0,[]>, + [SDNPHasChain, SDNPInGlue]>; +def : Pat<(AArch64_inout_za_use), (InOutZAUsePseudo)>; + +def AArch64_requires_za_save + : SDNode<"AArch64ISD::REQUIRES_ZA_SAVE", 
SDTypeProfile<0, 0,[]>, + [SDNPHasChain, SDNPInGlue]>; +def : Pat<(AArch64_requires_za_save), (RequiresZASavePseudo)>; + +//===----------------------------------------------------------------------===// // Instruction naming conventions. //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 0c4b4f4..509dd8b 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -1021,7 +1021,9 @@ let Predicates = [HasNonStreamingSVE_or_SME2p2] in { let Predicates = [HasSVE_or_SME] in { defm INSR_ZR : sve_int_perm_insrs<"insr", AArch64insr>; defm INSR_ZV : sve_int_perm_insrv<"insr", AArch64insr>; - defm EXT_ZZI : sve_int_perm_extract_i<"ext", AArch64ext>; + defm EXT_ZZI : sve_int_perm_extract_i<"ext", AArch64ext, "EXT_ZZI_CONSTRUCTIVE">; + + def EXT_ZZI_CONSTRUCTIVE : UnpredRegImmPseudo<ZPR8, imm0_255>; defm RBIT_ZPmZ : sve_int_perm_rev_rbit<"rbit", AArch64rbit_mt>; defm REVB_ZPmZ : sve_int_perm_rev_revb<"revb", AArch64revb_mt>; @@ -2131,21 +2133,37 @@ let Predicates = [HasSVE_or_SME] in { (LASTB_VPZ_D (PTRUE_D 31), ZPR:$Z1), dsub))>; // Splice with lane bigger or equal to 0 - foreach VT = [nxv16i8] in + foreach VT = [nxv16i8] in { def : Pat<(VT (vector_splice VT:$Z1, VT:$Z2, (i64 (sve_ext_imm_0_255 i32:$index)))), (EXT_ZZI ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>; + let AddedComplexity = 1 in + def : Pat<(VT (vector_splice VT:$Z1, VT:$Z1, (i64 (sve_ext_imm_0_255 i32:$index)))), + (EXT_ZZI_CONSTRUCTIVE ZPR:$Z1, imm0_255:$index)>; + } - foreach VT = [nxv8i16, nxv8f16, nxv8bf16] in + foreach VT = [nxv8i16, nxv8f16, nxv8bf16] in { def : Pat<(VT (vector_splice VT:$Z1, VT:$Z2, (i64 (sve_ext_imm_0_127 i32:$index)))), (EXT_ZZI ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>; + let AddedComplexity = 1 in + def : Pat<(VT (vector_splice VT:$Z1, VT:$Z1, (i64 (sve_ext_imm_0_127 i32:$index)))), + (EXT_ZZI_CONSTRUCTIVE ZPR:$Z1, imm0_255:$index)>; + } - foreach VT = [nxv4i32, nxv4f16, nxv4f32, nxv4bf16] in + foreach VT = [nxv4i32, nxv4f16, nxv4f32, nxv4bf16] in { def : Pat<(VT (vector_splice VT:$Z1, VT:$Z2, (i64 (sve_ext_imm_0_63 i32:$index)))), (EXT_ZZI ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>; + let AddedComplexity = 1 in + def : Pat<(VT (vector_splice VT:$Z1, VT:$Z1, (i64 (sve_ext_imm_0_63 i32:$index)))), + (EXT_ZZI_CONSTRUCTIVE ZPR:$Z1, imm0_255:$index)>; + } - foreach VT = [nxv2i64, nxv2f16, nxv2f32, nxv2f64, nxv2bf16] in + foreach VT = [nxv2i64, nxv2f16, nxv2f32, nxv2f64, nxv2bf16] in { def : Pat<(VT (vector_splice VT:$Z1, VT:$Z2, (i64 (sve_ext_imm_0_31 i32:$index)))), (EXT_ZZI ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>; + let AddedComplexity = 1 in + def : Pat<(VT (vector_splice VT:$Z1, VT:$Z1, (i64 (sve_ext_imm_0_31 i32:$index)))), + (EXT_ZZI_CONSTRUCTIVE ZPR:$Z1, imm0_255:$index)>; + } defm CMPHS_PPzZZ : sve_int_cmp_0<0b000, "cmphs", SETUGE, SETULE>; defm CMPHI_PPzZZ : sve_int_cmp_0<0b001, "cmphi", SETUGT, SETULT>; diff --git a/llvm/lib/Target/AArch64/AArch64SchedA320.td b/llvm/lib/Target/AArch64/AArch64SchedA320.td index 89ed1338..5ec95c7 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedA320.td +++ b/llvm/lib/Target/AArch64/AArch64SchedA320.td @@ -847,7 +847,7 @@ def : InstRW<[CortexA320Write<3, CortexA320UnitVALU>], (instregex "^[SU]XTB_ZPmZ "^[SU]XTW_ZPmZ_[D]")>; // Extract -def : InstRW<[CortexA320Write<3, CortexA320UnitVALU>], (instrs EXT_ZZI, EXT_ZZI_B)>; +def : InstRW<[CortexA320Write<3, CortexA320UnitVALU>], (instrs EXT_ZZI, 
EXT_ZZI_CONSTRUCTIVE, EXT_ZZI_B)>; // Extract narrow saturating def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>], (instregex "^[SU]QXTN[BT]_ZZ_[BHS]", diff --git a/llvm/lib/Target/AArch64/AArch64SchedA510.td b/llvm/lib/Target/AArch64/AArch64SchedA510.td index 9456878..356e3fa 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedA510.td +++ b/llvm/lib/Target/AArch64/AArch64SchedA510.td @@ -825,7 +825,7 @@ def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^[SU]XTB_ZPmZ "^[SU]XTW_ZPmZ_[D]")>; // Extract -def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instrs EXT_ZZI, EXT_ZZI_B)>; +def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instrs EXT_ZZI, EXT_ZZI_CONSTRUCTIVE, EXT_ZZI_B)>; // Extract narrow saturating def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^[SU]QXTN[BT]_ZZ_[BHS]", @@ -1016,7 +1016,7 @@ def : InstRW<[CortexA510MCWrite<16, 13, CortexA510UnitVALU>], (instrs FADDA_VPZ_ def : InstRW<[CortexA510MCWrite<8, 5, CortexA510UnitVALU>], (instrs FADDA_VPZ_D)>; // Floating point compare -def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^FACG[ET]_PPzZZ_[HSD]", +def : InstRW<[CortexA510MCWrite<4, 2, CortexA510UnitVALU>], (instregex "^FACG[ET]_PPzZZ_[HSD]", "^FCM(EQ|GE|GT|NE)_PPzZ[0Z]_[HSD]", "^FCM(LE|LT)_PPzZ0_[HSD]", "^FCMUO_PPzZZ_[HSD]")>; diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td index 91a7079..e798222 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td +++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td @@ -1785,7 +1785,7 @@ def : InstRW<[N2Write_2c_1V1], (instregex "^[SU]XTB_ZPmZ_[HSD]", "^[SU]XTW_ZPmZ_[D]")>; // Extract -def : InstRW<[N2Write_2c_1V], (instrs EXT_ZZI, EXT_ZZI_B)>; +def : InstRW<[N2Write_2c_1V], (instrs EXT_ZZI, EXT_ZZI_CONSTRUCTIVE, EXT_ZZI_B)>; // Extract narrow saturating def : InstRW<[N2Write_4c_1V1], (instregex "^[SU]QXTN[BT]_ZZ_[BHS]$", diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td index ecfb124..e44d40f 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td +++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td @@ -1757,7 +1757,7 @@ def : InstRW<[N3Write_2c_1V], (instregex "^[SU]XTB_ZPmZ_[HSD]", "^[SU]XTW_ZPmZ_[D]")>; // Extract -def : InstRW<[N3Write_2c_1V], (instrs EXT_ZZI, EXT_ZZI_B)>; +def : InstRW<[N3Write_2c_1V], (instrs EXT_ZZI, EXT_ZZI_CONSTRUCTIVE, EXT_ZZI_B)>; // Extract narrow saturating def : InstRW<[N3Write_4c_1V1], (instregex "^[SU]QXTN[BT]_ZZ_[BHS]$", diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td index 3686654..44625a2 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td +++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td @@ -1575,7 +1575,7 @@ def : InstRW<[V1Write_2c_1V1], (instregex "^[SU]XTB_ZPmZ_[HSD]", "^[SU]XTW_ZPmZ_[D]")>; // Extract -def : InstRW<[V1Write_2c_1V01], (instrs EXT_ZZI)>; +def : InstRW<[V1Write_2c_1V01], (instrs EXT_ZZI, EXT_ZZI_CONSTRUCTIVE)>; // Extract/insert operation, SIMD and FP scalar form def : InstRW<[V1Write_3c_1V1], (instregex "^LAST[AB]_VPZ_[BHSD]$", diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td index b2c3da0..6261220 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td +++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td @@ -2272,7 +2272,7 @@ def : InstRW<[V2Write_2c_1V13], (instregex "^[SU]XTB_ZPmZ_[HSD]", 
"^[SU]XTW_ZPmZ_[D]")>; // Extract -def : InstRW<[V2Write_2c_1V], (instrs EXT_ZZI, EXT_ZZI_B)>; +def : InstRW<[V2Write_2c_1V], (instrs EXT_ZZI, EXT_ZZI_CONSTRUCTIVE, EXT_ZZI_B)>; // Extract narrow saturating def : InstRW<[V2Write_4c_1V13], (instregex "^[SU]QXTN[BT]_ZZ_[BHS]", diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp index 8a5b5ba..d3b1aa6 100644 --- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp @@ -182,37 +182,25 @@ SDValue AArch64SelectionDAGInfo::EmitStreamingCompatibleMemLibCall( const AArch64Subtarget &STI = DAG.getMachineFunction().getSubtarget<AArch64Subtarget>(); const AArch64TargetLowering *TLI = STI.getTargetLowering(); - TargetLowering::ArgListEntry DstEntry; - DstEntry.Ty = PointerType::getUnqual(*DAG.getContext()); - DstEntry.Node = Dst; TargetLowering::ArgListTy Args; - Args.push_back(DstEntry); + Args.emplace_back(Dst, PointerType::getUnqual(*DAG.getContext())); RTLIB::Libcall NewLC; switch (LC) { case RTLIB::MEMCPY: { NewLC = RTLIB::SC_MEMCPY; - TargetLowering::ArgListEntry Entry; - Entry.Ty = PointerType::getUnqual(*DAG.getContext()); - Entry.Node = Src; - Args.push_back(Entry); + Args.emplace_back(Src, PointerType::getUnqual(*DAG.getContext())); break; } case RTLIB::MEMMOVE: { NewLC = RTLIB::SC_MEMMOVE; - TargetLowering::ArgListEntry Entry; - Entry.Ty = PointerType::getUnqual(*DAG.getContext()); - Entry.Node = Src; - Args.push_back(Entry); + Args.emplace_back(Src, PointerType::getUnqual(*DAG.getContext())); break; } case RTLIB::MEMSET: { NewLC = RTLIB::SC_MEMSET; - TargetLowering::ArgListEntry Entry; - Entry.Ty = Type::getInt32Ty(*DAG.getContext()); - Src = DAG.getZExtOrTrunc(Src, DL, MVT::i32); - Entry.Node = Src; - Args.push_back(Entry); + Args.emplace_back(DAG.getZExtOrTrunc(Src, DL, MVT::i32), + Type::getInt32Ty(*DAG.getContext())); break; } default: @@ -221,10 +209,7 @@ SDValue AArch64SelectionDAGInfo::EmitStreamingCompatibleMemLibCall( EVT PointerVT = TLI->getPointerTy(DAG.getDataLayout()); SDValue Symbol = DAG.getExternalSymbol(TLI->getLibcallName(NewLC), PointerVT); - TargetLowering::ArgListEntry SizeEntry; - SizeEntry.Node = Size; - SizeEntry.Ty = DAG.getDataLayout().getIntPtrType(*DAG.getContext()); - Args.push_back(SizeEntry); + Args.emplace_back(Size, DAG.getDataLayout().getIntPtrType(*DAG.getContext())); TargetLowering::CallLoweringInfo CLI(DAG); PointerType *RetTy = PointerType::getUnqual(*DAG.getContext()); diff --git a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp index f136a184..a67bd42 100644 --- a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp +++ b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp @@ -585,8 +585,7 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) { ClMaxLifetimes); if (StandardLifetime) { IntrinsicInst *Start = Info.LifetimeStart[0]; - uint64_t Size = - cast<ConstantInt>(Start->getArgOperand(0))->getZExtValue(); + uint64_t Size = *Info.AI->getAllocationSize(*DL); Size = alignTo(Size, kTagGranuleSize); tagAlloca(AI, Start->getNextNode(), TagPCall, Size); diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp index 95eab16..e67bd58 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -224,6 +224,11 @@ static cl::opt<bool> cl::desc("Enable Machine Pipeliner for AArch64"), cl::init(false), cl::Hidden); +static 
cl::opt<bool> + EnableNewSMEABILowering("aarch64-new-sme-abi", + cl::desc("Enable new lowering for the SME ABI"), + cl::init(false), cl::Hidden); + extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64Target() { // Register the target. @@ -263,6 +268,7 @@ LLVMInitializeAArch64Target() { initializeLDTLSCleanupPass(PR); initializeKCFIPass(PR); initializeSMEABIPass(PR); + initializeMachineSMEABIPass(PR); initializeSMEPeepholeOptPass(PR); initializeSVEIntrinsicOptsPass(PR); initializeAArch64SpeculationHardeningPass(PR); @@ -367,7 +373,8 @@ AArch64TargetMachine::AArch64TargetMachine(const Target &T, const Triple &TT, computeDefaultCPU(TT, CPU), FS, Options, getEffectiveRelocModel(TT, RM), getEffectiveAArch64CodeModel(TT, CM, JIT), OL), - TLOF(createTLOF(getTargetTriple())), isLittle(LittleEndian) { + TLOF(createTLOF(getTargetTriple())), isLittle(LittleEndian), + UseNewSMEABILowering(EnableNewSMEABILowering) { initAsmInfo(); if (TT.isOSBinFormatMachO()) { @@ -668,10 +675,12 @@ void AArch64PassConfig::addIRPasses() { addPass(createInterleavedAccessPass()); } - // Expand any functions marked with SME attributes which require special - // changes for the calling convention or that require the lazy-saving - // mechanism specified in the SME ABI. - addPass(createSMEABIPass()); + if (!EnableNewSMEABILowering) { + // Expand any functions marked with SME attributes which require special + // changes for the calling convention or that require the lazy-saving + // mechanism specified in the SME ABI. + addPass(createSMEABIPass()); + } // Add Control Flow Guard checks. if (TM->getTargetTriple().isOSWindows()) { @@ -782,6 +791,9 @@ bool AArch64PassConfig::addGlobalInstructionSelect() { } void AArch64PassConfig::addMachineSSAOptimization() { + if (EnableNewSMEABILowering && TM->getOptLevel() != CodeGenOptLevel::None) + addPass(createMachineSMEABIPass()); + if (TM->getOptLevel() != CodeGenOptLevel::None && EnableSMEPeepholeOpt) addPass(createSMEPeepholeOptPass()); @@ -812,6 +824,9 @@ bool AArch64PassConfig::addILPOpts() { } void AArch64PassConfig::addPreRegAlloc() { + if (TM->getOptLevel() == CodeGenOptLevel::None && EnableNewSMEABILowering) + addPass(createMachineSMEABIPass()); + // Change dead register definitions to refer to the zero register. if (TM->getOptLevel() != CodeGenOptLevel::None && EnableDeadRegisterElimination) diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.h b/llvm/lib/Target/AArch64/AArch64TargetMachine.h index b9e522d..0dd5d95 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetMachine.h +++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.h @@ -79,8 +79,12 @@ public: size_t clearLinkerOptimizationHints( const SmallPtrSetImpl<MachineInstr *> &MIs) const override; + /// Returns true if the new SME ABI lowering should be used. + bool useNewSMEABILowering() const { return UseNewSMEABILowering; } + private: bool isLittle; + bool UseNewSMEABILowering; }; // AArch64 little endian target machine. 
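For reference, -aarch64-new-sme-abi is an internal cl::opt rather than a driver flag, so one plausible way to exercise the new lowering (input file names are illustrative) is:

  llc -mtriple=aarch64-linux-gnu -mattr=+sme -aarch64-new-sme-abi za_state.ll -o -
  clang --target=aarch64-linux-gnu -march=armv9-a+sme -mllvm -aarch64-new-sme-abi -S za_state.c

With the flag enabled, the MachineSMEABI pass runs before register allocation (during machine SSA optimization at -O1 and above, in addPreRegAlloc at -O0) and the IR-level SMEABIPass is skipped.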
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 9f05add..b021968 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -220,20 +220,17 @@ static cl::opt<bool> EnableFixedwidthAutovecInStreamingMode( static cl::opt<bool> EnableScalableAutovecInStreamingMode( "enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden); -static bool isSMEABIRoutineCall(const CallInst &CI) { +static bool isSMEABIRoutineCall(const CallInst &CI, + const AArch64TargetLowering &TLI) { const auto *F = CI.getCalledFunction(); - return F && StringSwitch<bool>(F->getName()) - .Case("__arm_sme_state", true) - .Case("__arm_tpidr2_save", true) - .Case("__arm_tpidr2_restore", true) - .Case("__arm_za_disable", true) - .Default(false); + return F && SMEAttrs(F->getName(), TLI).isSMEABIRoutine(); } /// Returns true if the function has explicit operations that can only be /// lowered using incompatible instructions for the selected mode. This also /// returns true if the function F may use or modify ZA state. -static bool hasPossibleIncompatibleOps(const Function *F) { +static bool hasPossibleIncompatibleOps(const Function *F, + const AArch64TargetLowering &TLI) { for (const BasicBlock &BB : *F) { for (const Instruction &I : BB) { // Be conservative for now and assume that any call to inline asm or to @@ -242,7 +239,7 @@ static bool hasPossibleIncompatibleOps(const Function *F) { // all native LLVM instructions can be lowered to compatible instructions. if (isa<CallInst>(I) && !I.isDebugOrPseudoInst() && (cast<CallInst>(I).isInlineAsm() || isa<IntrinsicInst>(I) || - isSMEABIRoutineCall(cast<CallInst>(I)))) + isSMEABIRoutineCall(cast<CallInst>(I), TLI))) return true; } } @@ -290,7 +287,7 @@ bool AArch64TTIImpl::areInlineCompatible(const Function *Caller, if (CallAttrs.requiresLazySave() || CallAttrs.requiresSMChange() || CallAttrs.requiresPreservingZT0() || CallAttrs.requiresPreservingAllZAState()) { - if (hasPossibleIncompatibleOps(Callee)) + if (hasPossibleIncompatibleOps(Callee, *getTLI())) return false; } @@ -357,7 +354,7 @@ AArch64TTIImpl::getInlineCallPenalty(const Function *F, const CallBase &Call, // change only once and avoid inlining of G into F. SMEAttrs FAttrs(*F); - SMECallAttrs CallAttrs(Call); + SMECallAttrs CallAttrs(Call, getTLI()); if (SMECallAttrs(FAttrs, CallAttrs.callee()).requiresSMChange()) { if (F == Call.getCaller()) // (1) @@ -554,7 +551,17 @@ static bool isUnpackedVectorVT(EVT VecVT) { VecVT.getSizeInBits().getKnownMinValue() < AArch64::SVEBitsPerBlock; } -static InstructionCost getHistogramCost(const IntrinsicCostAttributes &ICA) { +static InstructionCost getHistogramCost(const AArch64Subtarget *ST, + const IntrinsicCostAttributes &ICA) { + // We need to know at least the number of elements in the vector of buckets + // and the size of each element to update. + if (ICA.getArgTypes().size() < 2) + return InstructionCost::getInvalid(); + + // Only interested in costing for the hardware instruction from SVE2. 
+ if (!ST->hasSVE2()) + return InstructionCost::getInvalid(); + Type *BucketPtrsTy = ICA.getArgTypes()[0]; // Type of vector of pointers Type *EltTy = ICA.getArgTypes()[1]; // Type of bucket elements unsigned TotalHistCnts = 1; @@ -579,9 +586,11 @@ static InstructionCost getHistogramCost(const IntrinsicCostAttributes &ICA) { unsigned NaturalVectorWidth = AArch64::SVEBitsPerBlock / LegalEltSize; TotalHistCnts = EC / NaturalVectorWidth; + + return InstructionCost(BaseHistCntCost * TotalHistCnts); } - return InstructionCost(BaseHistCntCost * TotalHistCnts); + return InstructionCost::getInvalid(); } InstructionCost @@ -597,10 +606,13 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, return InstructionCost::getInvalid(); switch (ICA.getID()) { - case Intrinsic::experimental_vector_histogram_add: - if (!ST->hasSVE2()) - return InstructionCost::getInvalid(); - return getHistogramCost(ICA); + case Intrinsic::experimental_vector_histogram_add: { + InstructionCost HistCost = getHistogramCost(ST, ICA); + // If the cost isn't valid, we may still be able to scalarize + if (HistCost.isValid()) + return HistCost; + break; + } case Intrinsic::umin: case Intrinsic::umax: case Intrinsic::smin: @@ -651,6 +663,16 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, return LT.first; break; } + case Intrinsic::fma: + case Intrinsic::fmuladd: { + // Given a fma or fmuladd, cost it the same as a fmul instruction which are + // usually the same for costs. TODO: Add fp16 and bf16 expansion costs. + Type *EltTy = RetTy->getScalarType(); + if (EltTy->isFloatTy() || EltTy->isDoubleTy() || + (EltTy->isHalfTy() && ST->hasFullFP16())) + return getArithmeticInstrCost(Instruction::FMul, RetTy, CostKind); + break; + } case Intrinsic::stepvector: { InstructionCost Cost = 1; // Cost of the `index' instruction auto LT = getTypeLegalizationCost(RetTy); @@ -3961,6 +3983,24 @@ InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I, return getVectorInstrCostHelper(I.getOpcode(), Val, CostKind, Index, &I); } +InstructionCost +AArch64TTIImpl::getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, + TTI::TargetCostKind CostKind, + unsigned Index) const { + if (isa<FixedVectorType>(Val)) + return BaseT::getIndexedVectorInstrCostFromEnd(Opcode, Val, CostKind, + Index); + + // This typically requires both while and lastb instructions in order + // to extract the last element. If this is in a loop the while + // instruction can at least be hoisted out, although it will consume a + // predicate register. The cost should be more expensive than the base + // extract cost, which is 2 for most CPUs. + return CostKind == TTI::TCK_CodeSize + ? 
2 + : ST->getVectorInsertExtractBaseCost() + 1; +} + InstructionCost AArch64TTIImpl::getScalarizationOverhead( VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc, @@ -3975,6 +4015,27 @@ InstructionCost AArch64TTIImpl::getScalarizationOverhead( return DemandedElts.popcount() * (Insert + Extract) * VecInstCost; } +std::optional<InstructionCost> AArch64TTIImpl::getFP16BF16PromoteCost( + Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info, + TTI::OperandValueInfo Op2Info, bool IncludeTrunc, + std::function<InstructionCost(Type *)> InstCost) const { + if (!Ty->getScalarType()->isHalfTy() && !Ty->getScalarType()->isBFloatTy()) + return std::nullopt; + if (Ty->getScalarType()->isHalfTy() && ST->hasFullFP16()) + return std::nullopt; + + Type *PromotedTy = Ty->getWithNewType(Type::getFloatTy(Ty->getContext())); + InstructionCost Cost = getCastInstrCost(Instruction::FPExt, PromotedTy, Ty, + TTI::CastContextHint::None, CostKind); + if (!Op1Info.isConstant() && !Op2Info.isConstant()) + Cost *= 2; + Cost += InstCost(PromotedTy); + if (IncludeTrunc) + Cost += getCastInstrCost(Instruction::FPTrunc, Ty, PromotedTy, + TTI::CastContextHint::None, CostKind); + return Cost; +} + InstructionCost AArch64TTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info, @@ -3997,6 +4058,18 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost( std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); int ISD = TLI->InstructionOpcodeToISD(Opcode); + // Increase the cost for half and bfloat types if not architecturally + // supported. + if (ISD == ISD::FADD || ISD == ISD::FSUB || ISD == ISD::FMUL || + ISD == ISD::FDIV || ISD == ISD::FREM) + if (auto PromotedCost = getFP16BF16PromoteCost( + Ty, CostKind, Op1Info, Op2Info, /*IncludeTrunc=*/true, + [&](Type *PromotedTy) { + return getArithmeticInstrCost(Opcode, PromotedTy, CostKind, + Op1Info, Op2Info); + })) + return *PromotedCost; + switch (ISD) { default: return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, @@ -4265,11 +4338,6 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost( [[fallthrough]]; case ISD::FADD: case ISD::FSUB: - // Increase the cost for half and bfloat types if not architecturally - // supported. - if ((Ty->getScalarType()->isHalfTy() && !ST->hasFullFP16()) || - (Ty->getScalarType()->isBFloatTy() && !ST->hasBF16())) - return 2 * LT.first; if (!Ty->getScalarType()->isFP128Ty()) return LT.first; [[fallthrough]]; @@ -4293,8 +4361,9 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost( } InstructionCost -AArch64TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE, - const SCEV *Ptr) const { +AArch64TTIImpl::getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, + const SCEV *Ptr, + TTI::TargetCostKind CostKind) const { // Address computations in vectorized code with non-consecutive addresses will // likely result in more instructions compared to scalar code where the // computation can more often be merged into the index mode. 
The resulting @@ -4302,7 +4371,7 @@ AArch64TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE, unsigned NumVectorInstToHideOverhead = NeonNonConstStrideOverhead; int MaxMergeDistance = 64; - if (Ty->isVectorTy() && SE && + if (PtrTy->isVectorTy() && SE && !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1)) return NumVectorInstToHideOverhead; @@ -4371,25 +4440,21 @@ InstructionCost AArch64TTIImpl::getCmpSelInstrCost( } if (Opcode == Instruction::FCmp) { - // Without dedicated instructions we promote f16 + bf16 compares to f32. - if ((!ST->hasFullFP16() && ValTy->getScalarType()->isHalfTy()) || - ValTy->getScalarType()->isBFloatTy()) { - Type *PromotedTy = - ValTy->getWithNewType(Type::getFloatTy(ValTy->getContext())); - InstructionCost Cost = - getCastInstrCost(Instruction::FPExt, PromotedTy, ValTy, - TTI::CastContextHint::None, CostKind); - if (!Op1Info.isConstant() && !Op2Info.isConstant()) - Cost *= 2; - Cost += getCmpSelInstrCost(Opcode, PromotedTy, CondTy, VecPred, CostKind, - Op1Info, Op2Info); - if (ValTy->isVectorTy()) - Cost += getCastInstrCost( - Instruction::Trunc, VectorType::getInteger(cast<VectorType>(ValTy)), - VectorType::getInteger(cast<VectorType>(PromotedTy)), - TTI::CastContextHint::None, CostKind); - return Cost; - } + if (auto PromotedCost = getFP16BF16PromoteCost( + ValTy, CostKind, Op1Info, Op2Info, /*IncludeTrunc=*/false, + [&](Type *PromotedTy) { + InstructionCost Cost = + getCmpSelInstrCost(Opcode, PromotedTy, CondTy, VecPred, + CostKind, Op1Info, Op2Info); + if (isa<VectorType>(PromotedTy)) + Cost += getCastInstrCost( + Instruction::Trunc, + VectorType::getInteger(cast<VectorType>(ValTy)), + VectorType::getInteger(cast<VectorType>(PromotedTy)), + TTI::CastContextHint::None, CostKind); + return Cost; + })) + return *PromotedCost; auto LT = getTypeLegalizationCost(ValTy); // Model unknown fp compares as a libcall. @@ -4858,32 +4923,18 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE, // Limit to loops with trip counts that are cheap to expand. UP.SCEVExpansionBudget = 1; - // Try to unroll small, single block loops, if they have load/store - // dependencies, to expose more parallel memory access streams. + // Try to unroll small loops, of few-blocks with low budget, if they have + // load/store dependencies, to expose more parallel memory access streams, + // or if they do little work inside a block (i.e. load -> X -> store pattern). BasicBlock *Header = L->getHeader(); - if (Header == L->getLoopLatch()) { + BasicBlock *Latch = L->getLoopLatch(); + if (Header == Latch) { // Estimate the size of the loop. unsigned Size; - if (!isLoopSizeWithinBudget(L, TTI, 8, &Size)) + unsigned Width = 10; + if (!isLoopSizeWithinBudget(L, TTI, Width, &Size)) return; - SmallPtrSet<Value *, 8> LoadedValues; - SmallVector<StoreInst *> Stores; - for (auto *BB : L->blocks()) { - for (auto &I : *BB) { - Value *Ptr = getLoadStorePointerOperand(&I); - if (!Ptr) - continue; - const SCEV *PtrSCEV = SE.getSCEV(Ptr); - if (SE.isLoopInvariant(PtrSCEV, L)) - continue; - if (isa<LoadInst>(&I)) - LoadedValues.insert(&I); - else - Stores.push_back(cast<StoreInst>(&I)); - } - } - // Try to find an unroll count that maximizes the use of the instruction // window, i.e. trying to fetch as many instructions per cycle as possible. 
unsigned MaxInstsPerLine = 16; @@ -4902,8 +4953,32 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE, UC++; } - if (BestUC == 1 || none_of(Stores, [&LoadedValues](StoreInst *SI) { - return LoadedValues.contains(SI->getOperand(0)); + if (BestUC == 1) + return; + + SmallPtrSet<Value *, 8> LoadedValuesPlus; + SmallVector<StoreInst *> Stores; + for (auto *BB : L->blocks()) { + for (auto &I : *BB) { + Value *Ptr = getLoadStorePointerOperand(&I); + if (!Ptr) + continue; + const SCEV *PtrSCEV = SE.getSCEV(Ptr); + if (SE.isLoopInvariant(PtrSCEV, L)) + continue; + if (isa<LoadInst>(&I)) { + LoadedValuesPlus.insert(&I); + // Include in-loop 1st users of loaded values. + for (auto *U : I.users()) + if (L->contains(cast<Instruction>(U))) + LoadedValuesPlus.insert(U); + } else + Stores.push_back(cast<StoreInst>(&I)); + } + } + + if (none_of(Stores, [&LoadedValuesPlus](StoreInst *SI) { + return LoadedValuesPlus.contains(SI->getOperand(0)); })) return; @@ -4915,7 +4990,6 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE, // Try to runtime-unroll loops with early-continues depending on loop-varying // loads; this helps with branch-prediction for the early-continues. auto *Term = dyn_cast<BranchInst>(Header->getTerminator()); - auto *Latch = L->getLoopLatch(); SmallVector<BasicBlock *> Preds(predecessors(Latch)); if (!Term || !Term->isConditional() || Preds.size() == 1 || !llvm::is_contained(Preds, Header) || @@ -5151,6 +5225,8 @@ bool AArch64TTIImpl::isLegalToVectorizeReduction( return false; switch (RdxDesc.getRecurrenceKind()) { + case RecurKind::Sub: + case RecurKind::AddChainWithSubs: case RecurKind::Add: case RecurKind::FAdd: case RecurKind::And: diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index 7f45177..42ae962 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -221,6 +221,11 @@ public: unsigned Index) const override; InstructionCost + getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, + TTI::TargetCostKind CostKind, + unsigned Index) const override; + + InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override; @@ -238,8 +243,9 @@ public: ArrayRef<const Value *> Args = {}, const Instruction *CxtI = nullptr) const override; - InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *SE, - const SCEV *Ptr) const override; + InstructionCost + getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr, + TTI::TargetCostKind CostKind) const override; InstructionCost getCmpSelInstrCost( unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, @@ -435,6 +441,14 @@ public: bool preferPredicatedReductionSelect() const override { return ST->hasSVE(); } + /// FP16 and BF16 operations are lowered to fptrunc(op(fpext, fpext) if the + /// architecture features are not present. 
+ std::optional<InstructionCost> + getFP16BF16PromoteCost(Type *Ty, TTI::TargetCostKind CostKind, + TTI::OperandValueInfo Op1Info, + TTI::OperandValueInfo Op2Info, bool IncludeTrunc, + std::function<InstructionCost(Type *)> InstCost) const; + InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional<FastMathFlags> FMF, diff --git a/llvm/lib/Target/AArch64/CMakeLists.txt b/llvm/lib/Target/AArch64/CMakeLists.txt index 66136a4..803943f 100644 --- a/llvm/lib/Target/AArch64/CMakeLists.txt +++ b/llvm/lib/Target/AArch64/CMakeLists.txt @@ -89,6 +89,7 @@ add_llvm_target(AArch64CodeGen SMEABIPass.cpp SMEPeepholeOpt.cpp SVEIntrinsicOpts.cpp + MachineSMEABIPass.cpp AArch64SIMDInstrOpt.cpp DEPENDS diff --git a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp index 2155ace..79bef76 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp @@ -128,9 +128,9 @@ struct AArch64OutgoingValueAssigner if (!Flags.isVarArg() && !UseVarArgsCCForFixed) { if (!IsReturn) applyStackPassedSmallTypeDAGHack(OrigVT, ValVT, LocVT); - Res = AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State); + Res = AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, Info.Ty, State); } else - Res = AssignFnVarArg(ValNo, ValVT, LocVT, LocInfo, Flags, State); + Res = AssignFnVarArg(ValNo, ValVT, LocVT, LocInfo, Flags, Info.Ty, State); StackSize = State.getStackSize(); return Res; diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp index f359731..ee34a85 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp @@ -1349,7 +1349,9 @@ AArch64InstructionSelector::emitSelect(Register Dst, Register True, return &*SelectInst; } -static AArch64CC::CondCode changeICMPPredToAArch64CC(CmpInst::Predicate P) { +static AArch64CC::CondCode +changeICMPPredToAArch64CC(CmpInst::Predicate P, Register RHS = {}, + MachineRegisterInfo *MRI = nullptr) { switch (P) { default: llvm_unreachable("Unknown condition code!"); @@ -1360,8 +1362,18 @@ static AArch64CC::CondCode changeICMPPredToAArch64CC(CmpInst::Predicate P) { case CmpInst::ICMP_SGT: return AArch64CC::GT; case CmpInst::ICMP_SGE: + if (RHS && MRI) { + auto ValAndVReg = getIConstantVRegValWithLookThrough(RHS, *MRI); + if (ValAndVReg && ValAndVReg->Value == 0) + return AArch64CC::PL; + } return AArch64CC::GE; case CmpInst::ICMP_SLT: + if (RHS && MRI) { + auto ValAndVReg = getIConstantVRegValWithLookThrough(RHS, *MRI); + if (ValAndVReg && ValAndVReg->Value == 0) + return AArch64CC::MI; + } return AArch64CC::LT; case CmpInst::ICMP_SLE: return AArch64CC::LE; @@ -1813,7 +1825,8 @@ bool AArch64InstructionSelector::selectCompareBranchFedByICmp( auto &PredOp = ICmp.getOperand(1); emitIntegerCompare(ICmp.getOperand(2), ICmp.getOperand(3), PredOp, MIB); const AArch64CC::CondCode CC = changeICMPPredToAArch64CC( - static_cast<CmpInst::Predicate>(PredOp.getPredicate())); + static_cast<CmpInst::Predicate>(PredOp.getPredicate()), + ICmp.getOperand(3).getReg(), MIB.getMRI()); MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC).addMBB(DestMBB); I.eraseFromParent(); return true; @@ -2510,8 +2523,8 @@ bool AArch64InstructionSelector::earlySelect(MachineInstr &I) { emitIntegerCompare(/*LHS=*/Cmp->getOperand(2), /*RHS=*/Cmp->getOperand(3), PredOp, MIB); auto Pred = 
static_cast<CmpInst::Predicate>(PredOp.getPredicate()); - const AArch64CC::CondCode InvCC = - changeICMPPredToAArch64CC(CmpInst::getInversePredicate(Pred)); + const AArch64CC::CondCode InvCC = changeICMPPredToAArch64CC( + CmpInst::getInversePredicate(Pred), Cmp->getOperand(3).getReg(), &MRI); emitCSINC(/*Dst=*/AddDst, /*Src =*/AddLHS, /*Src2=*/AddLHS, InvCC, MIB); I.eraseFromParent(); return true; @@ -3577,8 +3590,8 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { auto &PredOp = I.getOperand(1); emitIntegerCompare(I.getOperand(2), I.getOperand(3), PredOp, MIB); auto Pred = static_cast<CmpInst::Predicate>(PredOp.getPredicate()); - const AArch64CC::CondCode InvCC = - changeICMPPredToAArch64CC(CmpInst::getInversePredicate(Pred)); + const AArch64CC::CondCode InvCC = changeICMPPredToAArch64CC( + CmpInst::getInversePredicate(Pred), I.getOperand(3).getReg(), &MRI); emitCSINC(/*Dst=*/I.getOperand(0).getReg(), /*Src1=*/AArch64::WZR, /*Src2=*/AArch64::WZR, InvCC, MIB); I.eraseFromParent(); @@ -4931,7 +4944,7 @@ MachineInstr *AArch64InstructionSelector::emitConjunctionRec( if (Negate) CC = CmpInst::getInversePredicate(CC); if (isa<GICmp>(Cmp)) { - OutCC = changeICMPPredToAArch64CC(CC); + OutCC = changeICMPPredToAArch64CC(CC, RHS, MIB.getMRI()); } else { // Handle special FP cases. AArch64CC::CondCode ExtraCC; @@ -5101,7 +5114,8 @@ bool AArch64InstructionSelector::tryOptSelect(GSelect &I) { emitIntegerCompare(CondDef->getOperand(2), CondDef->getOperand(3), PredOp, MIB); auto Pred = static_cast<CmpInst::Predicate>(PredOp.getPredicate()); - CondCode = changeICMPPredToAArch64CC(Pred); + CondCode = + changeICMPPredToAArch64CC(Pred, CondDef->getOperand(3).getReg(), &MRI); } else { // Get the condition code for the select. auto Pred = diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp index 3ba08c8..6025f1c 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp @@ -614,8 +614,7 @@ tryAdjustICmpImmAndPred(Register RHS, CmpInst::Predicate P, // x uge c => x ugt c - 1 // // When c is not zero. - if (C == 0) - return std::nullopt; + assert(C != 0 && "C should not be zero here!"); P = (P == CmpInst::ICMP_ULT) ? 
CmpInst::ICMP_ULE : CmpInst::ICMP_UGT; C -= 1; break; @@ -656,14 +655,13 @@ tryAdjustICmpImmAndPred(Register RHS, CmpInst::Predicate P, if (isLegalArithImmed(C)) return {{C, P}}; - auto IsMaterializableInSingleInstruction = [=](uint64_t Imm) { + auto NumberOfInstrToLoadImm = [=](uint64_t Imm) { SmallVector<AArch64_IMM::ImmInsnModel> Insn; AArch64_IMM::expandMOVImm(Imm, 32, Insn); - return Insn.size() == 1; + return Insn.size(); }; - if (!IsMaterializableInSingleInstruction(OriginalC) && - IsMaterializableInSingleInstruction(C)) + if (NumberOfInstrToLoadImm(OriginalC) > NumberOfInstrToLoadImm(C)) return {{C, P}}; return std::nullopt; diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp index 45ac023..a388216 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp @@ -40,6 +40,7 @@ protected: bool IsPCRel) const override; bool needsRelocateWithSymbol(const MCValue &, unsigned Type) const override; bool isNonILP32reloc(const MCFixup &Fixup, AArch64::Specifier RefKind) const; + void sortRelocs(std::vector<ELFRelocationEntry> &Relocs) override; bool IsILP32; }; @@ -498,6 +499,17 @@ bool AArch64ELFObjectWriter::needsRelocateWithSymbol(const MCValue &Val, Val.getSpecifier()); } +void AArch64ELFObjectWriter::sortRelocs( + std::vector<ELFRelocationEntry> &Relocs) { + // PATCHINST relocations should be applied last because they may overwrite the + // whole instruction and so should take precedence over other relocations that + // modify operands of the original instruction. + std::stable_partition(Relocs.begin(), Relocs.end(), + [](const ELFRelocationEntry &R) { + return R.Type != ELF::R_AARCH64_PATCHINST; + }); +} + std::unique_ptr<MCObjectTargetWriter> llvm::createAArch64ELFObjectWriter(uint8_t OSABI, bool IsILP32) { return std::make_unique<AArch64ELFObjectWriter>(OSABI, IsILP32); } diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp index 14547e3..917dbdf 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp @@ -35,7 +35,6 @@ #include "llvm/MC/MCTargetOptions.h" #include "llvm/MC/MCWinCOFFStreamer.h" #include "llvm/Support/AArch64BuildAttributes.h" -#include "llvm/Support/Casting.h" #include "llvm/Support/FormattedStream.h" #include "llvm/Support/raw_ostream.h" diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp index 3c8b571..54b58e9 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp @@ -1017,14 +1017,22 @@ bool AArch64InstPrinter::printSysAlias(const MCInst *MI, else return false; + StringRef Reg = getRegisterName(MI->getOperand(4).getReg()); + bool NotXZR = Reg != "xzr"; + + // If the TableGen definition does not mandate a register operand + // (i.e. no register operand should be present), but the register value + // is not xzr/x31, then fall back to the generic SYS form instead. 
+ if (NotXZR && !NeedsReg) + return false; + std::string Str = Ins + Name; llvm::transform(Str, Str.begin(), ::tolower); O << '\t' << Str; - if (NeedsReg) { - O << ", "; - printRegName(O, MI->getOperand(4).getReg()); - } + + if (NeedsReg) + O << ", " << Reg; return true; } diff --git a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp new file mode 100644 index 0000000..b58dfdf --- /dev/null +++ b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp @@ -0,0 +1,696 @@ +//===- MachineSMEABIPass.cpp ----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass implements the SME ABI requirements for ZA state. This includes +// implementing the lazy ZA state save schemes around calls. +// +//===----------------------------------------------------------------------===// +// +// This pass works by collecting instructions that require ZA to be in a +// specific state (e.g., "ACTIVE" or "SAVED") and inserting the necessary state +// transitions to ensure ZA is in the required state before instructions. State +// transitions represent actions such as setting up or restoring a lazy save. +// Certain points within a function may also have predefined states independent +// of any instructions, for example, a "shared_za" function is always entered +// and exited in the "ACTIVE" state. +// +// To handle ZA state across control flow, we make use of edge bundling. This +// assigns each block an "incoming" and "outgoing" edge bundle (representing +// incoming and outgoing edges). Initially, these are unique to each block; +// then, in the process of forming bundles, the outgoing bundle of a block is +// joined with the incoming bundle of all successors. The result is that each +// bundle can be assigned a single ZA state, which ensures the state required by +// all of a block's successors is the same, and that each basic block will always +// be entered with the same ZA state. This eliminates the need for splitting +// edges to insert state transitions or "phi" nodes for ZA states. +// +// See below for a simple example of edge bundling. +// +// The following shows a conditionally executed basic block (BB1): +// +// if (cond) +// BB1 +// BB2 +// +// Initial Bundles Joined Bundles +// +// ┌──0──┐ ┌──0──┐ +// │ BB0 │ │ BB0 │ +// └──1──┘ └──1──┘ +// ├───────┐ ├───────┐ +// ▼ │ ▼ │ +// ┌──2──┐ │ ─────► ┌──1──┐ │ +// │ BB1 │ ▼ │ BB1 │ ▼ +// └──3──┘ ┌──4──┐ └──1──┘ ┌──1──┐ +// └───►4 BB2 │ └───►1 BB2 │ +// └──5──┘ └──2──┘ +// +// On the left are the initial per-block bundles, and on the right are the +// joined bundles (which are the result of the EdgeBundles analysis). 
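Editor's note: to make the bundle-joining step described in the comment above concrete, here is a small stand-alone sketch. It is not LLVM's EdgeBundles implementation (which the pass reuses), just a plain union-find over 2*N bundle ids that mirrors the "Initial Bundles" to "Joined Bundles" transition in the diagram; the block and bundle numbering is illustrative.

// Stand-alone illustration of edge bundling, assuming bundle ids
//   2*B   = incoming bundle of block B,
//   2*B+1 = outgoing bundle of block B.
#include <cassert>
#include <numeric>
#include <vector>

struct BundleUnionFind {
  std::vector<unsigned> Parent;
  explicit BundleUnionFind(unsigned NumBlocks) : Parent(2 * NumBlocks) {
    // Initially every incoming/outgoing bundle is unique to its block.
    std::iota(Parent.begin(), Parent.end(), 0u);
  }
  unsigned find(unsigned X) {
    while (Parent[X] != X)
      X = Parent[X] = Parent[Parent[X]]; // Path halving.
    return X;
  }
  // For a CFG edge Pred -> Succ, merge Pred's outgoing bundle with Succ's
  // incoming bundle, so every edge meeting at that point shares one bundle
  // (and, later, one ZA state).
  void joinEdge(unsigned Pred, unsigned Succ) {
    Parent[find(2 * Pred + 1)] = find(2 * Succ);
  }
  unsigned getBundle(unsigned Block, bool Out) {
    return find(2 * Block + (Out ? 1 : 0));
  }
};

int main() {
  BundleUnionFind B(3); // BB0, BB1, BB2 from the diagram above.
  B.joinEdge(0, 1);     // BB0 -> BB1
  B.joinEdge(0, 2);     // BB0 -> BB2
  B.joinEdge(1, 2);     // BB1 -> BB2
  // BB0's outgoing bundle, BB1's incoming/outgoing bundles and BB2's incoming
  // bundle have collapsed into a single bundle, as in the joined diagram.
  assert(B.getBundle(0, true) == B.getBundle(1, false));
  assert(B.getBundle(1, true) == B.getBundle(2, false));
  return 0;
}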
+ +#include "AArch64InstrInfo.h" +#include "AArch64MachineFunctionInfo.h" +#include "AArch64Subtarget.h" +#include "MCTargetDesc/AArch64AddressingModes.h" +#include "llvm/ADT/BitmaskEnum.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/EdgeBundles.h" +#include "llvm/CodeGen/LivePhysRegs.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-machine-sme-abi" + +namespace { + +enum ZAState { + // Any/unknown state (not valid) + ANY = 0, + + // ZA is in use and active (i.e. within the accumulator) + ACTIVE, + + // A ZA save has been set up or committed (i.e. ZA is dormant or off) + LOCAL_SAVED, + + // ZA is off or a lazy save has been set up by the caller + CALLER_DORMANT, + + // ZA is off + OFF, + + // The number of ZA states (not a valid state) + NUM_ZA_STATE +}; + +/// A bitmask enum to record live physical registers that the "emit*" routines +/// may need to preserve. Note: This only tracks registers we may clobber. +enum LiveRegs : uint8_t { + None = 0, + NZCV = 1 << 0, + W0 = 1 << 1, + W0_HI = 1 << 2, + X0 = W0 | W0_HI, + LLVM_MARK_AS_BITMASK_ENUM(/* LargestValue = */ W0_HI) +}; + +/// Holds the virtual registers live physical registers have been saved to. +struct PhysRegSave { + LiveRegs PhysLiveRegs; + Register StatusFlags = AArch64::NoRegister; + Register X0Save = AArch64::NoRegister; +}; + +static bool isLegalEdgeBundleZAState(ZAState State) { + switch (State) { + case ZAState::ACTIVE: + case ZAState::LOCAL_SAVED: + return true; + default: + return false; + } +} +struct TPIDR2State { + int FrameIndex = -1; +}; + +StringRef getZAStateString(ZAState State) { +#define MAKE_CASE(V) \ + case V: \ + return #V; + switch (State) { + MAKE_CASE(ZAState::ANY) + MAKE_CASE(ZAState::ACTIVE) + MAKE_CASE(ZAState::LOCAL_SAVED) + MAKE_CASE(ZAState::CALLER_DORMANT) + MAKE_CASE(ZAState::OFF) + default: + llvm_unreachable("Unexpected ZAState"); + } +#undef MAKE_CASE +} + +static bool isZAorZT0RegOp(const TargetRegisterInfo &TRI, + const MachineOperand &MO) { + if (!MO.isReg() || !MO.getReg().isPhysical()) + return false; + return any_of(TRI.subregs_inclusive(MO.getReg()), [](const MCPhysReg &SR) { + return AArch64::MPR128RegClass.contains(SR) || + AArch64::ZTRRegClass.contains(SR); + }); +} + +/// Returns the required ZA state needed before \p MI and an iterator pointing +/// to where any code required to change the ZA state should be inserted. +static std::pair<ZAState, MachineBasicBlock::iterator> +getZAStateBeforeInst(const TargetRegisterInfo &TRI, MachineInstr &MI, + bool ZAOffAtReturn) { + MachineBasicBlock::iterator InsertPt(MI); + + if (MI.getOpcode() == AArch64::InOutZAUsePseudo) + return {ZAState::ACTIVE, std::prev(InsertPt)}; + + if (MI.getOpcode() == AArch64::RequiresZASavePseudo) + return {ZAState::LOCAL_SAVED, std::prev(InsertPt)}; + + if (MI.isReturn()) + return {ZAOffAtReturn ? 
ZAState::OFF : ZAState::ACTIVE, InsertPt}; + + for (auto &MO : MI.operands()) { + if (isZAorZT0RegOp(TRI, MO)) + return {ZAState::ACTIVE, InsertPt}; + } + + return {ZAState::ANY, InsertPt}; +} + +struct MachineSMEABI : public MachineFunctionPass { + inline static char ID = 0; + + MachineSMEABI() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &MF) override; + + StringRef getPassName() const override { return "Machine SME ABI pass"; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired<EdgeBundlesWrapperLegacy>(); + AU.addPreservedID(MachineLoopInfoID); + AU.addPreservedID(MachineDominatorsID); + MachineFunctionPass::getAnalysisUsage(AU); + } + + /// Collects the needed ZA state (and live registers) before each instruction + /// within the machine function. + void collectNeededZAStates(SMEAttrs); + + /// Assigns each edge bundle a ZA state based on the needed states of blocks + /// that have incoming or outgoing edges in that bundle. + void assignBundleZAStates(); + + /// Inserts code to handle changes between ZA states within the function. + /// E.g., ACTIVE -> LOCAL_SAVED will insert code required to save ZA. + void insertStateChanges(); + + // Emission routines for private and shared ZA functions (using lazy saves). + void emitNewZAPrologue(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI); + void emitRestoreLazySave(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + LiveRegs PhysLiveRegs); + void emitSetupLazySave(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI); + void emitAllocateLazySaveBuffer(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI); + void emitZAOff(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + bool ClearTPIDR2); + + void emitStateChange(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + ZAState From, ZAState To, LiveRegs PhysLiveRegs); + + /// Save live physical registers to virtual registers. + PhysRegSave createPhysRegSave(LiveRegs PhysLiveRegs, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc DL); + /// Restore physical registers from a save of their previous values. + void restorePhyRegSave(PhysRegSave const &RegSave, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc DL); + + /// Get or create a TPIDR2 block in this function. + TPIDR2State getTPIDR2Block(); + +private: + /// Contains the needed ZA state (and live registers) at an instruction. + struct InstInfo { + ZAState NeededState{ZAState::ANY}; + MachineBasicBlock::iterator InsertPt; + LiveRegs PhysLiveRegs = LiveRegs::None; + }; + + /// Contains the needed ZA state for each instruction in a block. + /// Instructions that do not require a ZA state are not recorded. + struct BlockInfo { + ZAState FixedEntryState{ZAState::ANY}; + SmallVector<InstInfo> Insts; + LiveRegs PhysLiveRegsAtExit = LiveRegs::None; + }; + + // All pass state that must be cleared between functions. 
+ struct PassState { + SmallVector<BlockInfo> Blocks; + SmallVector<ZAState> BundleStates; + std::optional<TPIDR2State> TPIDR2Block; + } State; + + MachineFunction *MF = nullptr; + EdgeBundles *Bundles = nullptr; + const AArch64Subtarget *Subtarget = nullptr; + const AArch64RegisterInfo *TRI = nullptr; + const TargetInstrInfo *TII = nullptr; + MachineRegisterInfo *MRI = nullptr; +}; + +void MachineSMEABI::collectNeededZAStates(SMEAttrs SMEFnAttrs) { + assert((SMEFnAttrs.hasZT0State() || SMEFnAttrs.hasZAState()) && + "Expected function to have ZA/ZT0 state!"); + + State.Blocks.resize(MF->getNumBlockIDs()); + for (MachineBasicBlock &MBB : *MF) { + BlockInfo &Block = State.Blocks[MBB.getNumber()]; + if (&MBB == &MF->front()) { + // Entry block: + Block.FixedEntryState = SMEFnAttrs.hasPrivateZAInterface() + ? ZAState::CALLER_DORMANT + : ZAState::ACTIVE; + } else if (MBB.isEHPad()) { + // EH entry block: + Block.FixedEntryState = ZAState::LOCAL_SAVED; + } + + LiveRegUnits LiveUnits(*TRI); + LiveUnits.addLiveOuts(MBB); + + auto GetPhysLiveRegs = [&] { + LiveRegs PhysLiveRegs = LiveRegs::None; + if (!LiveUnits.available(AArch64::NZCV)) + PhysLiveRegs |= LiveRegs::NZCV; + // We have to track W0 and X0 separately as otherwise things can get + // confused if we attempt to preserve X0 but only W0 was defined. + if (!LiveUnits.available(AArch64::W0)) + PhysLiveRegs |= LiveRegs::W0; + if (!LiveUnits.available(AArch64::W0_HI)) + PhysLiveRegs |= LiveRegs::W0_HI; + return PhysLiveRegs; + }; + + Block.PhysLiveRegsAtExit = GetPhysLiveRegs(); + auto FirstTerminatorInsertPt = MBB.getFirstTerminator(); + for (MachineInstr &MI : reverse(MBB)) { + MachineBasicBlock::iterator MBBI(MI); + LiveUnits.stepBackward(MI); + LiveRegs PhysLiveRegs = GetPhysLiveRegs(); + auto [NeededState, InsertPt] = getZAStateBeforeInst( + *TRI, MI, /*ZAOffAtReturn=*/SMEFnAttrs.hasPrivateZAInterface()); + assert((InsertPt == MBBI || + InsertPt->getOpcode() == AArch64::ADJCALLSTACKDOWN) && + "Unexpected state change insertion point!"); + // TODO: Do something to avoid state changes where NZCV is live. + if (MBBI == FirstTerminatorInsertPt) + Block.PhysLiveRegsAtExit = PhysLiveRegs; + if (NeededState != ZAState::ANY) + Block.Insts.push_back({NeededState, InsertPt, PhysLiveRegs}); + } + + // Reverse vector (as we had to iterate backwards for liveness). + std::reverse(Block.Insts.begin(), Block.Insts.end()); + } +} + +void MachineSMEABI::assignBundleZAStates() { + State.BundleStates.resize(Bundles->getNumBundles()); + for (unsigned I = 0, E = Bundles->getNumBundles(); I != E; ++I) { + LLVM_DEBUG(dbgs() << "Assigning ZA state for edge bundle: " << I << '\n'); + + // Attempt to assign a ZA state for this bundle that minimizes state + // transitions. Edges within loops are given a higher weight as we assume + // they will be executed more than once. + // TODO: We should propagate desired incoming/outgoing states through blocks + // that have the "ANY" state first to make better global decisions. + int EdgeStateCounts[ZAState::NUM_ZA_STATE] = {0}; + for (unsigned BlockID : Bundles->getBlocks(I)) { + LLVM_DEBUG(dbgs() << "- bb." 
<< BlockID); + + const BlockInfo &Block = State.Blocks[BlockID]; + if (Block.Insts.empty()) { + LLVM_DEBUG(dbgs() << " (no state preference)\n"); + continue; + } + bool InEdge = Bundles->getBundle(BlockID, /*Out=*/false) == I; + bool OutEdge = Bundles->getBundle(BlockID, /*Out=*/true) == I; + + ZAState DesiredIncomingState = Block.Insts.front().NeededState; + if (InEdge && isLegalEdgeBundleZAState(DesiredIncomingState)) { + EdgeStateCounts[DesiredIncomingState]++; + LLVM_DEBUG(dbgs() << " DesiredIncomingState: " + << getZAStateString(DesiredIncomingState)); + } + ZAState DesiredOutgoingState = Block.Insts.back().NeededState; + if (OutEdge && isLegalEdgeBundleZAState(DesiredOutgoingState)) { + EdgeStateCounts[DesiredOutgoingState]++; + LLVM_DEBUG(dbgs() << " DesiredOutgoingState: " + << getZAStateString(DesiredOutgoingState)); + } + LLVM_DEBUG(dbgs() << '\n'); + } + + ZAState BundleState = + ZAState(max_element(EdgeStateCounts) - EdgeStateCounts); + + // Force ZA to be active in bundles that don't have a preferred state. + // TODO: Something better here (to avoid extra mode switches). + if (BundleState == ZAState::ANY) + BundleState = ZAState::ACTIVE; + + LLVM_DEBUG({ + dbgs() << "Chosen ZA state: " << getZAStateString(BundleState) << '\n' + << "Edge counts:"; + for (auto [State, Count] : enumerate(EdgeStateCounts)) + dbgs() << " " << getZAStateString(ZAState(State)) << ": " << Count; + dbgs() << "\n\n"; + }); + + State.BundleStates[I] = BundleState; + } +} + +void MachineSMEABI::insertStateChanges() { + for (MachineBasicBlock &MBB : *MF) { + const BlockInfo &Block = State.Blocks[MBB.getNumber()]; + ZAState InState = State.BundleStates[Bundles->getBundle(MBB.getNumber(), + /*Out=*/false)]; + + ZAState CurrentState = Block.FixedEntryState; + if (CurrentState == ZAState::ANY) + CurrentState = InState; + + for (auto &Inst : Block.Insts) { + if (CurrentState != Inst.NeededState) + emitStateChange(MBB, Inst.InsertPt, CurrentState, Inst.NeededState, + Inst.PhysLiveRegs); + CurrentState = Inst.NeededState; + } + + if (MBB.succ_empty()) + continue; + + ZAState OutState = + State.BundleStates[Bundles->getBundle(MBB.getNumber(), /*Out=*/true)]; + if (CurrentState != OutState) + emitStateChange(MBB, MBB.getFirstTerminator(), CurrentState, OutState, + Block.PhysLiveRegsAtExit); + } +} + +TPIDR2State MachineSMEABI::getTPIDR2Block() { + if (State.TPIDR2Block) + return *State.TPIDR2Block; + MachineFrameInfo &MFI = MF->getFrameInfo(); + State.TPIDR2Block = TPIDR2State{MFI.CreateStackObject(16, Align(16), false)}; + return *State.TPIDR2Block; +} + +static DebugLoc getDebugLoc(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) { + if (MBBI != MBB.end()) + return MBBI->getDebugLoc(); + return DebugLoc(); +} + +void MachineSMEABI::emitSetupLazySave(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) { + DebugLoc DL = getDebugLoc(MBB, MBBI); + + // Get pointer to TPIDR2 block. + Register TPIDR2 = MRI->createVirtualRegister(&AArch64::GPR64spRegClass); + Register TPIDR2Ptr = MRI->createVirtualRegister(&AArch64::GPR64RegClass); + BuildMI(MBB, MBBI, DL, TII->get(AArch64::ADDXri), TPIDR2) + .addFrameIndex(getTPIDR2Block().FrameIndex) + .addImm(0) + .addImm(0); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), TPIDR2Ptr) + .addReg(TPIDR2); + // Set TPIDR2_EL0 to point to TPIDR2 block. 
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::MSR)) + .addImm(AArch64SysReg::TPIDR2_EL0) + .addReg(TPIDR2Ptr); +} + +PhysRegSave MachineSMEABI::createPhysRegSave(LiveRegs PhysLiveRegs, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL) { + PhysRegSave RegSave{PhysLiveRegs}; + if (PhysLiveRegs & LiveRegs::NZCV) { + RegSave.StatusFlags = MRI->createVirtualRegister(&AArch64::GPR64RegClass); + BuildMI(MBB, MBBI, DL, TII->get(AArch64::MRS), RegSave.StatusFlags) + .addImm(AArch64SysReg::NZCV) + .addReg(AArch64::NZCV, RegState::Implicit); + } + // Note: Preserving X0 is "free" as this is before register allocation, so + // the register allocator is still able to optimize these copies. + if (PhysLiveRegs & LiveRegs::W0) { + RegSave.X0Save = MRI->createVirtualRegister(PhysLiveRegs & LiveRegs::W0_HI + ? &AArch64::GPR64RegClass + : &AArch64::GPR32RegClass); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), RegSave.X0Save) + .addReg(PhysLiveRegs & LiveRegs::W0_HI ? AArch64::X0 : AArch64::W0); + } + return RegSave; +} + +void MachineSMEABI::restorePhyRegSave(PhysRegSave const &RegSave, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL) { + if (RegSave.StatusFlags != AArch64::NoRegister) + BuildMI(MBB, MBBI, DL, TII->get(AArch64::MSR)) + .addImm(AArch64SysReg::NZCV) + .addReg(RegSave.StatusFlags) + .addReg(AArch64::NZCV, RegState::ImplicitDefine); + + if (RegSave.X0Save != AArch64::NoRegister) + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), + RegSave.PhysLiveRegs & LiveRegs::W0_HI ? AArch64::X0 : AArch64::W0) + .addReg(RegSave.X0Save); +} + +void MachineSMEABI::emitRestoreLazySave(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + LiveRegs PhysLiveRegs) { + auto *TLI = Subtarget->getTargetLowering(); + DebugLoc DL = getDebugLoc(MBB, MBBI); + Register TPIDR2EL0 = MRI->createVirtualRegister(&AArch64::GPR64RegClass); + Register TPIDR2 = AArch64::X0; + + // TODO: Emit these within the restore MBB to prevent unnecessary saves. + PhysRegSave RegSave = createPhysRegSave(PhysLiveRegs, MBB, MBBI, DL); + + // Enable ZA. + BuildMI(MBB, MBBI, DL, TII->get(AArch64::MSRpstatesvcrImm1)) + .addImm(AArch64SVCR::SVCRZA) + .addImm(1); + // Get current TPIDR2_EL0. + BuildMI(MBB, MBBI, DL, TII->get(AArch64::MRS), TPIDR2EL0) + .addImm(AArch64SysReg::TPIDR2_EL0); + // Get pointer to TPIDR2 block. + BuildMI(MBB, MBBI, DL, TII->get(AArch64::ADDXri), TPIDR2) + .addFrameIndex(getTPIDR2Block().FrameIndex) + .addImm(0) + .addImm(0); + // (Conditionally) restore ZA state. + BuildMI(MBB, MBBI, DL, TII->get(AArch64::RestoreZAPseudo)) + .addReg(TPIDR2EL0) + .addReg(TPIDR2) + .addExternalSymbol(TLI->getLibcallName(RTLIB::SMEABI_TPIDR2_RESTORE)) + .addRegMask(TRI->SMEABISupportRoutinesCallPreservedMaskFromX0()); + // Zero TPIDR2_EL0. + BuildMI(MBB, MBBI, DL, TII->get(AArch64::MSR)) + .addImm(AArch64SysReg::TPIDR2_EL0) + .addReg(AArch64::XZR); + + restorePhyRegSave(RegSave, MBB, MBBI, DL); +} + +void MachineSMEABI::emitZAOff(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + bool ClearTPIDR2) { + DebugLoc DL = getDebugLoc(MBB, MBBI); + + if (ClearTPIDR2) + BuildMI(MBB, MBBI, DL, TII->get(AArch64::MSR)) + .addImm(AArch64SysReg::TPIDR2_EL0) + .addReg(AArch64::XZR); + + // Disable ZA. 
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::MSRpstatesvcrImm1)) + .addImm(AArch64SVCR::SVCRZA) + .addImm(0); +} + +void MachineSMEABI::emitAllocateLazySaveBuffer( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) { + MachineFrameInfo &MFI = MF->getFrameInfo(); + + DebugLoc DL = getDebugLoc(MBB, MBBI); + Register SP = MRI->createVirtualRegister(&AArch64::GPR64RegClass); + Register SVL = MRI->createVirtualRegister(&AArch64::GPR64RegClass); + Register Buffer = MRI->createVirtualRegister(&AArch64::GPR64RegClass); + + // Calculate SVL. + BuildMI(MBB, MBBI, DL, TII->get(AArch64::RDSVLI_XI), SVL).addImm(1); + + // 1. Allocate the lazy save buffer. + { + // TODO This function grows the stack with a subtraction, which doesn't work + // on Windows. Some refactoring to share the functionality in + // LowerWindowsDYNAMIC_STACKALLOC will be required once the Windows ABI + // supports SME + assert(!Subtarget->isTargetWindows() && + "Lazy ZA save is not yet supported on Windows"); + // Get original stack pointer. + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), SP) + .addReg(AArch64::SP); + // Allocate a lazy-save buffer object of the size given, normally SVL * SVL + BuildMI(MBB, MBBI, DL, TII->get(AArch64::MSUBXrrr), Buffer) + .addReg(SVL) + .addReg(SVL) + .addReg(SP); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), AArch64::SP) + .addReg(Buffer); + // We have just allocated a variable sized object, tell this to PEI. + MFI.CreateVariableSizedObject(Align(16), nullptr); + } + + // 2. Setup the TPIDR2 block. + { + // Note: This case just needs to do `SVL << 48`. It is not implemented as we + // generally don't support big-endian SVE/SME. + if (!Subtarget->isLittleEndian()) + reportFatalInternalError( + "TPIDR2 block initialization is not supported on big-endian targets"); + + // Store buffer pointer and num_za_save_slices. + // Bytes 10-15 are implicitly zeroed. + BuildMI(MBB, MBBI, DL, TII->get(AArch64::STPXi)) + .addReg(Buffer) + .addReg(SVL) + .addFrameIndex(getTPIDR2Block().FrameIndex) + .addImm(0); + } +} + +void MachineSMEABI::emitNewZAPrologue(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) { + auto *TLI = Subtarget->getTargetLowering(); + DebugLoc DL = getDebugLoc(MBB, MBBI); + + // Get current TPIDR2_EL0. + Register TPIDR2EL0 = MRI->createVirtualRegister(&AArch64::GPR64RegClass); + BuildMI(MBB, MBBI, DL, TII->get(AArch64::MRS)) + .addReg(TPIDR2EL0, RegState::Define) + .addImm(AArch64SysReg::TPIDR2_EL0); + // If TPIDR2_EL0 is non-zero, commit the lazy save. + // NOTE: Functions that only use ZT0 don't need to zero ZA. + bool ZeroZA = + MF->getInfo<AArch64FunctionInfo>()->getSMEFnAttrs().hasZAState(); + auto CommitZASave = + BuildMI(MBB, MBBI, DL, TII->get(AArch64::CommitZASavePseudo)) + .addReg(TPIDR2EL0) + .addImm(ZeroZA ? 1 : 0) + .addExternalSymbol(TLI->getLibcallName(RTLIB::SMEABI_TPIDR2_SAVE)) + .addRegMask(TRI->SMEABISupportRoutinesCallPreservedMaskFromX0()); + if (ZeroZA) + CommitZASave.addDef(AArch64::ZAB0, RegState::ImplicitDefine); + // Enable ZA (as ZA could have previously been in the OFF state). + BuildMI(MBB, MBBI, DL, TII->get(AArch64::MSRpstatesvcrImm1)) + .addImm(AArch64SVCR::SVCRZA) + .addImm(1); +} + +void MachineSMEABI::emitStateChange(MachineBasicBlock &MBB, + MachineBasicBlock::iterator InsertPt, + ZAState From, ZAState To, + LiveRegs PhysLiveRegs) { + + // ZA not used. 
+ if (From == ZAState::ANY || To == ZAState::ANY) + return; + + // If we're exiting from the CALLER_DORMANT state that means this new ZA + // function did not touch ZA (so ZA was never turned on). + if (From == ZAState::CALLER_DORMANT && To == ZAState::OFF) + return; + + // TODO: Avoid setting up the save buffer if there's no transition to + // LOCAL_SAVED. + if (From == ZAState::CALLER_DORMANT) { + assert(MBB.getParent() + ->getInfo<AArch64FunctionInfo>() + ->getSMEFnAttrs() + .hasPrivateZAInterface() && + "CALLER_DORMANT state requires private ZA interface"); + assert(&MBB == &MBB.getParent()->front() && + "CALLER_DORMANT state only valid in entry block"); + emitNewZAPrologue(MBB, MBB.getFirstNonPHI()); + if (To == ZAState::ACTIVE) + return; // Nothing more to do (ZA is active after the prologue). + + // Note: "emitNewZAPrologue" zeros ZA, so we may need to setup a lazy save + // if "To" is "ZAState::LOCAL_SAVED". It may be possible to improve this + // case by changing the placement of the zero instruction. + From = ZAState::ACTIVE; + } + + if (From == ZAState::ACTIVE && To == ZAState::LOCAL_SAVED) + emitSetupLazySave(MBB, InsertPt); + else if (From == ZAState::LOCAL_SAVED && To == ZAState::ACTIVE) + emitRestoreLazySave(MBB, InsertPt, PhysLiveRegs); + else if (To == ZAState::OFF) { + assert(From != ZAState::CALLER_DORMANT && + "CALLER_DORMANT to OFF should have already been handled"); + emitZAOff(MBB, InsertPt, /*ClearTPIDR2=*/From == ZAState::LOCAL_SAVED); + } else { + dbgs() << "Error: Transition from " << getZAStateString(From) << " to " + << getZAStateString(To) << '\n'; + llvm_unreachable("Unimplemented state transition"); + } +} + +} // end anonymous namespace + +INITIALIZE_PASS(MachineSMEABI, "aarch64-machine-sme-abi", "Machine SME ABI", + false, false) + +bool MachineSMEABI::runOnMachineFunction(MachineFunction &MF) { + if (!MF.getSubtarget<AArch64Subtarget>().hasSME()) + return false; + + auto *AFI = MF.getInfo<AArch64FunctionInfo>(); + SMEAttrs SMEFnAttrs = AFI->getSMEFnAttrs(); + if (!SMEFnAttrs.hasZAState() && !SMEFnAttrs.hasZT0State()) + return false; + + assert(MF.getRegInfo().isSSA() && "Expected to be run on SSA form!"); + + // Reset pass state. + State = PassState{}; + this->MF = &MF; + Bundles = &getAnalysis<EdgeBundlesWrapperLegacy>().getEdgeBundles(); + Subtarget = &MF.getSubtarget<AArch64Subtarget>(); + TII = Subtarget->getInstrInfo(); + TRI = Subtarget->getRegisterInfo(); + MRI = &MF.getRegInfo(); + + collectNeededZAStates(SMEFnAttrs); + assignBundleZAStates(); + insertStateChanges(); + + // Allocate save buffer (if needed). 
+ if (State.TPIDR2Block) { + MachineBasicBlock &EntryBlock = MF.front(); + emitAllocateLazySaveBuffer(EntryBlock, EntryBlock.getFirstNonPHI()); + } + + return true; +} + +FunctionPass *llvm::createMachineSMEABIPass() { return new MachineSMEABI(); } diff --git a/llvm/lib/Target/AArch64/SMEABIPass.cpp b/llvm/lib/Target/AArch64/SMEABIPass.cpp index 4af4d49..2008516 100644 --- a/llvm/lib/Target/AArch64/SMEABIPass.cpp +++ b/llvm/lib/Target/AArch64/SMEABIPass.cpp @@ -15,11 +15,16 @@ #include "AArch64.h" #include "Utils/AArch64SMEAttributes.h" #include "llvm/ADT/StringRef.h" +#include "llvm/CodeGen/TargetLowering.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicsAArch64.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" +#include "llvm/IR/RuntimeLibcalls.h" +#include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/Utils/Cloning.h" using namespace llvm; @@ -33,9 +38,13 @@ struct SMEABI : public FunctionPass { bool runOnFunction(Function &F) override; + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<TargetPassConfig>(); + } + private: bool updateNewStateFunctions(Module *M, Function *F, IRBuilder<> &Builder, - SMEAttrs FnAttrs); + SMEAttrs FnAttrs, const TargetLowering &TLI); }; } // end anonymous namespace @@ -51,14 +60,16 @@ FunctionPass *llvm::createSMEABIPass() { return new SMEABI(); } //===----------------------------------------------------------------------===// // Utility function to emit a call to __arm_tpidr2_save and clear TPIDR2_EL0. -void emitTPIDR2Save(Module *M, IRBuilder<> &Builder, bool ZT0IsUndef = false) { +void emitTPIDR2Save(Module *M, IRBuilder<> &Builder, const TargetLowering &TLI, + bool ZT0IsUndef = false) { auto &Ctx = M->getContext(); auto *TPIDR2SaveTy = FunctionType::get(Builder.getVoidTy(), {}, /*IsVarArgs=*/false); auto Attrs = AttributeList().addFnAttribute(Ctx, "aarch64_pstate_sm_compatible"); + RTLIB::Libcall LC = RTLIB::SMEABI_TPIDR2_SAVE; FunctionCallee Callee = - M->getOrInsertFunction("__arm_tpidr2_save", TPIDR2SaveTy, Attrs); + M->getOrInsertFunction(TLI.getLibcallName(LC), TPIDR2SaveTy, Attrs); CallInst *Call = Builder.CreateCall(Callee); // If ZT0 is undefined (i.e. we're at the entry of a "new_zt0" function), mark @@ -67,8 +78,7 @@ void emitTPIDR2Save(Module *M, IRBuilder<> &Builder, bool ZT0IsUndef = false) { if (ZT0IsUndef) Call->addFnAttr(Attribute::get(Ctx, "aarch64_zt0_undef")); - Call->setCallingConv( - CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0); + Call->setCallingConv(TLI.getLibcallCallingConv(LC)); // A save to TPIDR2 should be followed by clearing TPIDR2_EL0. Function *WriteIntr = @@ -98,7 +108,8 @@ void emitTPIDR2Save(Module *M, IRBuilder<> &Builder, bool ZT0IsUndef = false) { /// interface if it does not share ZA or ZT0. /// bool SMEABI::updateNewStateFunctions(Module *M, Function *F, - IRBuilder<> &Builder, SMEAttrs FnAttrs) { + IRBuilder<> &Builder, SMEAttrs FnAttrs, + const TargetLowering &TLI) { LLVMContext &Context = F->getContext(); BasicBlock *OrigBB = &F->getEntryBlock(); Builder.SetInsertPoint(&OrigBB->front()); @@ -124,7 +135,7 @@ bool SMEABI::updateNewStateFunctions(Module *M, Function *F, // Create a call __arm_tpidr2_save, which commits the lazy save. 
Builder.SetInsertPoint(&SaveBB->back()); - emitTPIDR2Save(M, Builder, /*ZT0IsUndef=*/FnAttrs.isNewZT0()); + emitTPIDR2Save(M, Builder, TLI, /*ZT0IsUndef=*/FnAttrs.isNewZT0()); // Enable pstate.za at the start of the function. Builder.SetInsertPoint(&OrigBB->front()); @@ -172,10 +183,14 @@ bool SMEABI::runOnFunction(Function &F) { if (F.isDeclaration() || F.hasFnAttribute("aarch64_expanded_pstate_za")) return false; + const TargetMachine &TM = + getAnalysis<TargetPassConfig>().getTM<TargetMachine>(); + const TargetLowering &TLI = *TM.getSubtargetImpl(F)->getTargetLowering(); + bool Changed = false; SMEAttrs FnAttrs(F); if (FnAttrs.isNewZA() || FnAttrs.isNewZT0()) - Changed |= updateNewStateFunctions(M, &F, Builder, FnAttrs); + Changed |= updateNewStateFunctions(M, &F, Builder, FnAttrs, TLI); return Changed; } diff --git a/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp b/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp index bd28716..564af67 100644 --- a/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp +++ b/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp @@ -80,16 +80,10 @@ static bool isMatchingStartStopPair(const MachineInstr *MI1, if (MI1->getOperand(4).getRegMask() != MI2->getOperand(4).getRegMask()) return false; - // This optimisation is unlikely to happen in practice for conditional - // smstart/smstop pairs as the virtual registers for pstate.sm will always - // be different. - // TODO: For this optimisation to apply to conditional smstart/smstop, - // this pass will need to do more work to remove redundant calls to - // __arm_sme_state. - // Only consider conditional start/stop pairs which read the same register - // holding the original value of pstate.sm, as some conditional start/stops - // require the state on entry to the function. + // holding the original value of pstate.sm. This is somewhat over conservative + // as all conditional streaming mode changes only look at the state on entry + // to the function. 
if (MI1->getOperand(3).isReg() && MI2->getOperand(3).isReg()) { Register Reg1 = MI1->getOperand(3).getReg(); Register Reg2 = MI2->getOperand(3).getReg(); diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index a0320f9..a3a7d0f 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -809,6 +809,11 @@ let hasNoSchedulingInfo = 1 in { Pseudo<(outs zprty:$Zd), (ins PPR3bAny:$Pg, zprty:$Zs1, zprty:$Zs2, zprty:$Zs3), []> { let FalseLanes = flags; } + + class UnpredRegImmPseudo<ZPRRegOp zprty, Operand immty> + : SVEPseudo2Instr<NAME, 0>, + Pseudo<(outs zprty:$Zd), (ins zprty:$Zs, immty:$imm), []> { + } } // @@ -1885,13 +1890,14 @@ class sve_int_perm_extract_i<string asm> let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; - let DestructiveInstType = DestructiveOther; + let DestructiveInstType = Destructive2xRegImmUnpred; let ElementSize = ElementSizeNone; let hasSideEffects = 0; } -multiclass sve_int_perm_extract_i<string asm, SDPatternOperator op> { - def NAME : sve_int_perm_extract_i<asm>; +multiclass sve_int_perm_extract_i<string asm, SDPatternOperator op, string Ps> { + def NAME : sve_int_perm_extract_i<asm>, + SVEPseudo2Instr<Ps, 1>; def : SVE_3_Op_Imm_Pat<nxv16i8, op, nxv16i8, nxv16i8, i32, imm0_255, !cast<Instruction>(NAME)>; diff --git a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp index 271094f..dd6fa16 100644 --- a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp +++ b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp @@ -7,17 +7,14 @@ //===----------------------------------------------------------------------===// #include "AArch64SMEAttributes.h" +#include "AArch64ISelLowering.h" #include "llvm/IR/InstrTypes.h" +#include "llvm/IR/RuntimeLibcalls.h" #include <cassert> using namespace llvm; -void SMEAttrs::set(unsigned M, bool Enable) { - if (Enable) - Bitmask |= M; - else - Bitmask &= ~M; - +void SMEAttrs::validate() const { // Streaming Mode Attrs assert(!(hasStreamingInterface() && hasStreamingCompatibleInterface()) && "SM_Enabled and SM_Compatible are mutually exclusive"); @@ -77,19 +74,36 @@ SMEAttrs::SMEAttrs(const AttributeList &Attrs) { Bitmask |= encodeZT0State(StateValue::New); } -void SMEAttrs::addKnownFunctionAttrs(StringRef FuncName) { +void SMEAttrs::addKnownFunctionAttrs(StringRef FuncName, + const AArch64TargetLowering &TLI) { + RTLIB::LibcallImpl Impl = TLI.getSupportedLibcallImpl(FuncName); + if (Impl == RTLIB::Unsupported) + return; unsigned KnownAttrs = SMEAttrs::Normal; - if (FuncName == "__arm_tpidr2_save" || FuncName == "__arm_sme_state") - KnownAttrs |= (SMEAttrs::SM_Compatible | SMEAttrs::SME_ABI_Routine); - if (FuncName == "__arm_tpidr2_restore") + RTLIB::Libcall LC = RTLIB::RuntimeLibcallsInfo::getLibcallFromImpl(Impl); + switch (LC) { + case RTLIB::SMEABI_SME_STATE: + case RTLIB::SMEABI_TPIDR2_SAVE: + case RTLIB::SMEABI_GET_CURRENT_VG: + case RTLIB::SMEABI_SME_STATE_SIZE: + case RTLIB::SMEABI_SME_SAVE: + case RTLIB::SMEABI_SME_RESTORE: + KnownAttrs |= SMEAttrs::SM_Compatible | SMEAttrs::SME_ABI_Routine; + break; + case RTLIB::SMEABI_ZA_DISABLE: + case RTLIB::SMEABI_TPIDR2_RESTORE: KnownAttrs |= SMEAttrs::SM_Compatible | encodeZAState(StateValue::In) | SMEAttrs::SME_ABI_Routine; - if (FuncName == "__arm_sc_memcpy" || FuncName == "__arm_sc_memset" || - FuncName == "__arm_sc_memmove" || FuncName == "__arm_sc_memchr") + break; + case RTLIB::SC_MEMCPY: + case RTLIB::SC_MEMMOVE: + 
case RTLIB::SC_MEMSET: + case RTLIB::SC_MEMCHR: KnownAttrs |= SMEAttrs::SM_Compatible; - if (FuncName == "__arm_sme_save" || FuncName == "__arm_sme_restore" || - FuncName == "__arm_sme_state_size") - KnownAttrs |= SMEAttrs::SM_Compatible | SMEAttrs::SME_ABI_Routine; + break; + default: + break; + } set(KnownAttrs); } @@ -110,11 +124,11 @@ bool SMECallAttrs::requiresSMChange() const { return true; } -SMECallAttrs::SMECallAttrs(const CallBase &CB) +SMECallAttrs::SMECallAttrs(const CallBase &CB, const AArch64TargetLowering *TLI) : CallerFn(*CB.getFunction()), CalledFn(SMEAttrs::Normal), Callsite(CB.getAttributes()), IsIndirect(CB.isIndirectCall()) { if (auto *CalledFunction = CB.getCalledFunction()) - CalledFn = SMEAttrs(*CalledFunction, SMEAttrs::InferAttrsFromName::Yes); + CalledFn = SMEAttrs(*CalledFunction, TLI); // FIXME: We probably should not allow SME attributes on direct calls but // clang duplicates streaming mode attributes at each callsite. diff --git a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h index f1be0ecb..d26e3cd 100644 --- a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h +++ b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h @@ -13,6 +13,8 @@ namespace llvm { +class AArch64TargetLowering; + class Function; class CallBase; class AttributeList; @@ -48,19 +50,27 @@ public: CallSiteFlags_Mask = ZT0_Undef }; - enum class InferAttrsFromName { No, Yes }; - SMEAttrs() = default; SMEAttrs(unsigned Mask) { set(Mask); } - SMEAttrs(const Function &F, InferAttrsFromName Infer = InferAttrsFromName::No) + SMEAttrs(const Function &F, const AArch64TargetLowering *TLI = nullptr) : SMEAttrs(F.getAttributes()) { - if (Infer == InferAttrsFromName::Yes) - addKnownFunctionAttrs(F.getName()); + if (TLI) + addKnownFunctionAttrs(F.getName(), *TLI); } SMEAttrs(const AttributeList &L); - SMEAttrs(StringRef FuncName) { addKnownFunctionAttrs(FuncName); }; + SMEAttrs(StringRef FuncName, const AArch64TargetLowering &TLI) { + addKnownFunctionAttrs(FuncName, TLI); + }; - void set(unsigned M, bool Enable = true); + void set(unsigned M, bool Enable = true) { + if (Enable) + Bitmask |= M; + else + Bitmask &= ~M; +#ifndef NDEBUG + validate(); +#endif + } // Interfaces to query PSTATE.SM bool hasStreamingBody() const { return Bitmask & SM_Body; } @@ -146,7 +156,9 @@ public: } private: - void addKnownFunctionAttrs(StringRef FuncName); + void addKnownFunctionAttrs(StringRef FuncName, + const AArch64TargetLowering &TLI); + void validate() const; }; /// SMECallAttrs is a utility class to hold the SMEAttrs for a callsite. It has @@ -163,7 +175,7 @@ public: SMEAttrs Callsite = SMEAttrs::Normal) : CallerFn(Caller), CalledFn(Callee), Callsite(Callsite) {} - SMECallAttrs(const CallBase &CB); + SMECallAttrs(const CallBase &CB, const AArch64TargetLowering *TLI); SMEAttrs &caller() { return CallerFn; } SMEAttrs &callee() { return IsIndirect ? 
Callsite : CalledFn; } @@ -194,7 +206,7 @@ public: } bool requiresEnablingZAAfterCall() const { - return requiresLazySave() || requiresDisablingZABeforeCall(); + return requiresDisablingZABeforeCall(); } bool requiresPreservingAllZAState() const { diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 007b481..0059a86 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -439,10 +439,6 @@ struct AMDGPUPrintfRuntimeBindingPass PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); }; -struct AMDGPUUnifyMetadataPass : PassInfoMixin<AMDGPUUnifyMetadataPass> { - PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); -}; - void initializeSIOptimizeExecMaskingPreRALegacyPass(PassRegistry &); extern char &SIOptimizeExecMaskingPreRAID; diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index f266398..8e4b636 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1548,7 +1548,7 @@ def FeatureGFX11 : GCNSubtargetFeatureGeneration<"GFX11", def FeatureGFX12 : GCNSubtargetFeatureGeneration<"GFX12", "gfx12", - [FeatureFP64, FeatureAddressableLocalMemorySize65536, FeatureMIMG_R128, + [FeatureFP64, FeatureMIMG_R128, FeatureFlatAddressSpace, Feature16BitInsts, FeatureInv2PiInlineImm, FeatureApertureRegs, FeatureCIInsts, FeatureGFX8Insts, FeatureGFX9Insts, FeatureGFX10Insts, @@ -1977,6 +1977,7 @@ def FeatureISAVersion11_5_3 : FeatureSet< def FeatureISAVersion12 : FeatureSet< [FeatureGFX12, + FeatureAddressableLocalMemorySize65536, FeatureLDSBankCount32, FeatureDLInsts, FeatureDot7Insts, @@ -2019,6 +2020,7 @@ def FeatureISAVersion12_50 : FeatureSet< [FeatureGFX12, FeatureGFX1250Insts, FeatureCUStores, + FeatureAddressableLocalMemorySize327680, FeatureCuMode, Feature64BitLiterals, FeatureLDSBankCount32, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 2a324e5..69722bd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -41,6 +41,7 @@ #include "llvm/MC/MCContext.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCValue.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Support/AMDHSAKernelDescriptor.h" #include "llvm/Support/Compiler.h" @@ -719,6 +720,8 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { IsLocal), RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumSGPR, OutContext, IsLocal), + RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumNamedBarrier, + OutContext, IsLocal), RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_PrivateSegSize, OutContext, IsLocal), RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_UsesVCC, OutContext, @@ -733,6 +736,9 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { OutContext, IsLocal)); } + // Emit _dvgpr$ symbol when appropriate. 
+ emitDVgprSymbol(MF); + if (isVerbose()) { MCSectionELF *CommentSection = Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0); @@ -803,6 +809,16 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { " AccumOffset: " + getMCExprStr(AdjustedAccum), false); } + if (AMDGPU::isGFX1250(STM)) { + const MCExpr *BarBlkConst = MCConstantExpr::create(4, Ctx); + const MCExpr *AlignToBlk = AMDGPUMCExpr::createAlignTo( + CurrentProgramInfo.NamedBarCnt, BarBlkConst, Ctx); + const MCExpr *BarBlks = + MCBinaryExpr::createDiv(AlignToBlk, BarBlkConst, Ctx); + OutStreamer->emitRawComment(" NamedBarCnt: " + getMCExprStr(BarBlks), + false); + } + OutStreamer->emitRawComment( " Occupancy: " + getMCExprStr(CurrentProgramInfo.Occupancy), false); @@ -875,6 +891,49 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { return false; } +// When appropriate, add a _dvgpr$ symbol, with the value of the function +// symbol, plus an offset encoding one less than the number of VGPR blocks used +// by the function in bits 5..3 of the symbol value. A "VGPR block" can be +// either 16 VGPRs (for a max of 128), or 32 VGPRs (for a max of 256). This is +// used by a front-end to have functions that are chained rather than called, +// and a dispatcher that dynamically resizes the VGPR count before dispatching +// to a function. +void AMDGPUAsmPrinter::emitDVgprSymbol(MachineFunction &MF) { + const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); + if (MFI.isDynamicVGPREnabled() && + MF.getFunction().getCallingConv() == CallingConv::AMDGPU_CS_Chain) { + MCContext &Ctx = MF.getContext(); + unsigned BlockSize = MFI.getDynamicVGPRBlockSize(); + MCValue NumVGPRs; + if (!CurrentProgramInfo.NumVGPRsForWavesPerEU->evaluateAsRelocatable( + NumVGPRs, nullptr) || + !NumVGPRs.isAbsolute()) { + llvm_unreachable("unable to resolve NumVGPRs for _dvgpr$ symbol"); + } + // Calculate number of VGPR blocks. + // Treat 0 VGPRs as 1 VGPR to avoid underflowing. + unsigned NumBlocks = + divideCeil(std::max(unsigned(NumVGPRs.getConstant()), 1U), BlockSize); + + if (NumBlocks > 8) { + OutContext.reportError({}, + "too many DVGPR blocks for _dvgpr$ symbol for '" + + Twine(CurrentFnSym->getName()) + "'"); + return; + } + unsigned EncodedNumBlocks = (NumBlocks - 1) << 3; + // Add to function symbol to create _dvgpr$ symbol. + const MCExpr *DVgprFuncVal = MCBinaryExpr::createAdd( + MCSymbolRefExpr::create(CurrentFnSym, Ctx), + MCConstantExpr::create(EncodedNumBlocks, Ctx), Ctx); + MCSymbol *DVgprFuncSym = + Ctx.getOrCreateSymbol(Twine("_dvgpr$") + CurrentFnSym->getName()); + OutStreamer->emitAssignment(DVgprFuncSym, DVgprFuncVal); + emitVisibility(DVgprFuncSym, MF.getFunction().getVisibility()); + emitLinkage(&MF.getFunction(), DVgprFuncSym); + } +} + // TODO: Fold this into emitFunctionBodyStart. 
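As an aside on the encoding that emitDVgprSymbol computes above: the same arithmetic can be written as a minimal standalone C++ sketch. This is an illustration only, not code from the patch; the helper name encodeDVgprOffset and the example values are hypothetical, and it assumes the dynamic-VGPR block size is 16 or 32 as the comment describes.

#include <algorithm>
#include <cassert>

// One less than the number of VGPR blocks goes into bits 5..3 of the _dvgpr$
// symbol offset. Zero VGPRs count as one block to avoid underflow, and at
// most 8 blocks (encoded value 7) fit in the field.
unsigned encodeDVgprOffset(unsigned NumVGPRs, unsigned BlockSize) {
  unsigned NumBlocks = (std::max(NumVGPRs, 1u) + BlockSize - 1) / BlockSize;
  assert(NumBlocks >= 1 && NumBlocks <= 8 && "too many DVGPR blocks");
  return (NumBlocks - 1) << 3;
}
// Example: 96 VGPRs with 16-VGPR blocks gives 6 blocks, so the offset is 0x28.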
void AMDGPUAsmPrinter::initializeTargetID(const Module &M) { // In the beginning all features are either 'Any' or 'NotSupported', @@ -964,6 +1023,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, ProgInfo.DynamicCallStack = MCBinaryExpr::createOr(GetSymRefExpr(RIK::RIK_HasDynSizedStack), GetSymRefExpr(RIK::RIK_HasRecursion), Ctx); + ProgInfo.NamedBarCnt = GetSymRefExpr(RIK::RIK_NumNamedBarrier); const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); @@ -997,89 +1057,24 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, const Function &F = MF.getFunction(); // Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave - // dispatch registers are function args. - unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0; - - if (isShader(F.getCallingConv())) { - bool IsPixelShader = - F.getCallingConv() == CallingConv::AMDGPU_PS && !STM.isAmdHsaOS(); - - // Calculate the number of VGPR registers based on the SPI input registers - uint32_t InputEna = 0; - uint32_t InputAddr = 0; - unsigned LastEna = 0; - - if (IsPixelShader) { - // Note for IsPixelShader: - // By this stage, all enabled inputs are tagged in InputAddr as well. - // We will use InputAddr to determine whether the input counts against the - // vgpr total and only use the InputEnable to determine the last input - // that is relevant - if extra arguments are used, then we have to honour - // the InputAddr for any intermediate non-enabled inputs. - InputEna = MFI->getPSInputEnable(); - InputAddr = MFI->getPSInputAddr(); - - // We only need to consider input args up to the last used arg. - assert((InputEna || InputAddr) && - "PSInputAddr and PSInputEnable should " - "never both be 0 for AMDGPU_PS shaders"); - // There are some rare circumstances where InputAddr is non-zero and - // InputEna can be set to 0. In this case we default to setting LastEna - // to 1. - LastEna = InputEna ? llvm::Log2_32(InputEna) + 1 : 1; - } + // dispatch registers as function args. + unsigned WaveDispatchNumSGPR = MFI->getNumWaveDispatchSGPRs(), + WaveDispatchNumVGPR = MFI->getNumWaveDispatchVGPRs(); - // FIXME: We should be using the number of registers determined during - // calling convention lowering to legalize the types. - const DataLayout &DL = F.getDataLayout(); - unsigned PSArgCount = 0; - unsigned IntermediateVGPR = 0; - for (auto &Arg : F.args()) { - unsigned NumRegs = (DL.getTypeSizeInBits(Arg.getType()) + 31) / 32; - if (Arg.hasAttribute(Attribute::InReg)) { - WaveDispatchNumSGPR += NumRegs; - } else { - // If this is a PS shader and we're processing the PS Input args (first - // 16 VGPR), use the InputEna and InputAddr bits to define how many - // VGPRs are actually used. - // Any extra VGPR arguments are handled as normal arguments (and - // contribute to the VGPR count whether they're used or not). 
- if (IsPixelShader && PSArgCount < 16) { - if ((1 << PSArgCount) & InputAddr) { - if (PSArgCount < LastEna) - WaveDispatchNumVGPR += NumRegs; - else - IntermediateVGPR += NumRegs; - } - PSArgCount++; - } else { - // If there are extra arguments we have to include the allocation for - // the non-used (but enabled with InputAddr) input arguments - if (IntermediateVGPR) { - WaveDispatchNumVGPR += IntermediateVGPR; - IntermediateVGPR = 0; - } - WaveDispatchNumVGPR += NumRegs; - } - } - } + if (WaveDispatchNumSGPR) { ProgInfo.NumSGPR = AMDGPUMCExpr::createMax( - {ProgInfo.NumSGPR, CreateExpr(WaveDispatchNumSGPR)}, Ctx); + {ProgInfo.NumSGPR, + MCBinaryExpr::createAdd(CreateExpr(WaveDispatchNumSGPR), ExtraSGPRs, + Ctx)}, + Ctx); + } + if (WaveDispatchNumVGPR) { ProgInfo.NumArchVGPR = AMDGPUMCExpr::createMax( {ProgInfo.NumVGPR, CreateExpr(WaveDispatchNumVGPR)}, Ctx); ProgInfo.NumVGPR = AMDGPUMCExpr::createTotalNumVGPR( ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx); - } else if (isKernel(F.getCallingConv()) && - MFI->getNumKernargPreloadedSGPRs()) { - // Consider cases where the total number of UserSGPRs with trailing - // allocated preload SGPRs, is greater than the number of explicitly - // referenced SGPRs. - const MCExpr *UserPlusExtraSGPRs = MCBinaryExpr::createAdd( - CreateExpr(MFI->getNumUserSGPRs()), ExtraSGPRs, Ctx); - ProgInfo.NumSGPR = - AMDGPUMCExpr::createMax({ProgInfo.NumSGPR, UserPlusExtraSGPRs}, Ctx); } // Adjust number of registers used to meet default/requested minimum/maximum @@ -1168,7 +1163,11 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, ProgInfo.DX10Clamp = Mode.DX10Clamp; unsigned LDSAlignShift; - if (STM.getFeatureBits().test(FeatureAddressableLocalMemorySize163840)) { + if (STM.getFeatureBits().test(FeatureAddressableLocalMemorySize327680)) { + // LDS is allocated in 256 dword blocks. + LDSAlignShift = 10; + } else if (STM.getFeatureBits().test( + FeatureAddressableLocalMemorySize163840)) { // LDS is allocated in 320 dword blocks. LDSAlignShift = 11; } else if (STM.getFeatureBits().test( @@ -1205,8 +1204,11 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, CreateExpr(STM.getWavefrontSize()), Ctx), CreateExpr(1ULL << ScratchAlignShift)); - if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) { + if (STM.supportsWGP()) { ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1; + } + + if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) { ProgInfo.MemOrdered = 1; ProgInfo.FwdProgress = 1; } @@ -1264,6 +1266,12 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT); } + if (AMDGPU::isGFX1250(STM)) + ProgInfo.ComputePGMRSrc3 = + SetBits(ProgInfo.ComputePGMRSrc3, ProgInfo.NamedBarCnt, + amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT, + amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT_SHIFT); + ProgInfo.Occupancy = AMDGPUMCExpr::createOccupancy( STM.computeOccupancy(F, ProgInfo.LDSSize).second, ProgInfo.NumSGPRsForWavesPerEU, ProgInfo.NumVGPRsForWavesPerEU, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h index 63589d2..9e854fa 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h @@ -54,6 +54,9 @@ private: MCCodeEmitter *DumpCodeInstEmitter = nullptr; + // When appropriate, add a _dvgpr$ symbol. 
+ void emitDVgprSymbol(MachineFunction &MF); + void getSIProgramInfo(SIProgramInfo &Out, const MachineFunction &MF); void getAmdKernelCode(AMDGPU::AMDGPUMCKernelCodeT &Out, const SIProgramInfo &KernelInfo, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp index 3d8d274..d1a5b4e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -580,6 +580,9 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel( ++i; } + if (Info->getNumKernargPreloadedSGPRs()) + Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs()); + TLI.allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info); TLI.allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), false); return true; @@ -743,6 +746,15 @@ bool AMDGPUCallLowering::lowerFormalArguments( if (!determineAssignments(Assigner, SplitArgs, CCInfo)) return false; + if (IsEntryFunc) { + // This assumes the registers are allocated by CCInfo in ascending order + // with no gaps. + Info->setNumWaveDispatchSGPRs( + CCInfo.getFirstUnallocated(AMDGPU::SGPR_32RegClass.getRegisters())); + Info->setNumWaveDispatchVGPRs( + CCInfo.getFirstUnallocated(AMDGPU::VGPR_32RegClass.getRegisters())); + } + FormalArgHandler Handler(B, MRI); if (!handleAssignments(Handler, SplitArgs, CCInfo, ArgLocs, B)) return false; @@ -1464,9 +1476,22 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info) const { if (Function *F = Info.CB->getCalledFunction()) if (F->isIntrinsic()) { - assert(F->getIntrinsicID() == Intrinsic::amdgcn_cs_chain && - "Unexpected intrinsic"); - return lowerChainCall(MIRBuilder, Info); + switch (F->getIntrinsicID()) { + case Intrinsic::amdgcn_cs_chain: + return lowerChainCall(MIRBuilder, Info); + case Intrinsic::amdgcn_call_whole_wave: + Info.CallConv = CallingConv::AMDGPU_Gfx_WholeWave; + + // Get the callee from the original instruction, so it doesn't look like + // this is an indirect call. 
+ Info.Callee = MachineOperand::CreateGA( + cast<GlobalValue>(Info.CB->getOperand(0)), /*Offset=*/0); + Info.OrigArgs.erase(Info.OrigArgs.begin()); + Info.IsVarArg = false; + break; + default: + llvm_unreachable("Unexpected intrinsic call"); + } } if (Info.IsVarArg) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td b/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td index 74d1fae..d14b5ce 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td @@ -30,6 +30,7 @@ class SubtargetFeatureAddressableLocalMemorySize <int Value> : SubtargetFeature< def FeatureAddressableLocalMemorySize32768 : SubtargetFeatureAddressableLocalMemorySize<32768>; def FeatureAddressableLocalMemorySize65536 : SubtargetFeatureAddressableLocalMemorySize<65536>; def FeatureAddressableLocalMemorySize163840 : SubtargetFeatureAddressableLocalMemorySize<163840>; +def FeatureAddressableLocalMemorySize327680 : SubtargetFeatureAddressableLocalMemorySize<327680>; class SubtargetFeatureWavefrontSize <int ValueLog2> : SubtargetFeature< "wavefrontsize"#!shl(1, ValueLog2), diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 9d6584a..04c4d00 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -76,6 +76,40 @@ static bool isExtractHiElt(SDValue In, SDValue &Out) { return false; } +static SDValue createVOP3PSrc32FromLo16(SDValue Lo, SDValue Src, + llvm::SelectionDAG *CurDAG, + const GCNSubtarget *Subtarget) { + if (!Subtarget->useRealTrue16Insts()) { + return Lo; + } + + SDValue NewSrc; + SDLoc SL(Lo); + + if (Lo->isDivergent()) { + SDValue Undef = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, + SL, Lo.getValueType()), + 0); + const SDValue Ops[] = { + CurDAG->getTargetConstant(AMDGPU::VGPR_32RegClassID, SL, MVT::i32), Lo, + CurDAG->getTargetConstant(AMDGPU::lo16, SL, MVT::i16), Undef, + CurDAG->getTargetConstant(AMDGPU::hi16, SL, MVT::i16)}; + + NewSrc = SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SL, + Src.getValueType(), Ops), + 0); + } else { + // The S_MOV is needed since the Lo could still be a VGPR16. + // With S_MOV, isel inserts a "sgpr32 = copy vgpr16" and we rely on + // the fixvgpr2sgprcopy pass to legalize it. + NewSrc = SDValue( + CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, Src.getValueType(), Lo), + 0); + } + + return NewSrc; +} + // Look through operations that obscure just looking at the low 16-bits of the // same register. static SDValue stripExtractLoElt(SDValue In) { @@ -1162,18 +1196,25 @@ void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) { void AMDGPUDAGToDAGISel::SelectMUL_LOHI(SDNode *N) { SDLoc SL(N); bool Signed = N->getOpcode() == ISD::SMUL_LOHI; + SDVTList VTList; unsigned Opc; - if (Subtarget->hasMADIntraFwdBug()) - Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64 - : AMDGPU::V_MAD_U64_U32_gfx11_e64; - else - Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64; + if (Subtarget->hasMadU64U32NoCarry()) { + VTList = CurDAG->getVTList(MVT::i64); + Opc = Signed ? AMDGPU::V_MAD_NC_I64_I32_e64 : AMDGPU::V_MAD_NC_U64_U32_e64; + } else { + VTList = CurDAG->getVTList(MVT::i64, MVT::i1); + if (Subtarget->hasMADIntraFwdBug()) { + Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64 + : AMDGPU::V_MAD_U64_U32_gfx11_e64; + } else { + Opc = Signed ? 
AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64; + } + } SDValue Zero = CurDAG->getTargetConstant(0, SL, MVT::i64); SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1); SDValue Ops[] = {N->getOperand(0), N->getOperand(1), Zero, Clamp}; - SDNode *Mad = CurDAG->getMachineNode( - Opc, SL, CurDAG->getVTList(MVT::i64, MVT::i1), Ops); + SDNode *Mad = CurDAG->getMachineNode(Opc, SL, VTList, Ops); if (!SDValue(N, 0).use_empty()) { SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32); SDNode *Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL, @@ -3412,8 +3453,10 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src, // Really a scalar input. Just select from the low half of the register to // avoid packing. - if (VecSize == 32 || VecSize == Lo.getValueSizeInBits()) { + if (VecSize == Lo.getValueSizeInBits()) { Src = Lo; + } else if (VecSize == 32) { + Src = createVOP3PSrc32FromLo16(Lo, Src, CurDAG, Subtarget); } else { assert(Lo.getValueSizeInBits() == 32 && VecSize == 64); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 64e68ab..8ccd8fc 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -1512,9 +1512,16 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI, const GlobalValue *GV = G->getGlobal(); if (!MFI->isModuleEntryFunction()) { + auto IsNamedBarrier = AMDGPU::isNamedBarrier(*cast<GlobalVariable>(GV)); if (std::optional<uint32_t> Address = AMDGPUMachineFunction::getLDSAbsoluteAddress(*GV)) { + if (IsNamedBarrier) { + unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16; + MFI->recordNumNamedBarriers(Address.value(), BarCnt); + } return DAG.getConstant(*Address, SDLoc(Op), Op.getValueType()); + } else if (IsNamedBarrier) { + llvm_unreachable("named barrier should have an assigned address"); } } @@ -1802,16 +1809,36 @@ std::pair<SDValue, SDValue> AMDGPUTargetLowering::splitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT, SelectionDAG &DAG) const { + EVT VT = N.getValueType(); assert(LoVT.getVectorNumElements() + (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <= - N.getValueType().getVectorNumElements() && + VT.getVectorNumElements() && "More vector elements requested than available!"); SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N, DAG.getVectorIdxConstant(0, DL)); - SDValue Hi = DAG.getNode( - HiVT.isVector() ? ISD::EXTRACT_SUBVECTOR : ISD::EXTRACT_VECTOR_ELT, DL, - HiVT, N, DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), DL)); - return std::pair(Lo, Hi); + + unsigned LoNumElts = LoVT.getVectorNumElements(); + + if (HiVT.isVector()) { + unsigned HiNumElts = HiVT.getVectorNumElements(); + if ((VT.getVectorNumElements() % HiNumElts) == 0) { + // Avoid creating an extract_subvector with an index that isn't a multiple + // of the result type. 
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HiVT, N, + DAG.getConstant(LoNumElts, DL, MVT::i32)); + return {Lo, Hi}; + } + + SmallVector<SDValue, 8> Elts; + DAG.ExtractVectorElements(N, Elts, /*Start=*/LoNumElts, + /*Count=*/HiNumElts); + SDValue Hi = DAG.getBuildVector(HiVT, DL, Elts); + return {Lo, Hi}; + } + + SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, HiVT, N, + DAG.getVectorIdxConstant(LoNumElts, DL)); + return {Lo, Hi}; } SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op, @@ -4002,7 +4029,8 @@ SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine( case Intrinsic::amdgcn_rcp_legacy: case Intrinsic::amdgcn_rsq_legacy: case Intrinsic::amdgcn_rsq_clamp: - case Intrinsic::amdgcn_tanh: { + case Intrinsic::amdgcn_tanh: + case Intrinsic::amdgcn_prng_b32: { // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted SDValue Src = N->getOperand(1); return Src.isUndef() ? Src : SDValue(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index b7fd131..5d31eed 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -2368,8 +2368,10 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS( case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn: case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: return selectDSBvhStackIntrinsic(I); + case Intrinsic::amdgcn_s_barrier_init: case Intrinsic::amdgcn_s_barrier_signal_var: return selectNamedBarrierInit(I, IntrinsicID); + case Intrinsic::amdgcn_s_barrier_join: case Intrinsic::amdgcn_s_get_named_barrier_state: return selectNamedBarrierInst(I, IntrinsicID); case Intrinsic::amdgcn_s_get_barrier_state: @@ -5521,11 +5523,18 @@ AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root, Register PtrBase; int64_t ConstOffset; - std::tie(PtrBase, ConstOffset) = + bool IsInBounds; + std::tie(PtrBase, ConstOffset, IsInBounds) = getPtrBaseWithConstantOffset(Root.getReg(), *MRI); - if (ConstOffset == 0 || (FlatVariant == SIInstrFlags::FlatScratch && - !isFlatScratchBaseLegal(Root.getReg()))) + // Adding the offset to the base address with an immediate in a FLAT + // instruction must not change the memory aperture in which the address falls. + // Therefore we can only fold offsets from inbounds GEPs into FLAT + // instructions. + if (ConstOffset == 0 || + (FlatVariant == SIInstrFlags::FlatScratch && + !isFlatScratchBaseLegal(Root.getReg())) || + (FlatVariant == SIInstrFlags::FLAT && !IsInBounds)) return Default; unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace(); @@ -5577,7 +5586,8 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root, // Match the immediate offset first, which canonically is moved as low as // possible. - std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI); + std::tie(PtrBase, ConstOffset, std::ignore) = + getPtrBaseWithConstantOffset(Addr, *MRI); if (ConstOffset != 0) { if (NeedIOffset && @@ -5760,7 +5770,8 @@ AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const { // Match the immediate offset first, which canonically is moved as low as // possible. 
- std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI); + std::tie(PtrBase, ConstOffset, std::ignore) = + getPtrBaseWithConstantOffset(Addr, *MRI); if (ConstOffset != 0 && isFlatScratchBaseLegal(Addr) && TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS, @@ -5836,7 +5847,8 @@ AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const { // Match the immediate offset first, which canonically is moved as low as // possible. - std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI); + std::tie(PtrBase, ConstOffset, std::ignore) = + getPtrBaseWithConstantOffset(Addr, *MRI); Register OrigAddr = Addr; if (ConstOffset != 0 && @@ -5942,7 +5954,8 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const { const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg()); Register PtrBase; int64_t ConstOffset; - std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(VAddr, *MRI); + std::tie(PtrBase, ConstOffset, std::ignore) = + getPtrBaseWithConstantOffset(VAddr, *MRI); if (ConstOffset != 0) { if (TII.isLegalMUBUFImmOffset(ConstOffset) && (!STI.privateMemoryResourceIsRangeChecked() || @@ -6181,8 +6194,8 @@ AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const Register PtrBase; int64_t Offset; - std::tie(PtrBase, Offset) = - getPtrBaseWithConstantOffset(Root.getReg(), *MRI); + std::tie(PtrBase, Offset, std::ignore) = + getPtrBaseWithConstantOffset(Root.getReg(), *MRI); if (Offset) { if (isDSOffsetLegal(PtrBase, Offset)) { @@ -6243,8 +6256,8 @@ AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root, Register PtrBase; int64_t Offset; - std::tie(PtrBase, Offset) = - getPtrBaseWithConstantOffset(Root.getReg(), *MRI); + std::tie(PtrBase, Offset, std::ignore) = + getPtrBaseWithConstantOffset(Root.getReg(), *MRI); if (Offset) { int64_t OffsetValue0 = Offset; @@ -6265,22 +6278,25 @@ AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root, } /// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return -/// the base value with the constant offset. There may be intervening copies -/// between \p Root and the identified constant. Returns \p Root, 0 if this does -/// not match the pattern. -std::pair<Register, int64_t> +/// the base value with the constant offset, and if the offset computation is +/// known to be inbounds. There may be intervening copies between \p Root and +/// the identified constant. Returns \p Root, 0, false if this does not match +/// the pattern. 
+std::tuple<Register, int64_t, bool> AMDGPUInstructionSelector::getPtrBaseWithConstantOffset( - Register Root, const MachineRegisterInfo &MRI) const { + Register Root, const MachineRegisterInfo &MRI) const { MachineInstr *RootI = getDefIgnoringCopies(Root, MRI); if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD) - return {Root, 0}; + return {Root, 0, false}; MachineOperand &RHS = RootI->getOperand(2); std::optional<ValueAndVReg> MaybeOffset = getIConstantVRegValWithLookThrough(RHS.getReg(), MRI); if (!MaybeOffset) - return {Root, 0}; - return {RootI->getOperand(1).getReg(), MaybeOffset->Value.getSExtValue()}; + return {Root, 0, false}; + bool IsInBounds = RootI->getFlag(MachineInstr::MIFlag::InBounds); + return {RootI->getOperand(1).getReg(), MaybeOffset->Value.getSExtValue(), + IsInBounds}; } static void addZeroImm(MachineInstrBuilder &MIB) { @@ -6358,7 +6374,8 @@ AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const { Register PtrBase; int64_t Offset; - std::tie(PtrBase, Offset) = getPtrBaseWithConstantOffset(Src, *MRI); + std::tie(PtrBase, Offset, std::ignore) = + getPtrBaseWithConstantOffset(Src, *MRI); if (isUInt<32>(Offset)) { Data.N0 = PtrBase; Data.Offset = Offset; @@ -6757,6 +6774,8 @@ unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) { switch (IntrID) { default: llvm_unreachable("not a named barrier op"); + case Intrinsic::amdgcn_s_barrier_join: + return AMDGPU::S_BARRIER_JOIN_IMM; case Intrinsic::amdgcn_s_get_named_barrier_state: return AMDGPU::S_GET_BARRIER_STATE_IMM; }; @@ -6764,6 +6783,8 @@ unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) { switch (IntrID) { default: llvm_unreachable("not a named barrier op"); + case Intrinsic::amdgcn_s_barrier_join: + return AMDGPU::S_BARRIER_JOIN_M0; case Intrinsic::amdgcn_s_get_named_barrier_state: return AMDGPU::S_GET_BARRIER_STATE_M0; }; @@ -6814,8 +6835,11 @@ bool AMDGPUInstructionSelector::selectNamedBarrierInit( BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0).addReg(TmpReg4); constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI); + unsigned Opc = IntrID == Intrinsic::amdgcn_s_barrier_init + ? 
AMDGPU::S_BARRIER_INIT_M0 + : AMDGPU::S_BARRIER_SIGNAL_M0; MachineInstrBuilder MIB; - MIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_M0)); + MIB = BuildMI(*MBB, &I, DL, TII.get(Opc)); I.eraseFromParent(); return true; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index c9da419..0924396 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -156,6 +156,7 @@ private: bool selectNamedBarrierInst(MachineInstr &I, Intrinsic::ID IID) const; bool selectSBarrierSignalIsfirst(MachineInstr &I, Intrinsic::ID IID) const; bool selectSGetBarrierState(MachineInstr &I, Intrinsic::ID IID) const; + bool selectSBarrierLeave(MachineInstr &I) const; std::pair<Register, unsigned> selectVOP3ModsImpl(Register Src, bool IsCanonicalizing = true, @@ -295,7 +296,7 @@ private: InstructionSelector::ComplexRendererFns selectDSReadWrite2(MachineOperand &Root, unsigned size) const; - std::pair<Register, int64_t> + std::tuple<Register, int64_t, bool> getPtrBaseWithConstantOffset(Register Root, const MachineRegisterInfo &MRI) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp index 523c66c..56113e6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp @@ -545,7 +545,8 @@ public: AU.addRequired<TargetPassConfig>(); AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<UniformityInfoWrapperPass>(); - AU.setPreservesAll(); + // Invalidates UniformityInfo + AU.setPreservesCFG(); } bool runOnFunction(Function &F) override; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 40d960e..600a130 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -26,6 +26,7 @@ #include "llvm/ADT/ScopeExit.h" #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" +#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/GlobalISel/Utils.h" @@ -137,6 +138,14 @@ static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { }; } +// Retrieves the scalar type that's the same size as the mem desc +static LegalizeMutation getScalarTypeFromMemDesc(unsigned TypeIdx) { + return [=](const LegalityQuery &Query) { + unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); + return std::make_pair(TypeIdx, LLT::scalar(MemSize)); + }; +} + // Increase the number of vector elements to reach the next legal RegClass. static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx) { return [=](const LegalityQuery &Query) { @@ -384,6 +393,16 @@ static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) { }; } +// If we have a truncating store or an extending load with a data size larger +// than 32-bits and mem location is a power of 2 +static LegalityPredicate isTruncStoreToSizePowerOf2(unsigned TypeIdx) { + return [=](const LegalityQuery &Query) { + unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); + return isWideScalarExtLoadTruncStore(TypeIdx)(Query) && + isPowerOf2_64(MemSize); + }; +} + // TODO: Should load to s16 be legal? 
Most loads extend to 32-bits, but we // handle some operations by just promoting the register during // selection. There are also d16 loads on GFX9+ which preserve the high bits. @@ -1635,11 +1654,12 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, // May need relegalization for the scalars. return std::pair(0, EltTy); }) - .minScalar(0, S32) - .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32)) - .widenScalarToNextPow2(0) - .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)) - .lower(); + .minScalar(0, S32) + .narrowScalarIf(isTruncStoreToSizePowerOf2(0), + getScalarTypeFromMemDesc(0)) + .widenScalarToNextPow2(0) + .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)) + .lower(); } // FIXME: Unaligned accesses not lowered. @@ -5653,7 +5673,7 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper, unsigned SplitSize = 32; if (IID == Intrinsic::amdgcn_update_dpp && (Size % 64 == 0) && ST.hasDPALU_DPP() && - AMDGPU::isLegalDPALU_DPPControl(MI.getOperand(4).getImm())) + AMDGPU::isLegalDPALU_DPPControl(ST, MI.getOperand(4).getImm())) SplitSize = 64; if (Size == SplitSize) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp index 304e91e..139cad6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp @@ -599,8 +599,8 @@ bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitStoreInst(StoreInst &SI) { IRB.SetInsertPoint(&SI); Value *IntV = fatPtrsToInts(V, Ty, IntTy, V->getName()); - for (auto *Dbg : at::getAssignmentMarkers(&SI)) - Dbg->setValue(IntV); + for (auto *Dbg : at::getDVRAssignmentMarkers(&SI)) + Dbg->setRawLocation(ValueAsMetadata::get(IntV)); SI.setOperand(0, IntV); return true; @@ -1361,6 +1361,7 @@ public: PtrParts visitAtomicCmpXchgInst(AtomicCmpXchgInst &AI); PtrParts visitGetElementPtrInst(GetElementPtrInst &GEP); + PtrParts visitPtrToAddrInst(PtrToAddrInst &PA); PtrParts visitPtrToIntInst(PtrToIntInst &PI); PtrParts visitIntToPtrInst(IntToPtrInst &IP); PtrParts visitAddrSpaceCastInst(AddrSpaceCastInst &I); @@ -1954,6 +1955,21 @@ PtrParts SplitPtrStructs::visitPtrToIntInst(PtrToIntInst &PI) { return {nullptr, nullptr}; } +PtrParts SplitPtrStructs::visitPtrToAddrInst(PtrToAddrInst &PA) { + Value *Ptr = PA.getPointerOperand(); + if (!isSplitFatPtr(Ptr->getType())) + return {nullptr, nullptr}; + IRB.SetInsertPoint(&PA); + + auto [Rsrc, Off] = getPtrParts(Ptr); + Value *Res = IRB.CreateIntCast(Off, PA.getType(), /*isSigned=*/false); + copyMetadata(Res, &PA); + Res->takeName(&PA); + SplitUsers.insert(&PA); + PA.replaceAllUsesWith(Res); + return {nullptr, nullptr}; +} + PtrParts SplitPtrStructs::visitIntToPtrInst(IntToPtrInst &IP) { if (!isSplitFatPtr(IP.getType())) return {nullptr, nullptr}; @@ -2350,8 +2366,12 @@ static bool containsBufferFatPointers(const Function &F, BufferFatPtrToStructTypeMap *TypeMap) { bool HasFatPointers = false; for (const BasicBlock &BB : F) - for (const Instruction &I : BB) + for (const Instruction &I : BB) { HasFatPointers |= (I.getType() != TypeMap->remapType(I.getType())); + // Catch null pointer constants in loads, stores, etc. 
+ for (const Value *V : I.operand_values()) + HasFatPointers |= (V->getType() != TypeMap->remapType(V->getType())); + } return HasFatPointers; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp index 6390853..6b3cdf5 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp @@ -39,6 +39,8 @@ MCSymbol *MCResourceInfo::getSymbol(StringRef FuncName, ResourceInfoKind RIK, return GOCS(".num_agpr"); case RIK_NumSGPR: return GOCS(".numbered_sgpr"); + case RIK_NumNamedBarrier: + return GOCS(".num_named_barrier"); case RIK_PrivateSegSize: return GOCS(".private_seg_size"); case RIK_UsesVCC: @@ -66,6 +68,7 @@ void MCResourceInfo::assignMaxRegs(MCContext &OutContext) { MCSymbol *MaxVGPRSym = getMaxVGPRSymbol(OutContext); MCSymbol *MaxAGPRSym = getMaxAGPRSymbol(OutContext); MCSymbol *MaxSGPRSym = getMaxSGPRSymbol(OutContext); + MCSymbol *MaxNamedBarrierSym = getMaxNamedBarrierSymbol(OutContext); auto assignMaxRegSym = [&OutContext](MCSymbol *Sym, int32_t RegCount) { const MCExpr *MaxExpr = MCConstantExpr::create(RegCount, OutContext); @@ -75,6 +78,7 @@ void MCResourceInfo::assignMaxRegs(MCContext &OutContext) { assignMaxRegSym(MaxVGPRSym, MaxVGPR); assignMaxRegSym(MaxAGPRSym, MaxAGPR); assignMaxRegSym(MaxSGPRSym, MaxSGPR); + assignMaxRegSym(MaxNamedBarrierSym, MaxNamedBarrier); } void MCResourceInfo::reset() { *this = MCResourceInfo(); } @@ -97,6 +101,10 @@ MCSymbol *MCResourceInfo::getMaxSGPRSymbol(MCContext &OutContext) { return OutContext.getOrCreateSymbol("amdgpu.max_num_sgpr"); } +MCSymbol *MCResourceInfo::getMaxNamedBarrierSymbol(MCContext &OutContext) { + return OutContext.getOrCreateSymbol("amdgpu.max_num_named_barrier"); +} + // Tries to flatten recursive call register resource gathering. Simple cycle // avoiding dfs to find the constants in the propagated symbols. 
// Assumes: @@ -227,6 +235,10 @@ void MCResourceInfo::assignResourceInfoExpr( case RIK_NumAGPR: ArgExprs.push_back(flattenedCycleMax(CalleeValSym, RIK, OutContext)); break; + case RIK_NumNamedBarrier: + ArgExprs.push_back(MCSymbolRefExpr::create( + getMaxNamedBarrierSymbol(OutContext), OutContext)); + break; } } } @@ -245,11 +257,13 @@ void MCResourceInfo::gatherResourceInfo( MCSymbol *MaxAGPRSym = getMaxAGPRSymbol(OutContext); MCSymbol *MaxSGPRSym = getMaxSGPRSymbol(OutContext); bool IsLocal = MF.getFunction().hasLocalLinkage(); + MCSymbol *MaxNamedBarrierSym = getMaxNamedBarrierSymbol(OutContext); if (!AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv())) { addMaxVGPRCandidate(FRI.NumVGPR); addMaxAGPRCandidate(FRI.NumAGPR); addMaxSGPRCandidate(FRI.NumExplicitSGPR); + addMaxNamedBarrierCandidate(FRI.NumNamedBarrier); } const TargetMachine &TM = MF.getTarget(); @@ -288,6 +302,7 @@ void MCResourceInfo::gatherResourceInfo( SetMaxReg(MaxVGPRSym, FRI.NumVGPR, RIK_NumVGPR); SetMaxReg(MaxAGPRSym, FRI.NumAGPR, RIK_NumAGPR); SetMaxReg(MaxSGPRSym, FRI.NumExplicitSGPR, RIK_NumSGPR); + SetMaxReg(MaxNamedBarrierSym, FRI.NumNamedBarrier, RIK_NumNamedBarrier); { // The expression for private segment size should be: FRI.PrivateSegmentSize diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h index 297e93b..b605516 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h @@ -31,6 +31,7 @@ public: RIK_NumVGPR, RIK_NumAGPR, RIK_NumSGPR, + RIK_NumNamedBarrier, RIK_PrivateSegSize, RIK_UsesVCC, RIK_UsesFlatScratch, @@ -43,6 +44,7 @@ private: int32_t MaxVGPR = 0; int32_t MaxAGPR = 0; int32_t MaxSGPR = 0; + int32_t MaxNamedBarrier = 0; // Whether the MCResourceInfo has been finalized through finalize(MCContext // &). Should only be called once, at the end of AsmPrinting to assign MaxXGPR @@ -75,6 +77,9 @@ public: void addMaxSGPRCandidate(int32_t candidate) { MaxSGPR = std::max(MaxSGPR, candidate); } + void addMaxNamedBarrierCandidate(int32_t candidate) { + MaxNamedBarrier = std::max(MaxNamedBarrier, candidate); + } MCSymbol *getSymbol(StringRef FuncName, ResourceInfoKind RIK, MCContext &OutContext, bool IsLocal); @@ -90,6 +95,7 @@ public: MCSymbol *getMaxVGPRSymbol(MCContext &OutContext); MCSymbol *getMaxAGPRSymbol(MCContext &OutContext); MCSymbol *getMaxSGPRSymbol(MCContext &OutContext); + MCSymbol *getMaxNamedBarrierSymbol(MCContext &OutContext); /// AMDGPUResourceUsageAnalysis gathers resource usage on a per-function /// granularity. 
However, some resource info has to be assigned the call diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp index 0c82cace..664a15c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp @@ -107,6 +107,8 @@ unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL, if (!BarAddr) llvm_unreachable("named barrier should have an assigned address"); Entry.first->second = BarAddr.value(); + unsigned BarCnt = DL.getTypeAllocSize(GV.getValueType()) / 16; + recordNumNamedBarriers(BarAddr.value(), BarCnt); return BarAddr.value(); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h index b1022e4..fc64e16 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h @@ -49,6 +49,8 @@ protected: // Flag to check dynamic LDS usage by kernel. bool UsesDynamicLDS = false; + uint32_t NumNamedBarriers = 0; + // Kernels + shaders. i.e. functions called by the hardware and not called // by other functions. bool IsEntryFunction = false; @@ -86,6 +88,12 @@ public: return GDSSize; } + void recordNumNamedBarriers(uint32_t GVAddr, unsigned BarCnt) { + NumNamedBarriers = + std::max(NumNamedBarriers, ((GVAddr & 0x1ff) >> 4) + BarCnt - 1); + } + uint32_t getNumNamedBarriers() const { return NumNamedBarriers; } + bool isEntryFunction() const { return IsEntryFunction; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp index aa72c3e..dfe7c53 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp @@ -352,7 +352,10 @@ bool isReallyAClobber(const Value *Ptr, MemoryDef *Def, AAResults *AA) { case Intrinsic::amdgcn_s_barrier_signal: case Intrinsic::amdgcn_s_barrier_signal_var: case Intrinsic::amdgcn_s_barrier_signal_isfirst: + case Intrinsic::amdgcn_s_barrier_init: + case Intrinsic::amdgcn_s_barrier_join: case Intrinsic::amdgcn_s_barrier_wait: + case Intrinsic::amdgcn_s_barrier_leave: case Intrinsic::amdgcn_s_get_barrier_state: case Intrinsic::amdgcn_wave_barrier: case Intrinsic::amdgcn_sched_barrier: @@ -381,7 +384,7 @@ bool isClobberedInFunction(const LoadInst *Load, MemorySSA *MSSA, AAResults *AA) { MemorySSAWalker *Walker = MSSA->getWalker(); SmallVector<MemoryAccess *> WorkList{Walker->getClobberingMemoryAccess(Load)}; - SmallSet<MemoryAccess *, 8> Visited; + SmallPtrSet<MemoryAccess *, 8> Visited; MemoryLocation Loc(MemoryLocation::get(Load)); LLVM_DEBUG(dbgs() << "Checking clobbering of: " << *Load << '\n'); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def index b6c6d92..6ddfa38 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def @@ -29,7 +29,6 @@ MODULE_PASS("amdgpu-preload-kernel-arguments", AMDGPUPreloadKernelArgumentsPass( MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass()) MODULE_PASS("amdgpu-remove-incompatible-functions", AMDGPURemoveIncompatibleFunctionsPass(*this)) MODULE_PASS("amdgpu-sw-lower-lds", AMDGPUSwLowerLDSPass(*this)) -MODULE_PASS("amdgpu-unify-metadata", AMDGPUUnifyMetadataPass()) #undef MODULE_PASS #ifndef MODULE_PASS_WITH_PARAMS diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp index 3a37518..28d5400 100644 --- 
a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp @@ -134,8 +134,8 @@ static std::pair<const Value *, const Type *> getMemoryInstrPtrAndType( bool AMDGPUPerfHint::isIndirectAccess(const Instruction *Inst) const { LLVM_DEBUG(dbgs() << "[isIndirectAccess] " << *Inst << '\n'); - SmallSet<const Value *, 32> WorkSet; - SmallSet<const Value *, 32> Visited; + SmallPtrSet<const Value *, 32> WorkSet; + SmallPtrSet<const Value *, 32> Visited; if (const Value *MO = getMemoryInstrPtrAndType(Inst).first) { if (isGlobalAddr(MO)) WorkSet.insert(MO); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernArgProlog.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernArgProlog.cpp index 4009451..90c4f4e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernArgProlog.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernArgProlog.cpp @@ -109,7 +109,7 @@ AMDGPUPreloadKernArgProlog::AMDGPUPreloadKernArgProlog(MachineFunction &MF) TRI(*ST.getRegisterInfo()) {} bool AMDGPUPreloadKernArgProlog::run() { - if (!ST.hasKernargPreload()) + if (!ST.needsKernArgPreloadProlog()) return false; unsigned NumKernArgPreloadSGPRs = MFI.getNumKernargPreloadedSGPRs(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernelArguments.cpp index 984c1ee..a386fe6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernelArguments.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernelArguments.cpp @@ -37,6 +37,11 @@ static cl::opt<unsigned> KernargPreloadCount( "amdgpu-kernarg-preload-count", cl::desc("How many kernel arguments to preload onto SGPRs"), cl::init(0)); +static cl::opt<bool> + EnableKernargPreload("amdgpu-kernarg-preload", + cl::desc("Enable preload kernel arguments to SGPRs"), + cl::init(true)); + namespace { class AMDGPUPreloadKernelArgumentsLegacy : public ModulePass { @@ -275,6 +280,9 @@ AMDGPUPreloadKernelArgumentsLegacy::AMDGPUPreloadKernelArgumentsLegacy( : ModulePass(ID), TM(TM) {} static bool markKernelArgsAsInreg(Module &M, const TargetMachine &TM) { + if (!EnableKernargPreload) + return false; + SmallVector<Function *, 4> FunctionsToErase; bool Changed = false; for (auto &F : M) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index 5a6ad40..8c56c21 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -724,10 +724,10 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, .Any({{S32}, {{}, {Vgpr32, SgprV4S32, Vgpr32, Vgpr32, Sgpr32}}}); addRulesForGOpcs({G_PTR_ADD}) - .Any({{UniP1}, {{SgprP1}, {SgprP1, Sgpr64}}}) - .Any({{DivP1}, {{VgprP1}, {VgprP1, Vgpr64}}}) - .Any({{DivP0}, {{VgprP0}, {VgprP0, Vgpr64}}}) - .Any({{UniP4}, {{SgprP4}, {SgprP4, Sgpr64}}}); + .Any({{UniPtr32}, {{SgprPtr32}, {SgprPtr32, Sgpr32}}}) + .Any({{DivPtr32}, {{VgprPtr32}, {VgprPtr32, Vgpr32}}}) + .Any({{UniPtr64}, {{SgprPtr64}, {SgprPtr64, Sgpr64}}}) + .Any({{DivPtr64}, {{VgprPtr64}, {VgprPtr64, Vgpr64}}}); addRulesForGOpcs({G_INTTOPTR}) .Any({{UniPtr32}, {{SgprPtr32}, {Sgpr32}}}) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 868b1a2..2379296 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -3342,6 +3342,10 @@ void AMDGPURegisterBankInfo::applyMappingImpl( assert(OpdMapper.getVRegs(1).empty()); constrainOpWithReadfirstlane(B, MI, 
1); return; + case Intrinsic::amdgcn_s_barrier_join: + constrainOpWithReadfirstlane(B, MI, 1); + return; + case Intrinsic::amdgcn_s_barrier_init: case Intrinsic::amdgcn_s_barrier_signal_var: constrainOpWithReadfirstlane(B, MI, 1); constrainOpWithReadfirstlane(B, MI, 2); @@ -5515,6 +5519,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_s_sleep_var: OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); break; + case Intrinsic::amdgcn_s_barrier_join: + OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); + break; + case Intrinsic::amdgcn_s_barrier_init: case Intrinsic::amdgcn_s_barrier_signal_var: OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.cpp b/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.cpp index e2e5c57..d2ec7dd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.cpp @@ -195,13 +195,17 @@ bool AMDGPURemoveIncompatibleFunctions::checkFunction(Function &F) { // Delete FeatureWavefrontSize32 functions for // gfx9 and below targets that don't support the mode. - // gfx10+ is implied to support both wave32 and 64 features. + // gfx10, gfx11, gfx12 are implied to support both wave32 and 64 features. // They are not in the feature set. So, we need a separate check - if (ST->getGeneration() < AMDGPUSubtarget::GFX10 && - ST->hasFeature(AMDGPU::FeatureWavefrontSize32)) { + if (!ST->supportsWave32() && ST->hasFeature(AMDGPU::FeatureWavefrontSize32)) { reportFunctionRemoved(F, AMDGPU::FeatureWavefrontSize32); return true; } + // gfx125x only support FeatureWavefrontSize32. + if (!ST->supportsWave64() && ST->hasFeature(AMDGPU::FeatureWavefrontSize64)) { + reportFunctionRemoved(F, AMDGPU::FeatureWavefrontSize64); + return true; + } return false; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp index 8101c68..0ea9add 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp @@ -142,6 +142,8 @@ AMDGPUResourceUsageAnalysisImpl::analyzeResourceUsage( MRI.isLiveIn(MFI->getPreloadedReg( AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT)); + Info.NumNamedBarrier = MFI->getNumNamedBarriers(); + // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat // instructions aren't used to access the scratch buffer. Inline assembly may // need it though. 
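For readers tracking the named-barrier bookkeeping: the NumNamedBarrier value gathered by the resource-usage analysis above comes from recordNumNamedBarriers in the AMDGPUMachineFunction.h hunk shown earlier. A small standalone C++ sketch of that arithmetic follows; it is not code from the patch, and the helper name maxNamedBarriers and the example numbers are invented for illustration.

#include <algorithm>
#include <cstdint>

// Same arithmetic as recordNumNamedBarriers: take bits 8..4 of the variable's
// assigned LDS address and add one less than its barrier count, where every
// 16 bytes of the variable is one barrier.
uint32_t maxNamedBarriers(uint32_t Current, uint32_t GVAddr, unsigned BarCnt) {
  return std::max<uint32_t>(Current, ((GVAddr & 0x1ff) >> 4) + BarCnt - 1);
}
// Example: a 32-byte barrier variable (BarCnt = 2) at address 0x30 gives
// maxNamedBarriers(0, 0x30, 2) == 4.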
@@ -241,6 +243,9 @@ AMDGPUResourceUsageAnalysisImpl::analyzeResourceUsage( if (!RC || !TRI.isVGPRClass(RC)) continue; + if (MI.isCall() || MI.isMetaInstruction()) + continue; + unsigned Width = divideCeil(TRI.getRegSizeInBits(*RC), 32); unsigned HWReg = TRI.getHWRegIndex(Reg); int MaxUsed = HWReg + Width - 1; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h index acfff96..9ae3bb3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h @@ -35,6 +35,7 @@ public: int32_t NumVGPR = 0; int32_t NumAGPR = 0; int32_t NumExplicitSGPR = 0; + int32_t NumNamedBarrier = 0; uint64_t CalleeSegmentSize = 0; uint64_t PrivateSegmentSize = 0; bool UsesVCC = false; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp index f580f43..20b5fd9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp @@ -57,27 +57,47 @@ public: TRI(*ST.getRegisterInfo()), MRI(MF.getRegInfo()), VRM(VRM), LRM(LRM), LIS(LIS) {} + // TODO: Remove this restriction + bool mfmaHasSameSrc2AndDstReg(const MachineInstr &MI) const { + const MachineOperand *Src2 = TII.getNamedOperand(MI, AMDGPU::OpName::src2); + const MachineOperand *Dst = TII.getNamedOperand(MI, AMDGPU::OpName::vdst); + return Src2->getReg() == Dst->getReg() && + Src2->getSubReg() == Dst->getSubReg(); + } + + bool isRewriteCandidate(const MachineInstr &MI) const { + return TII.isMAI(MI) && + AMDGPU::getMFMASrcCVDstAGPROp(MI.getOpcode()) != -1 && + mfmaHasSameSrc2AndDstReg(MI); + } + + /// Compute the register class constraints based on the uses of \p Reg, - /// excluding uses from \p ExceptMI. This should be nearly identical to + /// excluding MFMA uses which can be rewritten to change the register + /// class constraint. This should be nearly identical to /// MachineRegisterInfo::recomputeRegClass. const TargetRegisterClass * - recomputeRegClassExcept(Register Reg, const TargetRegisterClass *OldRC, - const TargetRegisterClass *NewRC, - const MachineInstr *ExceptMI) const; + recomputeRegClassExceptRewritable(Register Reg, + const TargetRegisterClass *OldRC, + const TargetRegisterClass *NewRC) const; bool run(MachineFunction &MF) const; }; const TargetRegisterClass * -AMDGPURewriteAGPRCopyMFMAImpl::recomputeRegClassExcept( +AMDGPURewriteAGPRCopyMFMAImpl::recomputeRegClassExceptRewritable( Register Reg, const TargetRegisterClass *OldRC, - const TargetRegisterClass *NewRC, const MachineInstr *ExceptMI) const { + const TargetRegisterClass *NewRC) const { // Accumulate constraints from all uses. for (MachineOperand &MO : MRI.reg_nodbg_operands(Reg)) { // Apply the effect of the given operand to NewRC. MachineInstr *MI = MO.getParent(); - if (MI == ExceptMI) + + // We can swap the classes of dst + src2 as a pair to AGPR, so ignore the + // effects of rewrite candidates. It just so happens that we can use either + // AGPR or VGPR in src0/src1, so don't bother checking the constraint + // effects of the individual operands. + if (isRewriteCandidate(*MI)) continue; unsigned OpNo = &MO - &MI->getOperand(0); @@ -96,8 +116,10 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const { return false; // Early exit if no AGPRs were assigned. 
- if (!LRM.isPhysRegUsed(AMDGPU::AGPR0)) + if (!LRM.isPhysRegUsed(AMDGPU::AGPR0)) { + LLVM_DEBUG(dbgs() << "skipping function that did not allocate AGPRs\n"); return false; + } bool MadeChange = false; @@ -109,17 +131,25 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const { // Find AV_* registers assigned to AGPRs. const TargetRegisterClass *VirtRegRC = MRI.getRegClass(VReg); - if (!TRI.isVectorSuperClass(VirtRegRC)) + if (!TRI.hasAGPRs(VirtRegRC)) continue; - const TargetRegisterClass *AssignedRC = TRI.getPhysRegBaseClass(PhysReg); - if (!TRI.isAGPRClass(AssignedRC)) - continue; + const TargetRegisterClass *AssignedRC = VirtRegRC; + if (TRI.hasVGPRs(VirtRegRC)) { + // If this is an AV register, we have to check if the actual assignment is + // to an AGPR + AssignedRC = TRI.getPhysRegBaseClass(PhysReg); + if (!TRI.isAGPRClass(AssignedRC)) + continue; + } LiveInterval &LI = LIS.getInterval(VReg); // TODO: Test multiple uses for (VNInfo *VNI : LI.vnis()) { + if (VNI->isPHIDef() || VNI->isUnused()) + continue; + MachineInstr *DefMI = LIS.getInstructionFromIndex(VNI->def); // TODO: Handle SplitKit produced copy bundles for partially defined @@ -183,10 +213,13 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const { // first place, as well as need to assign another register, and need to // figure out where to put them. The live range splitting is smarter than // anything we're doing here, so trust it did something reasonable. - const TargetRegisterClass *Src2ExceptRC = recomputeRegClassExcept( - Src2->getReg(), Src2VirtRegRC, VirtRegRC, CopySrcMI); - if (!Src2ExceptRC) + const TargetRegisterClass *Src2ExceptRC = + recomputeRegClassExceptRewritable(Src2->getReg(), Src2VirtRegRC, + VirtRegRC); + if (!Src2ExceptRC) { + LLVM_DEBUG(dbgs() << "Could not recompute the regclass\n"); continue; + } const TargetRegisterClass *NewSrc2ConstraintRC = TII.getRegClass(TII.get(AGPROp), Src2->getOperandNo(), &TRI, MF); @@ -196,8 +229,6 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const { const TargetRegisterClass *NewSrc2RC = TRI.getCommonSubClass(Src2ExceptRC, NewSrc2ConstraintRC); if (!NewSrc2RC) { - // TODO: This is ignoring ther rewritable uses. e.g. a rewritable MFMA - // using a rewritable MFMA can be rewritten as a pair. LLVM_DEBUG(dbgs() << "Other uses of " << printReg(Src2->getReg(), &TRI) << " are incompatible with replacement class\n"); continue; @@ -208,8 +239,19 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const { CopySrcMI->setDesc(TII.get(AGPROp)); - // TODO: Is replacing too aggressive, fixup these instructions only? - MRI.replaceRegWith(CopySrcReg, VReg); + // Perform replacement of the register, rewriting the rewritable uses. + for (MachineInstr &UseMI : + make_early_inc_range(MRI.reg_instructions(CopySrcReg))) { + if (TII.isMAI(UseMI)) { + // Note the register we need to rewrite may still appear in src0/src1, + // but that's fine since those can use A or V anyway. 
+ int ReplacementOp = AMDGPU::getMFMASrcCVDstAGPROp(UseMI.getOpcode()); + if (ReplacementOp != -1) + UseMI.setDesc(TII.get(ReplacementOp)); + } + + UseMI.substituteRegister(CopySrcReg, VReg, AMDGPU::NoSubRegister, TRI); + } LLVM_DEBUG(dbgs() << "Replaced VGPR MFMA with AGPR: " << *CopySrcMI); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td index 10b8606..7be1899 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -378,6 +378,7 @@ foreach intr = AMDGPUImageDimAtomicIntrinsics in def : SourceOfDivergence<intr>; def : SourceOfDivergence<int_amdgcn_dead>; +def : SourceOfDivergence<int_amdgcn_call_whole_wave>; class AlwaysUniform<Intrinsic intr> { Intrinsic Intr = intr; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp index b60ded3..56aa3f6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp @@ -195,7 +195,7 @@ bool AMDGPUSetWavePriority::run(MachineFunction &MF) { // Lower the priority on edges where control leaves blocks from which // the VMEM loads are reachable. - SmallSet<MachineBasicBlock *, 16> PriorityLoweringBlocks; + SmallPtrSet<MachineBasicBlock *, 16> PriorityLoweringBlocks; for (MachineBasicBlock &MBB : MF) { if (MBBInfos[&MBB].MayReachVMEMLoad) { if (MBB.succ_empty()) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index c1f1703..e393aa19 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -848,8 +848,6 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { if (Level == OptimizationLevel::O0) return; - PM.addPass(AMDGPUUnifyMetadataPass()); - // We don't want to run internalization at per-module stage. if (InternalizeSymbols && !isLTOPreLink(Phase)) { PM.addPass(InternalizePass(mustPreserveGV)); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp deleted file mode 100644 index e400491..0000000 --- a/llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp +++ /dev/null @@ -1,119 +0,0 @@ -//===- AMDGPUUnifyMetadata.cpp - Unify OpenCL metadata --------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// \file -// This pass that unifies multiple OpenCL metadata due to linking. -// -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/Module.h" -#include "llvm/IR/PassManager.h" -#include "llvm/Pass.h" - -using namespace llvm; - -namespace { - - namespace kOCLMD { - - const char SpirVer[] = "opencl.spir.version"; - const char OCLVer[] = "opencl.ocl.version"; - const char UsedExt[] = "opencl.used.extensions"; - const char UsedOptCoreFeat[] = "opencl.used.optional.core.features"; - const char CompilerOptions[] = "opencl.compiler.options"; - const char LLVMIdent[] = "llvm.ident"; - - } // end namespace kOCLMD - - /// Unify version metadata. - /// \return true if changes are made. - /// Assume the named metadata has operands each of which is a pair of - /// integer constant, e.g. 
- /// !Name = {!n1, !n2} - /// !n1 = {i32 1, i32 2} - /// !n2 = {i32 2, i32 0} - /// Keep the largest version as the sole operand if PickFirst is false. - /// Otherwise pick it from the first value, representing kernel module. - bool unifyVersionMD(Module &M, StringRef Name, bool PickFirst) { - auto *NamedMD = M.getNamedMetadata(Name); - if (!NamedMD || NamedMD->getNumOperands() <= 1) - return false; - MDNode *MaxMD = nullptr; - auto MaxVer = 0U; - for (auto *VersionMD : NamedMD->operands()) { - assert(VersionMD->getNumOperands() == 2); - auto *CMajor = mdconst::extract<ConstantInt>(VersionMD->getOperand(0)); - auto VersionMajor = CMajor->getZExtValue(); - auto *CMinor = mdconst::extract<ConstantInt>(VersionMD->getOperand(1)); - auto VersionMinor = CMinor->getZExtValue(); - auto Ver = (VersionMajor * 100) + (VersionMinor * 10); - if (Ver > MaxVer) { - MaxVer = Ver; - MaxMD = VersionMD; - } - if (PickFirst) - break; - } - NamedMD->eraseFromParent(); - NamedMD = M.getOrInsertNamedMetadata(Name); - NamedMD->addOperand(MaxMD); - return true; - } - - /// Unify version metadata. - /// \return true if changes are made. - /// Assume the named metadata has operands each of which is a list e.g. - /// !Name = {!n1, !n2} - /// !n1 = !{!"cl_khr_fp16", {!"cl_khr_fp64"}} - /// !n2 = !{!"cl_khr_image"} - /// Combine it into a single list with unique operands. - bool unifyExtensionMD(Module &M, StringRef Name) { - auto *NamedMD = M.getNamedMetadata(Name); - if (!NamedMD || NamedMD->getNumOperands() == 1) - return false; - - SmallVector<Metadata *, 4> All; - for (auto *MD : NamedMD->operands()) - for (const auto &Op : MD->operands()) - if (!llvm::is_contained(All, Op.get())) - All.push_back(Op.get()); - - NamedMD->eraseFromParent(); - NamedMD = M.getOrInsertNamedMetadata(Name); - for (const auto &MD : All) - NamedMD->addOperand(MDNode::get(M.getContext(), MD)); - - return true; - } - - /// Unify multiple OpenCL metadata due to linking. - bool unifyMetadataImpl(Module &M) { - const char *Vers[] = {kOCLMD::SpirVer, kOCLMD::OCLVer}; - const char *Exts[] = {kOCLMD::UsedExt, kOCLMD::UsedOptCoreFeat, - kOCLMD::CompilerOptions, kOCLMD::LLVMIdent}; - - bool Changed = false; - - for (auto &I : Vers) - Changed |= unifyVersionMD(M, I, true); - - for (auto &I : Exts) - Changed |= unifyExtensionMD(M, I); - - return Changed; - } - - } // end anonymous namespace - - PreservedAnalyses AMDGPUUnifyMetadataPass::run(Module &M, - ModuleAnalysisManager &AM) { - return unifyMetadataImpl(M) ? PreservedAnalyses::none() - : PreservedAnalyses::all(); - } diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 0d2feeb..9514732 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -5052,11 +5052,13 @@ bool AMDGPUAsmParser::validateDPP(const MCInst &Inst, if (DppCtrlIdx >= 0) { unsigned DppCtrl = Inst.getOperand(DppCtrlIdx).getImm(); - if (!AMDGPU::isLegalDPALU_DPPControl(DppCtrl) && - AMDGPU::isDPALU_DPP(MII.get(Opc))) { - // DP ALU DPP is supported for row_newbcast only on GFX9* + if (!AMDGPU::isLegalDPALU_DPPControl(getSTI(), DppCtrl) && + AMDGPU::isDPALU_DPP(MII.get(Opc), getSTI())) { + // DP ALU DPP is supported for row_newbcast only on GFX9* and row_share + // only on GFX12. SMLoc S = getImmLoc(AMDGPUOperand::ImmTyDppCtrl, Operands); - Error(S, "DP ALU dpp only supports row_newbcast"); + Error(S, isGFX12() ? 
"DP ALU dpp only supports row_share" + : "DP ALU dpp only supports row_newbcast"); return false; } } @@ -6268,8 +6270,9 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { PARSE_BITS_ENTRY(KD.compute_pgm_rsrc3, COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, ExprVal, ValRange); } else if (ID == ".amdhsa_workgroup_processor_mode") { - if (IVersion.Major < 10) - return Error(IDRange.Start, "directive requires gfx10+", IDRange); + if (!supportsWGP(getSTI())) + return Error(IDRange.Start, + "directive unsupported on " + getSTI().getCPU(), IDRange); PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_GFX10_PLUS_WGP_MODE, ExprVal, ValRange); diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index c466f9c..dc9dd22 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -114,7 +114,6 @@ add_llvm_target(AMDGPUCodeGen AMDGPUTargetTransformInfo.cpp AMDGPUWaitSGPRHazards.cpp AMDGPUUnifyDivergentExitNodes.cpp - AMDGPUUnifyMetadata.cpp R600MachineCFGStructurizer.cpp GCNCreateVOPD.cpp GCNDPPCombine.cpp diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index fb7d634..070de00 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -2422,8 +2422,18 @@ Expected<bool> AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC3( "must be zero on gfx10 or gfx11"); } - // Bits [14-30]. - CHECK_RESERVED_BITS_DESC_MSG(COMPUTE_PGM_RSRC3_GFX10_PLUS_RESERVED4, + // Bits [14-16] + if (isGFX1250()) { + PRINT_DIRECTIVE(".amdhsa_named_barrier_count", + COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT); + } else { + CHECK_RESERVED_BITS_DESC_MSG(COMPUTE_PGM_RSRC3_GFX10_GFX120_RESERVED4, + "COMPUTE_PGM_RSRC3", + "must be zero on gfx10+"); + } + + // Bits [17-30]. + CHECK_RESERVED_BITS_DESC_MSG(COMPUTE_PGM_RSRC3_GFX10_PLUS_RESERVED5, "COMPUTE_PGM_RSRC3", "must be zero on gfx10+"); // Bits [31]. 
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index d5d1074..f5d4384 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -1274,7 +1274,7 @@ let WaveSizePredicate = isWave64, SubtargetPredicate = isGFX12PlusNot12_50 in { defm GLOBAL_LOAD_TR_B64_w64 : FLAT_Global_Load_Pseudo <"global_load_tr_b64_w64", VGPR_32>; } -let WaveSizePredicate = isWave32, SubtargetPredicate = isGFX1250Plus in { +let WaveSizePredicate = isWave32, SubtargetPredicate = HasTransposeLoadF4F6Insts in { defm GLOBAL_LOAD_TR6_B96 : FLAT_Global_Load_Pseudo <"global_load_tr6_b96", VReg_96>; defm GLOBAL_LOAD_TR4_B64 : FLAT_Global_Load_Pseudo <"global_load_tr4_b64", VReg_64>; } diff --git a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp index f9a907a..184929a 100644 --- a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp +++ b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp @@ -421,6 +421,11 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::byte_sel)) { DPPInst.addImm(ByteSelOpr->getImm()); } + if (MachineOperand *BitOp3 = + TII->getNamedOperand(OrigMI, AMDGPU::OpName::bitop3)) { + assert(AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::bitop3)); + DPPInst.add(*BitOp3); + } } DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl)); DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask)); @@ -544,11 +549,17 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const { return false; } - if (MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO || - MovMI.getOpcode() == AMDGPU::V_MOV_B64_dpp) { - auto *DppCtrl = TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl); - assert(DppCtrl && DppCtrl->isImm()); - if (!AMDGPU::isLegalDPALU_DPPControl(DppCtrl->getImm())) { + auto *DppCtrl = TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl); + assert(DppCtrl && DppCtrl->isImm()); + unsigned DppCtrlVal = DppCtrl->getImm(); + if ((MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO || + MovMI.getOpcode() == AMDGPU::V_MOV_B64_dpp)) { + if (!ST->hasFeature(AMDGPU::FeatureDPALU_DPP)) { + LLVM_DEBUG(dbgs() << " failed: 64 bit dpp move is unsupported\n"); + // Split it. + return false; + } + if (!AMDGPU::isLegalDPALU_DPPControl(*ST, DppCtrlVal)) { LLVM_DEBUG(dbgs() << " failed: 64 bit dpp move uses unsupported" " control value\n"); // Let it split, then control may become legal. 
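The two checks added above gate 64-bit DPP moves in two steps: the subtarget must support DP ALU DPP at all (FeatureDPALU_DPP), and the specific dpp_ctrl value must be legal for DP ALU use on that subtarget; otherwise the move is left to be split into 32-bit halves. A condensed sketch of that gate, reusing the same queries as the hunk (the wrapper function itself is hypothetical and assumes the AMDGPU backend headers):

// Hypothetical condensation of the checks above; not a helper that exists in
// GCNDPPCombine. Combining a 64-bit DPP mov is only attempted when both
// conditions hold; otherwise the pseudo is split.
static bool canCombine64BitDPPMov(const GCNSubtarget &ST, unsigned DppCtrlVal) {
  if (!ST.hasFeature(AMDGPU::FeatureDPALU_DPP))
    return false; // no DP ALU DPP support at all
  return AMDGPU::isLegalDPALU_DPPControl(ST, DppCtrlVal);
}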
@@ -704,6 +715,20 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const { break; } + if (!ST->hasFeature(AMDGPU::FeatureDPALU_DPP) && + AMDGPU::isDPALU_DPP32BitOpc(OrigOp)) { + LLVM_DEBUG(dbgs() << " " << OrigMI + << " failed: DPP ALU DPP is not supported\n"); + break; + } + + if (!AMDGPU::isLegalDPALU_DPPControl(*ST, DppCtrlVal) && + AMDGPU::isDPALU_DPP(TII->get(OrigOp), *ST)) { + LLVM_DEBUG(dbgs() << " " << OrigMI + << " failed: not valid 64-bit DPP control value\n"); + break; + } + LLVM_DEBUG(dbgs() << " combining: " << OrigMI); if (Use == Src0) { if (auto *DPPInst = createDPPInst(OrigMI, MovMI, CombOldVGPR, diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index 96cb5ae..a3b64ae 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -1200,6 +1200,14 @@ void GCNHazardRecognizer::fixHazards(MachineInstr *MI) { fixShift64HighRegBug(MI); fixVALUMaskWriteHazard(MI); fixRequiredExportPriority(MI); + if (ST.requiresWaitIdleBeforeGetReg()) + fixGetRegWaitIdle(MI); + if (ST.hasDsAtomicAsyncBarrierArriveB64PipeBug()) + fixDsAtomicAsyncBarrierArriveB64(MI); + if (ST.hasScratchBaseForwardingHazard()) + fixScratchBaseForwardingHazard(MI); + if (ST.setRegModeNeedsVNOPs()) + fixSetRegMode(MI); } static bool isVCmpXWritesExec(const SIInstrInfo &TII, const SIRegisterInfo &TRI, @@ -1350,6 +1358,9 @@ bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) { return (Decoded.DsCnt == 0); } default: + assert((!SIInstrInfo::isWaitcnt(MI.getOpcode()) || + MI.getOpcode() == AMDGPU::S_WAIT_IDLE) && + "unexpected wait count instruction"); // SOPP instructions cannot mitigate the hazard. if (TII->isSOPP(MI)) return false; @@ -1731,7 +1742,7 @@ bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) { BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_WAITCNT_DEPCTR)) - .addImm(0x0fff); + .addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0)); return true; } @@ -1781,7 +1792,7 @@ bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) { if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) || (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && - I.getOperand(0).getImm() == 0x0fff)) + AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0)) return HazardExpired; // Track registers writes @@ -2239,19 +2250,7 @@ int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) { if (WaitStates >= 3 || SIInstrInfo::isVALU(MI)) return true; - switch (MI.getOpcode()) { - case AMDGPU::S_WAITCNT: - case AMDGPU::S_WAITCNT_VSCNT: - case AMDGPU::S_WAITCNT_VMCNT: - case AMDGPU::S_WAITCNT_EXPCNT: - case AMDGPU::S_WAITCNT_LGKMCNT: - case AMDGPU::S_WAIT_IDLE: - return true; - default: - break; - } - - return false; + return SIInstrInfo::isWaitcnt(MI.getOpcode()); }; return FPAtomicToDenormModeWaitStates - @@ -3428,3 +3427,125 @@ bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) { return true; } + +bool GCNHazardRecognizer::fixGetRegWaitIdle(MachineInstr *MI) { + if (!isSGetReg(MI->getOpcode())) + return false; + + const SIInstrInfo *TII = ST.getInstrInfo(); + switch (getHWReg(TII, *MI)) { + default: + return false; + case AMDGPU::Hwreg::ID_STATUS: + case AMDGPU::Hwreg::ID_STATE_PRIV: + case AMDGPU::Hwreg::ID_EXCP_FLAG_PRIV: + case AMDGPU::Hwreg::ID_EXCP_FLAG_USER: + break; + } + + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), + TII->get(AMDGPU::S_WAITCNT_DEPCTR)) + .addImm(0); + return true; 
+} + +bool GCNHazardRecognizer::fixDsAtomicAsyncBarrierArriveB64(MachineInstr *MI) { + if (MI->getOpcode() != AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64) + return false; + + const SIInstrInfo *TII = ST.getInstrInfo(); + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), + TII->get(AMDGPU::S_WAITCNT_DEPCTR)) + .addImm(0xFFE3); + BuildMI(*MI->getParent(), std::next(MI->getIterator()), MI->getDebugLoc(), + TII->get(AMDGPU::S_WAITCNT_DEPCTR)) + .addImm(0xFFE3); + + return true; +} + +bool GCNHazardRecognizer::fixScratchBaseForwardingHazard(MachineInstr *MI) { + // No reason to check this in pre-RA scheduling, SGPRs have to be allocated + // for hazard to trigger. + if (!IsHazardRecognizerMode) + return false; + + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + const SIInstrInfo *TII = ST.getInstrInfo(); + // Hazard expires after 10 SGPR writes by SALU or 8 SGPR writes by VALU. + const int FlatScrBaseWaitStates = 10; + + bool ReadsFlatScrLo = + MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, TRI); + bool ReadsFlatScrHi = + MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, TRI); + if (isSGetReg(MI->getOpcode())) { + switch (getHWReg(TII, *MI)) { + default: + break; + case AMDGPU::Hwreg::ID_FLAT_SCR_LO: + ReadsFlatScrLo = true; + break; + case AMDGPU::Hwreg::ID_FLAT_SCR_HI: + ReadsFlatScrHi = true; + break; + } + } + + const MachineRegisterInfo &MRI = MF.getRegInfo(); + + auto IsRegDefHazard = [&](Register Reg) -> bool { + DenseSet<const MachineBasicBlock *> Visited; + auto IsHazardFn = [TRI, Reg](const MachineInstr &MI) { + return MI.modifiesRegister(Reg, TRI); + }; + + // This literally abuses the idea of waitstates. Instead of waitstates it + // returns 1 for SGPR written and 0 otherwise. + auto IsSGPRDef = [TII, TRI, &MRI](const MachineInstr &MI) -> unsigned { + if (!TII->isSALU(MI) && !TII->isVALU(MI)) + return 0; + for (const MachineOperand &MO : MI.all_defs()) { + if (TRI->isSGPRReg(MRI, MO.getReg())) + return 1; + } + return 0; + }; + + auto IsExpiredFn = [=](const MachineInstr &MI, int SgprWrites) { + if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR) { + unsigned Wait = MI.getOperand(0).getImm(); + if (AMDGPU::DepCtr::decodeFieldSaSdst(Wait) == 0 && + AMDGPU::DepCtr::decodeFieldVaSdst(Wait) == 0) + return true; + } + return SgprWrites >= FlatScrBaseWaitStates; + }; + + return ::getWaitStatesSince( + IsHazardFn, MI->getParent(), std::next(MI->getReverseIterator()), + 0, IsExpiredFn, Visited, IsSGPRDef) < FlatScrBaseWaitStates; + }; + + if ((!ReadsFlatScrLo || MRI.isConstantPhysReg(AMDGPU::SGPR102) || + !IsRegDefHazard(AMDGPU::SGPR102)) && + (!ReadsFlatScrHi || MRI.isConstantPhysReg(AMDGPU::SGPR103) || + !IsRegDefHazard(AMDGPU::SGPR103))) + return false; + + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), + TII->get(AMDGPU::S_WAITCNT_DEPCTR)) + .addImm(AMDGPU::DepCtr::encodeFieldVaSdst( + AMDGPU::DepCtr::encodeFieldSaSdst(0), 0)); + return true; +} + +bool GCNHazardRecognizer::fixSetRegMode(MachineInstr *MI) { + if (!isSSetReg(MI->getOpcode()) || + MI->getOperand(1).getImm() != AMDGPU::Hwreg::ID_MODE) + return false; + + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::V_NOP_e32)); + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::V_NOP_e32)); + return true; +} diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h index f796eeae..67beffa 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h @@ -110,6 +110,10 @@ private: bool 
fixShift64HighRegBug(MachineInstr *MI); bool fixVALUMaskWriteHazard(MachineInstr *MI); bool fixRequiredExportPriority(MachineInstr *MI); + bool fixGetRegWaitIdle(MachineInstr *MI); + bool fixDsAtomicAsyncBarrierArriveB64(MachineInstr *MI); + bool fixScratchBaseForwardingHazard(MachineInstr *MI); + bool fixSetRegMode(MachineInstr *MI); int checkMAIHazards(MachineInstr *MI); int checkMAIHazards908(MachineInstr *MI); diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp index 334afd3..ef63acc 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -368,46 +368,45 @@ static LaneBitmask findUseBetween(unsigned Reg, LaneBitmask LastUseMask, //////////////////////////////////////////////////////////////////////////////// // GCNRPTarget -GCNRPTarget::GCNRPTarget(const MachineFunction &MF, const GCNRegPressure &RP, - bool CombineVGPRSavings) - : RP(RP), CombineVGPRSavings(CombineVGPRSavings) { +GCNRPTarget::GCNRPTarget(const MachineFunction &MF, const GCNRegPressure &RP) + : GCNRPTarget(RP, MF) { const Function &F = MF.getFunction(); const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); - setRegLimits(ST.getMaxNumSGPRs(F), ST.getMaxNumVGPRs(F), MF); + setTarget(ST.getMaxNumSGPRs(F), ST.getMaxNumVGPRs(F)); } GCNRPTarget::GCNRPTarget(unsigned NumSGPRs, unsigned NumVGPRs, - const MachineFunction &MF, const GCNRegPressure &RP, - bool CombineVGPRSavings) - : RP(RP), CombineVGPRSavings(CombineVGPRSavings) { - setRegLimits(NumSGPRs, NumVGPRs, MF); + const MachineFunction &MF, const GCNRegPressure &RP) + : GCNRPTarget(RP, MF) { + setTarget(NumSGPRs, NumVGPRs); } GCNRPTarget::GCNRPTarget(unsigned Occupancy, const MachineFunction &MF, - const GCNRegPressure &RP, bool CombineVGPRSavings) - : RP(RP), CombineVGPRSavings(CombineVGPRSavings) { + const GCNRegPressure &RP) + : GCNRPTarget(RP, MF) { const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); unsigned DynamicVGPRBlockSize = MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize(); - setRegLimits(ST.getMaxNumSGPRs(Occupancy, /*Addressable=*/false), - ST.getMaxNumVGPRs(Occupancy, DynamicVGPRBlockSize), MF); + setTarget(ST.getMaxNumSGPRs(Occupancy, /*Addressable=*/false), + ST.getMaxNumVGPRs(Occupancy, DynamicVGPRBlockSize)); } -void GCNRPTarget::setRegLimits(unsigned NumSGPRs, unsigned NumVGPRs, - const MachineFunction &MF) { +void GCNRPTarget::setTarget(unsigned NumSGPRs, unsigned NumVGPRs) { const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); - unsigned DynamicVGPRBlockSize = - MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize(); MaxSGPRs = std::min(ST.getAddressableNumSGPRs(), NumSGPRs); MaxVGPRs = std::min(ST.getAddressableNumArchVGPRs(), NumVGPRs); - MaxUnifiedVGPRs = - ST.hasGFX90AInsts() - ? 
std::min(ST.getAddressableNumVGPRs(DynamicVGPRBlockSize), NumVGPRs) - : 0; + if (UnifiedRF) { + unsigned DynamicVGPRBlockSize = + MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize(); + MaxUnifiedVGPRs = + std::min(ST.getAddressableNumVGPRs(DynamicVGPRBlockSize), NumVGPRs); + } else { + MaxUnifiedVGPRs = 0; + } } -bool GCNRPTarget::isSaveBeneficial(Register Reg, - const MachineRegisterInfo &MRI) const { +bool GCNRPTarget::isSaveBeneficial(Register Reg) const { + const MachineRegisterInfo &MRI = MF.getRegInfo(); const TargetRegisterClass *RC = MRI.getRegClass(Reg); const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo(); const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI); @@ -416,16 +415,19 @@ bool GCNRPTarget::isSaveBeneficial(Register Reg, return RP.getSGPRNum() > MaxSGPRs; unsigned NumVGPRs = SRI->isAGPRClass(RC) ? RP.getAGPRNum() : RP.getArchVGPRNum(); - return isVGPRBankSaveBeneficial(NumVGPRs); + // The addressable limit must always be respected. + if (NumVGPRs > MaxVGPRs) + return true; + // For unified RFs, combined VGPR usage limit must be respected as well. + return UnifiedRF && RP.getVGPRNum(true) > MaxUnifiedVGPRs; } bool GCNRPTarget::satisfied() const { - if (RP.getSGPRNum() > MaxSGPRs) + if (RP.getSGPRNum() > MaxSGPRs || RP.getVGPRNum(false) > MaxVGPRs) return false; - if (RP.getVGPRNum(false) > MaxVGPRs && - (!CombineVGPRSavings || !satisifiesVGPRBanksTarget())) + if (UnifiedRF && RP.getVGPRNum(true) > MaxUnifiedVGPRs) return false; - return satisfiesUnifiedTarget(); + return true; } /////////////////////////////////////////////////////////////////////////////// diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h index ea33a22..a9c58bb 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h @@ -186,20 +186,22 @@ public: /// Sets up the target such that the register pressure starting at \p RP does /// not show register spilling on function \p MF (w.r.t. the function's /// mininum target occupancy). - GCNRPTarget(const MachineFunction &MF, const GCNRegPressure &RP, - bool CombineVGPRSavings = false); + GCNRPTarget(const MachineFunction &MF, const GCNRegPressure &RP); /// Sets up the target such that the register pressure starting at \p RP does /// not use more than \p NumSGPRs SGPRs and \p NumVGPRs VGPRs on function \p /// MF. GCNRPTarget(unsigned NumSGPRs, unsigned NumVGPRs, const MachineFunction &MF, - const GCNRegPressure &RP, bool CombineVGPRSavings = false); + const GCNRegPressure &RP); /// Sets up the target such that the register pressure starting at \p RP does /// not prevent achieving an occupancy of at least \p Occupancy on function /// \p MF. GCNRPTarget(unsigned Occupancy, const MachineFunction &MF, - const GCNRegPressure &RP, bool CombineVGPRSavings = false); + const GCNRegPressure &RP); + + /// Changes the target (same semantics as constructor). + void setTarget(unsigned NumSGPRs, unsigned NumVGPRs); const GCNRegPressure &getCurrentRP() const { return RP; } @@ -207,7 +209,7 @@ public: /// Determines whether saving virtual register \p Reg will be beneficial /// towards achieving the RP target. - bool isSaveBeneficial(Register Reg, const MachineRegisterInfo &MRI) const; + bool isSaveBeneficial(Register Reg) const; /// Saves virtual register \p Reg with lanemask \p Mask. 
void saveReg(Register Reg, LaneBitmask Mask, const MachineRegisterInfo &MRI) { @@ -227,15 +229,15 @@ public: if (Target.MaxUnifiedVGPRs) { OS << ", " << Target.RP.getVGPRNum(true) << '/' << Target.MaxUnifiedVGPRs << " VGPRs (unified)"; - } else if (Target.CombineVGPRSavings) { - OS << ", " << Target.RP.getArchVGPRNum() + Target.RP.getAGPRNum() << '/' - << 2 * Target.MaxVGPRs << " VGPRs (combined target)"; } return OS; } #endif private: + const MachineFunction &MF; + const bool UnifiedRF; + /// Current register pressure. GCNRegPressure RP; @@ -246,29 +248,10 @@ private: /// Target number of overall VGPRs for subtargets with unified RFs. Always 0 /// for subtargets with non-unified RFs. unsigned MaxUnifiedVGPRs; - /// Whether we consider that the register allocator will be able to swap - /// between ArchVGPRs and AGPRs by copying them to a super register class. - /// Concretely, this allows savings in one of the VGPR banks to help toward - /// savings in the other VGPR bank. - bool CombineVGPRSavings; - - inline bool satisifiesVGPRBanksTarget() const { - assert(CombineVGPRSavings && "only makes sense with combined savings"); - return RP.getArchVGPRNum() + RP.getAGPRNum() <= 2 * MaxVGPRs; - } - - /// Always satisified when the subtarget doesn't have a unified RF. - inline bool satisfiesUnifiedTarget() const { - return !MaxUnifiedVGPRs || RP.getVGPRNum(true) <= MaxUnifiedVGPRs; - } - - inline bool isVGPRBankSaveBeneficial(unsigned NumVGPRs) const { - return NumVGPRs > MaxVGPRs || !satisfiesUnifiedTarget() || - (CombineVGPRSavings && !satisifiesVGPRBanksTarget()); - } - void setRegLimits(unsigned MaxSGPRs, unsigned MaxVGPRs, - const MachineFunction &MF); + GCNRPTarget(const GCNRegPressure &RP, const MachineFunction &MF) + : MF(MF), UnifiedRF(MF.getSubtarget<GCNSubtarget>().hasGFX90AInsts()), + RP(RP) {} }; /////////////////////////////////////////////////////////////////////////////// diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index 96d5668..254b75b 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -1086,7 +1086,8 @@ bool ClusteredLowOccStage::initGCNSchedStage() { } /// Allows to easily filter for this stage's debug output. -#define REMAT_DEBUG(X) LLVM_DEBUG(dbgs() << "[PreRARemat] "; X;) +#define REMAT_PREFIX "[PreRARemat] " +#define REMAT_DEBUG(X) LLVM_DEBUG(dbgs() << REMAT_PREFIX; X;) bool PreRARematStage::initGCNSchedStage() { // FIXME: This pass will invalidate cached BBLiveInMap and MBBLiveIns for @@ -1115,10 +1116,15 @@ bool PreRARematStage::initGCNSchedStage() { rematerialize(); if (GCNTrackers) DAG.RegionLiveOuts.buildLiveRegMap(); - REMAT_DEBUG( - dbgs() << "Retrying function scheduling with new min. occupancy of " - << AchievedOcc << " from rematerializing (original was " - << DAG.MinOccupancy << ", target was " << TargetOcc << ")\n"); + REMAT_DEBUG({ + dbgs() << "Retrying function scheduling with new min. 
occupancy of " + << AchievedOcc << " from rematerializing (original was " + << DAG.MinOccupancy; + if (TargetOcc) + dbgs() << ", target was " << *TargetOcc; + dbgs() << ")\n"; + }); + if (AchievedOcc > DAG.MinOccupancy) { DAG.MinOccupancy = AchievedOcc; SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); @@ -1540,8 +1546,7 @@ bool ClusteredLowOccStage::shouldRevertScheduling(unsigned WavesAfter) { bool PreRARematStage::shouldRevertScheduling(unsigned WavesAfter) { return GCNSchedStage::shouldRevertScheduling(WavesAfter) || - mayCauseSpilling(WavesAfter) || - (IncreaseOccupancy && WavesAfter < TargetOcc); + mayCauseSpilling(WavesAfter) || (TargetOcc && WavesAfter < TargetOcc); } bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) { @@ -1687,78 +1692,63 @@ bool PreRARematStage::allUsesAvailableAt(const MachineInstr *InstToRemat, } bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() { - REMAT_DEBUG({ - dbgs() << "Collecting rematerializable instructions in "; - MF.getFunction().printAsOperand(dbgs(), false); - dbgs() << '\n'; - }); + const Function &F = MF.getFunction(); // Maps optimizable regions (i.e., regions at minimum and register-limited // occupancy, or regions with spilling) to the target RP we would like to // reach. DenseMap<unsigned, GCNRPTarget> OptRegions; - const Function &F = MF.getFunction(); - unsigned DynamicVGPRBlockSize = - MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize(); - - std::pair<unsigned, unsigned> WavesPerEU = ST.getWavesPerEU(F); - const unsigned MaxSGPRsNoSpill = ST.getMaxNumSGPRs(F); - const unsigned MaxVGPRsNoSpill = ST.getMaxNumVGPRs(F); - const unsigned MaxSGPRsIncOcc = - ST.getMaxNumSGPRs(DAG.MinOccupancy + 1, false); - const unsigned MaxVGPRsIncOcc = - ST.getMaxNumVGPRs(DAG.MinOccupancy + 1, DynamicVGPRBlockSize); - IncreaseOccupancy = WavesPerEU.second > DAG.MinOccupancy; - - // Collect optimizable regions. If there is spilling in any region we will - // just try to reduce spilling. Otherwise we will try to increase occupancy by - // one in the whole function. - for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) { - GCNRegPressure &RP = DAG.Pressure[I]; - // We allow ArchVGPR or AGPR savings to count as savings of the other kind - // of VGPR only when trying to eliminate spilling. We cannot do this when - // trying to increase occupancy since VGPR class swaps only occur later in - // the register allocator i.e., the scheduler will not be able to reason - // about these savings and will not report an increase in the achievable - // occupancy, triggering rollbacks. - GCNRPTarget Target(MaxSGPRsNoSpill, MaxVGPRsNoSpill, MF, RP, - /*CombineVGPRSavings=*/true); - if (!Target.satisfied() && IncreaseOccupancy) { - // There is spilling in the region and we were so far trying to increase - // occupancy. Strop trying that and focus on reducing spilling. - IncreaseOccupancy = false; - OptRegions.clear(); - } else if (IncreaseOccupancy) { - // There is no spilling in the region, try to increase occupancy. 
- Target = GCNRPTarget(MaxSGPRsIncOcc, MaxVGPRsIncOcc, MF, RP, - /*CombineVGPRSavings=*/false); + unsigned MaxSGPRs = ST.getMaxNumSGPRs(F); + unsigned MaxVGPRs = ST.getMaxNumVGPRs(F); + auto ResetTargetRegions = [&]() { + OptRegions.clear(); + for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) { + const GCNRegPressure &RP = DAG.Pressure[I]; + GCNRPTarget Target(MaxSGPRs, MaxVGPRs, MF, RP); + if (!Target.satisfied()) + OptRegions.insert({I, Target}); } - if (!Target.satisfied()) - OptRegions.insert({I, Target}); - } - if (OptRegions.empty()) - return false; + }; -#ifndef NDEBUG - if (IncreaseOccupancy) { - REMAT_DEBUG(dbgs() << "Occupancy minimal (" << DAG.MinOccupancy - << ") in regions:\n"); + ResetTargetRegions(); + if (!OptRegions.empty() || DAG.MinOccupancy >= MFI.getMaxWavesPerEU()) { + // In addition to register usage being above addressable limits, occupancy + // below the minimum is considered like "spilling" as well. + TargetOcc = std::nullopt; } else { - REMAT_DEBUG(dbgs() << "Spilling w.r.t. minimum target occupancy (" - << WavesPerEU.first << ") in regions:\n"); - } - for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) { - if (auto OptIt = OptRegions.find(I); OptIt != OptRegions.end()) - REMAT_DEBUG(dbgs() << " [" << I << "] " << OptIt->getSecond() << '\n'); + // There is no spilling and room to improve occupancy; set up "increased + // occupancy targets" for all regions. + TargetOcc = DAG.MinOccupancy + 1; + unsigned VGPRBlockSize = + MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize(); + MaxSGPRs = ST.getMaxNumSGPRs(*TargetOcc, false); + MaxVGPRs = ST.getMaxNumVGPRs(*TargetOcc, VGPRBlockSize); + ResetTargetRegions(); } -#endif - - // When we are reducing spilling, the target is the minimum target number of - // waves/EU determined by the subtarget. In cases where either one of - // "amdgpu-num-sgpr" or "amdgpu-num-vgpr" are set on the function, the current - // minimum region occupancy may be higher than the latter. - TargetOcc = IncreaseOccupancy ? DAG.MinOccupancy + 1 - : std::max(DAG.MinOccupancy, WavesPerEU.first); + REMAT_DEBUG({ + dbgs() << "Analyzing "; + MF.getFunction().printAsOperand(dbgs(), false); + dbgs() << ": "; + if (OptRegions.empty()) { + dbgs() << "no objective to achieve, occupancy is maximal at " + << MFI.getMaxWavesPerEU(); + } else if (!TargetOcc) { + dbgs() << "reduce spilling (minimum target occupancy is " + << MFI.getMinWavesPerEU() << ')'; + } else { + dbgs() << "increase occupancy from " << DAG.MinOccupancy << " to " + << TargetOcc; + } + dbgs() << '\n'; + for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) { + if (auto OptIt = OptRegions.find(I); OptIt != OptRegions.end()) { + dbgs() << REMAT_PREFIX << " [" << I << "] " << OptIt->getSecond() + << '\n'; + } + } + }); + if (OptRegions.empty()) + return false; // Accounts for a reduction in RP in an optimizable region. 
Returns whether we // estimate that we have identified enough rematerialization opportunities to @@ -1767,7 +1757,7 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() { auto ReduceRPInRegion = [&](auto OptIt, Register Reg, LaneBitmask Mask, bool &Progress) -> bool { GCNRPTarget &Target = OptIt->getSecond(); - if (!Target.isSaveBeneficial(Reg, DAG.MRI)) + if (!Target.isSaveBeneficial(Reg)) return false; Progress = true; Target.saveReg(Reg, Mask, DAG.MRI); @@ -1876,7 +1866,7 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() { } } - if (IncreaseOccupancy) { + if (TargetOcc) { // We were trying to increase occupancy but failed, abort the stage. REMAT_DEBUG(dbgs() << "Cannot increase occupancy\n"); Rematerializations.clear(); @@ -1979,7 +1969,9 @@ void PreRARematStage::rematerialize() { // All regions impacted by at least one rematerialization must be rescheduled. // Maximum pressure must also be recomputed for all regions where it changed // non-predictably and checked against the target occupancy. - AchievedOcc = TargetOcc; + unsigned DynamicVGPRBlockSize = + MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize(); + AchievedOcc = MFI.getMaxWavesPerEU(); for (auto &[I, OriginalRP] : ImpactedRegions) { bool IsEmptyRegion = DAG.Regions[I].first == DAG.Regions[I].second; RescheduleRegions[I] = !IsEmptyRegion; @@ -2003,9 +1995,8 @@ void PreRARematStage::rematerialize() { } } DAG.Pressure[I] = RP; - AchievedOcc = std::min( - AchievedOcc, RP.getOccupancy(ST, MF.getInfo<SIMachineFunctionInfo>() - ->getDynamicVGPRBlockSize())); + AchievedOcc = + std::min(AchievedOcc, RP.getOccupancy(ST, DynamicVGPRBlockSize)); } REMAT_DEBUG(dbgs() << "Achieved occupancy " << AchievedOcc << "\n"); } @@ -2035,7 +2026,7 @@ void PreRARematStage::finalizeGCNSchedStage() { // which case we do not want to rollback either (the rescheduling was already // reverted in PreRARematStage::shouldRevertScheduling in such cases). unsigned MaxOcc = std::max(AchievedOcc, DAG.MinOccupancy); - if (!IncreaseOccupancy || MaxOcc >= TargetOcc) + if (!TargetOcc || MaxOcc >= *TargetOcc) return; REMAT_DEBUG(dbgs() << "Rolling back all rematerializations\n"); diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h index 32139a9..790370f 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h @@ -470,15 +470,12 @@ private: /// After successful stage initialization, indicates which regions should be /// rescheduled. BitVector RescheduleRegions; - /// Target occupancy the stage estimates is reachable through - /// rematerialization. Greater than or equal to the pre-stage min occupancy. - unsigned TargetOcc; + /// The target occupancy the stage is trying to achieve. Empty when the + /// objective is spilling reduction. + std::optional<unsigned> TargetOcc; /// Achieved occupancy *only* through rematerializations (pre-rescheduling). /// Smaller than or equal to the target occupancy. unsigned AchievedOcc; - /// Whether the stage is attempting to increase occupancy in the abscence of - /// spilling. - bool IncreaseOccupancy; /// Returns whether remat can reduce spilling or increase function occupancy /// by 1 through rematerialization. If it can do one, collects instructions in diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index f47ddf5..2a8385d 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -390,7 +390,11 @@ public: /// the original value. 
bool zeroesHigh16BitsOfDest(unsigned Opcode) const; - bool supportsWGP() const { return getGeneration() >= GFX10; } + bool supportsWGP() const { + if (GFX1250Insts) + return false; + return getGeneration() >= GFX10; + } bool hasIntClamp() const { return HasIntClamp; @@ -1341,6 +1345,10 @@ public: bool hasVALUReadSGPRHazard() const { return GFX12Insts && !GFX1250Insts; } + bool setRegModeNeedsVNOPs() const { + return GFX1250Insts && getGeneration() == GFX12; + } + /// Return if operations acting on VGPR tuples require even alignment. bool needsAlignedVGPRs() const { return GFX90AInsts || GFX1250Insts; } @@ -1573,6 +1581,12 @@ public: // extended VA to 57 bits. bool hasGetPCZeroExtension() const { return GFX12Insts && !GFX1250Insts; } + // \returns true if the target needs to create a prolog for backward + // compatibility when preloading kernel arguments. + bool needsKernArgPreloadProlog() const { + return hasKernargPreload() && !GFX1250Insts; + } + /// \returns SGPR allocation granularity supported by the subtarget. unsigned getSGPRAllocGranule() const { return AMDGPU::IsaInfo::getSGPRAllocGranule(this); } @@ -1722,6 +1736,10 @@ public: /// unit requirement. unsigned getMaxNumVGPRs(const MachineFunction &MF) const; + bool supportsWave32() const { return getGeneration() >= GFX10; } + + bool supportsWave64() const { return !hasGFX1250Insts(); } + bool isWave32() const { return getWavefrontSize() == 32; } @@ -1785,11 +1803,11 @@ public: // \returns true if the subtarget has a hazard requiring an "s_nop 0" // instruction before "s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)". - bool requiresNopBeforeDeallocVGPRs() const { - // Currently all targets that support the dealloc VGPRs message also require - // the nop. - return true; - } + bool requiresNopBeforeDeallocVGPRs() const { return !GFX1250Insts; } + + // \returns true if the subtarget needs S_WAIT_ALU 0 before S_GETREG_B32 on + // STATUS, STATE_PRIV, EXCP_FLAG_PRIV, or EXCP_FLAG_USER. + bool requiresWaitIdleBeforeGetReg() const { return GFX1250Insts; } bool isDynamicVGPREnabled() const { return DynamicVGPR; } unsigned getDynamicVGPRBlockSize() const { @@ -1801,6 +1819,18 @@ public: // to the same register. return false; } + + // DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64 shall not be claused with anything + // and must be surrounded by S_WAIT_ALU(0xFFE3). + bool hasDsAtomicAsyncBarrierArriveB64PipeBug() const { + return getGeneration() == GFX12; + } + + // Requires s_wait_alu(0) after s102/s103 write and src_flat_scratch_base + // read. + bool hasScratchBaseForwardingHazard() const { + return GFX1250Insts && getGeneration() == GFX12; + } }; class GCNUserSGPRUsageInfo { diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index ee8683a..aafbdc2 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -976,8 +976,10 @@ void AMDGPUInstPrinter::printDPPCtrl(const MCInst *MI, unsigned OpNo, unsigned Imm = MI->getOperand(OpNo).getImm(); const MCInstrDesc &Desc = MII.get(MI->getOpcode()); - if (!AMDGPU::isLegalDPALU_DPPControl(Imm) && AMDGPU::isDPALU_DPP(Desc)) { - O << " /* DP ALU dpp only supports row_newbcast */"; + if (!AMDGPU::isLegalDPALU_DPPControl(STI, Imm) && + AMDGPU::isDPALU_DPP(Desc, STI)) { + O << " /* DP ALU dpp only supports " + << (isGFX12(STI) ?
"row_share" : "row_newbcast") << " */"; return; } if (Imm <= DppCtrl::QUAD_PERM_LAST) { diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp index f358084..61f6732 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp @@ -389,6 +389,8 @@ void AMDGPUMCCodeEmitter::encodeInstruction(const MCInst &MI, Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_vi) && // Matrix B format operand reuses op_sel_hi. !AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::matrix_b_fmt) && + // Matrix B scale operand reuses op_sel_hi. + !AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::matrix_b_scale) && // Matrix B reuse operand reuses op_sel_hi. !AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::matrix_b_reuse)) { Encoding |= getImplicitOpSelHiEncoding(Opcode); diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index 68302f0..197de12 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -26,7 +26,6 @@ #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/AMDGPUMetadata.h" #include "llvm/Support/AMDHSAKernelDescriptor.h" -#include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/FormattedStream.h" #include "llvm/TargetParser/TargetParser.h" @@ -277,10 +276,10 @@ void AMDGPUTargetAsmStreamer::emitAMDGPULDS(MCSymbol *Symbol, unsigned Size, void AMDGPUTargetAsmStreamer::EmitMCResourceInfo( const MCSymbol *NumVGPR, const MCSymbol *NumAGPR, - const MCSymbol *NumExplicitSGPR, const MCSymbol *PrivateSegmentSize, - const MCSymbol *UsesVCC, const MCSymbol *UsesFlatScratch, - const MCSymbol *HasDynamicallySizedStack, const MCSymbol *HasRecursion, - const MCSymbol *HasIndirectCall) { + const MCSymbol *NumExplicitSGPR, const MCSymbol *NumNamedBarrier, + const MCSymbol *PrivateSegmentSize, const MCSymbol *UsesVCC, + const MCSymbol *UsesFlatScratch, const MCSymbol *HasDynamicallySizedStack, + const MCSymbol *HasRecursion, const MCSymbol *HasIndirectCall) { #define PRINT_RES_INFO(ARG) \ OS << "\t.set "; \ ARG->print(OS, getContext().getAsmInfo()); \ @@ -291,6 +290,7 @@ void AMDGPUTargetAsmStreamer::EmitMCResourceInfo( PRINT_RES_INFO(NumVGPR); PRINT_RES_INFO(NumAGPR); PRINT_RES_INFO(NumExplicitSGPR); + PRINT_RES_INFO(NumNamedBarrier); PRINT_RES_INFO(PrivateSegmentSize); PRINT_RES_INFO(UsesVCC); PRINT_RES_INFO(UsesFlatScratch); @@ -563,11 +563,12 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor( PrintField(KD.compute_pgm_rsrc3, amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT, amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, ".amdhsa_tg_split"); - if (IVersion.Major >= 10) { + if (AMDGPU::supportsWGP(STI)) PrintField(KD.compute_pgm_rsrc1, amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_WGP_MODE_SHIFT, amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_WGP_MODE, ".amdhsa_workgroup_processor_mode"); + if (IVersion.Major >= 10) { PrintField(KD.compute_pgm_rsrc1, amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_MEM_ORDERED_SHIFT, amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_MEM_ORDERED, @@ -885,7 +886,7 @@ void AMDGPUTargetELFStreamer::emitAMDGPULDS(MCSymbol *Symbol, unsigned Size, if (!SymbolELF->isBindingSet()) SymbolELF->setBinding(ELF::STB_GLOBAL); - if (SymbolELF->declareCommon(Size, Alignment, true)) { + if (SymbolELF->declareCommon(Size, Alignment)) { report_fatal_error("Symbol: " + Symbol->getName() + " 
redeclared as different type"); } diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h index 9c49020..22afcde 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h @@ -62,10 +62,10 @@ public: virtual void EmitMCResourceInfo( const MCSymbol *NumVGPR, const MCSymbol *NumAGPR, - const MCSymbol *NumExplicitSGPR, const MCSymbol *PrivateSegmentSize, - const MCSymbol *UsesVCC, const MCSymbol *UsesFlatScratch, - const MCSymbol *HasDynamicallySizedStack, const MCSymbol *HasRecursion, - const MCSymbol *HasIndirectCall) {}; + const MCSymbol *NumExplicitSGPR, const MCSymbol *NumNamedBarrier, + const MCSymbol *PrivateSegmentSize, const MCSymbol *UsesVCC, + const MCSymbol *UsesFlatScratch, const MCSymbol *HasDynamicallySizedStack, + const MCSymbol *HasRecursion, const MCSymbol *HasIndirectCall) {}; virtual void EmitMCResourceMaximums(const MCSymbol *MaxVGPR, const MCSymbol *MaxAGPR, @@ -141,14 +141,12 @@ public: void emitAMDGPULDS(MCSymbol *Sym, unsigned Size, Align Alignment) override; - void EmitMCResourceInfo(const MCSymbol *NumVGPR, const MCSymbol *NumAGPR, - const MCSymbol *NumExplicitSGPR, - const MCSymbol *PrivateSegmentSize, - const MCSymbol *UsesVCC, - const MCSymbol *UsesFlatScratch, - const MCSymbol *HasDynamicallySizedStack, - const MCSymbol *HasRecursion, - const MCSymbol *HasIndirectCall) override; + void EmitMCResourceInfo( + const MCSymbol *NumVGPR, const MCSymbol *NumAGPR, + const MCSymbol *NumExplicitSGPR, const MCSymbol *NumNamedBarrier, + const MCSymbol *PrivateSegmentSize, const MCSymbol *UsesVCC, + const MCSymbol *UsesFlatScratch, const MCSymbol *HasDynamicallySizedStack, + const MCSymbol *HasRecursion, const MCSymbol *HasIndirectCall) override; void EmitMCResourceMaximums(const MCSymbol *MaxVGPR, const MCSymbol *MaxAGPR, const MCSymbol *MaxSGPR) override; diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h index 2d0102f..7c01903 100644 --- a/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/llvm/lib/Target/AMDGPU/SIDefines.h @@ -197,7 +197,7 @@ enum ClassFlags : unsigned { namespace AMDGPU { enum OperandType : unsigned { - /// Operands with register or 32-bit immediate + /// Operands with register, 32-bit, or 64-bit immediate OPERAND_REG_IMM_INT32 = MCOI::OPERAND_FIRST_TARGET, OPERAND_REG_IMM_INT64, OPERAND_REG_IMM_INT16, @@ -407,7 +407,7 @@ enum CPol { SCAL = 1 << 11, // Scale offset bit - ALL = TH | SCOPE, + ALL = TH | SCOPE | NV, // Helper bits TH_TYPE_LOAD = 1 << 7, // TH_LOAD policy @@ -440,6 +440,7 @@ enum Id { // Message ID, width(4) [3:0]. ID_EARLY_PRIM_DEALLOC = 8, // added in GFX9, removed in GFX10 ID_GS_ALLOC_REQ = 9, // added in GFX9 ID_GET_DOORBELL = 10, // added in GFX9, removed in GFX11 + ID_SAVEWAVE_HAS_TDM = 10, // added in GFX1250 ID_GET_DDID = 11, // added in GFX10, removed in GFX11 ID_SYSMSG = 15, @@ -513,6 +514,7 @@ enum Id { // HwRegCode, (6) [5:0] ID_HW_ID2 = 24, ID_POPS_PACKER = 25, ID_PERF_SNAPSHOT_DATA_gfx11 = 27, + ID_IB_STS2 = 28, ID_SHADER_CYCLES = 29, ID_SHADER_CYCLES_HI = 30, ID_DVGPR_ALLOC_LO = 31, diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index f018f77..dce4e6f 100644 --- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -460,7 +460,7 @@ static bool hoistAndMergeSGPRInits(unsigned Reg, // List of clobbering instructions. 
SmallVector<MachineInstr*, 8> Clobbers; // List of instructions marked for deletion. - SmallSet<MachineInstr*, 8> MergedInstrs; + SmallPtrSet<MachineInstr *, 8> MergedInstrs; bool Changed = false; @@ -808,7 +808,7 @@ bool SIFixSGPRCopies::run(MachineFunction &MF) { void SIFixSGPRCopies::processPHINode(MachineInstr &MI) { bool AllAGPRUses = true; SetVector<const MachineInstr *> worklist; - SmallSet<const MachineInstr *, 4> Visited; + SmallPtrSet<const MachineInstr *, 4> Visited; SetVector<MachineInstr *> PHIOperands; worklist.insert(&MI); Visited.insert(&MI); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 5b327fb..561019b 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -3106,6 +3106,15 @@ SDValue SITargetLowering::LowerFormalArguments( if (!IsKernel) { CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg); CCInfo.AnalyzeFormalArguments(Splits, AssignFn); + + // This assumes the registers are allocated by CCInfo in ascending order + // with no gaps. + Info->setNumWaveDispatchSGPRs( + CCInfo.getFirstUnallocated(AMDGPU::SGPR_32RegClass.getRegisters())); + Info->setNumWaveDispatchVGPRs( + CCInfo.getFirstUnallocated(AMDGPU::VGPR_32RegClass.getRegisters())); + } else if (Info->getNumKernargPreloadedSGPRs()) { + Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs()); } SmallVector<SDValue, 16> Chains; @@ -6612,7 +6621,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, unsigned SplitSize = 32; if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) && ST->hasDPALU_DPP() && - AMDGPU::isLegalDPALU_DPPControl(N->getConstantOperandVal(3))) + AMDGPU::isLegalDPALU_DPPControl(*ST, N->getConstantOperandVal(3))) SplitSize = 64; auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1, @@ -10816,6 +10825,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other, Op->getOperand(2), Chain), 0); + case Intrinsic::amdgcn_s_barrier_init: case Intrinsic::amdgcn_s_barrier_signal_var: { // these two intrinsics have two operands: barrier pointer and member count SDValue Chain = Op->getOperand(0); @@ -10823,6 +10833,9 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, SDValue BarOp = Op->getOperand(2); SDValue CntOp = Op->getOperand(3); SDValue M0Val; + unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init + ? 
AMDGPU::S_BARRIER_INIT_M0 + : AMDGPU::S_BARRIER_SIGNAL_M0; // extract the BarrierID from bits 4-9 of BarOp SDValue BarID; BarID = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp, @@ -10846,8 +10859,40 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0)); - auto *NewMI = DAG.getMachineNode(AMDGPU::S_BARRIER_SIGNAL_M0, DL, - Op->getVTList(), Ops); + auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops); + return SDValue(NewMI, 0); + } + case Intrinsic::amdgcn_s_barrier_join: { + // these three intrinsics have one operand: barrier pointer + SDValue Chain = Op->getOperand(0); + SmallVector<SDValue, 2> Ops; + SDValue BarOp = Op->getOperand(2); + unsigned Opc; + + if (isa<ConstantSDNode>(BarOp)) { + uint64_t BarVal = cast<ConstantSDNode>(BarOp)->getZExtValue(); + Opc = AMDGPU::S_BARRIER_JOIN_IMM; + + // extract the BarrierID from bits 4-9 of the immediate + unsigned BarID = (BarVal >> 4) & 0x3F; + SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32); + Ops.push_back(K); + Ops.push_back(Chain); + } else { + Opc = AMDGPU::S_BARRIER_JOIN_M0; + + // extract the BarrierID from bits 4-9 of BarOp, copy to M0[5:0] + SDValue M0Val; + M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp, + DAG.getShiftAmountConstant(4, MVT::i32, DL)); + M0Val = + SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val, + DAG.getTargetConstant(0x3F, DL, MVT::i32)), + 0); + Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0)); + } + + auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops); return SDValue(NewMI, 0); } case Intrinsic::amdgcn_s_prefetch_data: { @@ -11495,9 +11540,22 @@ SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const { return FastLowered; SDLoc SL(Op); + EVT VT = Op.getValueType(); SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); + SDValue LHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, LHS); + SDValue RHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, RHS); + + if (VT == MVT::bf16) { + SDValue ExtDiv = + DAG.getNode(ISD::FDIV, SL, MVT::f32, LHSExt, RHSExt, Op->getFlags()); + return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ExtDiv, + DAG.getTargetConstant(0, SL, MVT::i32)); + } + + assert(VT == MVT::f16); + // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32 // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32 // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d @@ -11514,9 +11572,6 @@ SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const { // We will use ISD::FMA on targets that don't support ISD::FMAD. unsigned FMADOpCode = isOperationLegal(ISD::FMAD, MVT::f32) ? ISD::FMAD : ISD::FMA; - - SDValue LHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, LHS); - SDValue RHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, RHS); SDValue NegRHSExt = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHSExt); SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, RHSExt, Op->getFlags()); @@ -15684,7 +15739,7 @@ SDValue SITargetLowering::performFDivCombine(SDNode *N, SelectionDAG &DAG = DCI.DAG; SDLoc SL(N); EVT VT = N->getValueType(0); - if (VT != MVT::f16 || !Subtarget->has16BitInsts()) + if ((VT != MVT::f16 && VT != MVT::bf16) || !Subtarget->has16BitInsts()) return SDValue(); SDValue LHS = N->getOperand(0); @@ -16849,6 +16904,11 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_, const TargetRegisterClass *RC = nullptr; if (Constraint.size() == 1) { + // Check if we cannot determine the bit size of the given value type. 
This + // can happen, for example, in this situation where we have an empty struct + // (size 0): `call void asm "", "v"({} poison)`- + if (VT == MVT::Other) + return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); const unsigned BitWidth = VT.getSizeInBits(); switch (Constraint[0]) { default: @@ -16897,13 +16957,26 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_, } break; } - // We actually support i128, i16 and f16 as inline parameters - // even if they are not reported as legal - if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 || - VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16)) - return std::pair(0U, RC); + } else if (Constraint == "VA" && Subtarget->hasGFX90AInsts()) { + const unsigned BitWidth = VT.getSizeInBits(); + switch (BitWidth) { + case 16: + RC = &AMDGPU::AV_32RegClass; + break; + default: + RC = TRI->getVectorSuperClassForBitWidth(BitWidth); + if (!RC) + return std::pair(0U, nullptr); + break; + } } + // We actually support i128, i16 and f16 as inline parameters + // even if they are not reported as legal + if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 || + VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16)) + return std::pair(0U, RC); + auto [Kind, Idx, NumRegs] = AMDGPU::parseAsmConstraintPhysReg(Constraint); if (Kind != '\0') { if (Kind == 'v') { @@ -16916,7 +16989,7 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_, if (RC) { if (NumRegs > 1) { - if (Idx >= RC->getNumRegs() || Idx + NumRegs - 1 > RC->getNumRegs()) + if (Idx >= RC->getNumRegs() || Idx + NumRegs - 1 >= RC->getNumRegs()) return std::pair(0U, nullptr); uint32_t Width = NumRegs * 32; @@ -16988,6 +17061,9 @@ SITargetLowering::getConstraintType(StringRef Constraint) const { case 'a': return C_RegisterClass; } + } else if (Constraint.size() == 2) { + if (Constraint == "VA") + return C_RegisterClass; } if (isImmConstraint(Constraint)) { return C_Other; @@ -17727,23 +17803,9 @@ atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW) { /// Return if a flat address space atomicrmw can access private memory. 
static bool flatInstrMayAccessPrivate(const Instruction *I) { - const MDNode *NoaliasAddrSpaceMD = - I->getMetadata(LLVMContext::MD_noalias_addrspace); - if (!NoaliasAddrSpaceMD) - return true; - - for (unsigned I = 0, E = NoaliasAddrSpaceMD->getNumOperands() / 2; I != E; - ++I) { - auto *Low = mdconst::extract<ConstantInt>( - NoaliasAddrSpaceMD->getOperand(2 * I + 0)); - if (Low->getValue().uge(AMDGPUAS::PRIVATE_ADDRESS)) { - auto *High = mdconst::extract<ConstantInt>( - NoaliasAddrSpaceMD->getOperand(2 * I + 1)); - return High->getValue().ule(AMDGPUAS::PRIVATE_ADDRESS); - } - } - - return true; + const MDNode *MD = I->getMetadata(LLVMContext::MD_noalias_addrspace); + return !MD || + !AMDGPU::hasValueInRangeLikeMetadata(*MD, AMDGPUAS::PRIVATE_ADDRESS); } TargetLowering::AtomicExpansionKind diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 4b48fc4..343e455 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -2341,6 +2341,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, case AMDGPU::S_MEMREALTIME: case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0: case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: + case AMDGPU::S_BARRIER_LEAVE: case AMDGPU::S_GET_BARRIER_STATE_M0: case AMDGPU::S_GET_BARRIER_STATE_IMM: ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst); diff --git a/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/llvm/lib/Target/AMDGPU/SIInstrFormats.td index 89d9b0d..50964a9 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrFormats.td +++ b/llvm/lib/Target/AMDGPU/SIInstrFormats.td @@ -473,6 +473,7 @@ class VIMAGE_VSAMPLE_Common <bits<8> op> : Enc96 { let Inst{4} = r128; let Inst{5} = d16; let Inst{6} = a16; + let Inst{7} = cpol{5}; // nv let Inst{21-14} = op; let Inst{25-22} = dmask; let Inst{39-32} = vdata; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 19e6bcf..cc4bee0 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2616,9 +2616,9 @@ std::pair<MachineInstr*, MachineInstr*> SIInstrInfo::expandMovDPP64(MachineInstr &MI) const { assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO); - if (ST.hasMovB64() && + if (ST.hasMovB64() && ST.hasFeature(AMDGPU::FeatureDPALU_DPP) && AMDGPU::isLegalDPALU_DPPControl( - getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) { + ST, getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) { MI.setDesc(get(AMDGPU::V_MOV_B64_dpp)); return std::pair(&MI, nullptr); } @@ -2905,7 +2905,6 @@ void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, MachineBasicBlock &RestoreBB, const DebugLoc &DL, int64_t BrOffset, RegScavenger *RS) const { - assert(RS && "RegScavenger required for long branching"); assert(MBB.empty() && "new block should be inserted for expanding unconditional branch"); assert(MBB.pred_size() == 1); @@ -4241,6 +4240,7 @@ bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI, MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 || MI.getOpcode() == AMDGPU::S_SETREG_B32 || MI.getOpcode() == AMDGPU::S_SETPRIO || + MI.getOpcode() == AMDGPU::S_SETPRIO_INC_WG || changesVGPRIndexingMode(MI); } @@ -4267,12 +4267,15 @@ bool SIInstrInfo::mayAccessScratchThroughFlat(const MachineInstr &MI) const { if (MI.memoperands_empty()) return true; - // TODO (?): Does this need to be taught how to read noalias.addrspace ? - // See if any memory operand specifies an address space that involves scratch. 
return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) { unsigned AS = Memop->getAddrSpace(); - return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS; + if (AS == AMDGPUAS::FLAT_ADDRESS) { + const MDNode *MD = Memop->getAAInfo().NoAliasAddrSpace; + return !MD || !AMDGPU::hasValueInRangeLikeMetadata( + *MD, AMDGPUAS::PRIVATE_ADDRESS); + } + return AS == AMDGPUAS::PRIVATE_ADDRESS; }); } @@ -5433,7 +5436,8 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, } if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO && - !AMDGPU::isLegalDPALU_DPPControl(DC) && AMDGPU::isDPALU_DPP(Desc)) { + !AMDGPU::isLegalDPALU_DPPControl(ST, DC) && + AMDGPU::isDPALU_DPP(Desc, ST)) { ErrInfo = "Invalid dpp_ctrl value: " "DP ALU dpp only support row_newbcast"; return false; @@ -9225,7 +9229,7 @@ bool SIInstrInfo::isHighLatencyDef(int Opc) const { (isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc)); } -unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI, +Register SIInstrInfo::isStackAccess(const MachineInstr &MI, int &FrameIndex) const { const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr); if (!Addr || !Addr->isFI()) @@ -9238,7 +9242,7 @@ unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI, return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg(); } -unsigned SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI, +Register SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI, int &FrameIndex) const { const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr); assert(Addr && Addr->isFI()); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 6b9403f..12ffae7 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -996,6 +996,11 @@ public: bool isBarrier(unsigned Opcode) const { return isBarrierStart(Opcode) || Opcode == AMDGPU::S_BARRIER_WAIT || + Opcode == AMDGPU::S_BARRIER_INIT_M0 || + Opcode == AMDGPU::S_BARRIER_INIT_IMM || + Opcode == AMDGPU::S_BARRIER_JOIN_IMM || + Opcode == AMDGPU::S_BARRIER_LEAVE || + Opcode == AMDGPU::S_BARRIER_LEAVE_IMM || Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_BARRIER; } @@ -1051,7 +1056,7 @@ public: } } - bool isWaitcnt(unsigned Opcode) const { + static bool isWaitcnt(unsigned Opcode) { switch (getNonSoftWaitcntOpcode(Opcode)) { case AMDGPU::S_WAITCNT: case AMDGPU::S_WAITCNT_VSCNT: @@ -1402,8 +1407,8 @@ public: return get(pseudoToMCOpcode(Opcode)); } - unsigned isStackAccess(const MachineInstr &MI, int &FrameIndex) const; - unsigned isSGPRStackAccess(const MachineInstr &MI, int &FrameIndex) const; + Register isStackAccess(const MachineInstr &MI, int &FrameIndex) const; + Register isSGPRStackAccess(const MachineInstr &MI, int &FrameIndex) const; Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index c552f1a..c425d97 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -1954,6 +1954,7 @@ class getVOP3DPPSrcForVT<ValueType VT, bit IsFake16 = 1> { !eq(VT, v2f16) : VCSrc_v2f16, !eq(VT, v2bf16) : VCSrc_v2bf16, !eq(VT, f32) : VCSrc_f32, + !eq(VT, f64) : VCSrc_f64, !eq(VT, v2i32) : VCSrc_v2b32, 1 : VCSrc_b32); } @@ -2707,7 +2708,6 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> { isModifierType<Src2VT>.ret, HasOMod); field bit HasNeg = HasModifiers; - field bit HasMatrixReuse = 0; field bit HasMatrixFMT = 0; field bit 
HasMatrixScale = 0; field bit HasMatrixReuse = 0; diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index b49c5a9..e204d6b 100644 --- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -87,6 +87,8 @@ enum InstClassEnum { GLOBAL_STORE_SADDR, FLAT_LOAD, FLAT_STORE, + FLAT_LOAD_SADDR, + FLAT_STORE_SADDR, GLOBAL_LOAD, // GLOBAL_LOAD/GLOBAL_STORE are never used as the InstClass of GLOBAL_STORE // any CombineInfo, they are only ever returned by // getCommonInstClass. @@ -354,6 +356,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { case AMDGPU::GLOBAL_STORE_DWORD_SADDR: case AMDGPU::FLAT_LOAD_DWORD: case AMDGPU::FLAT_STORE_DWORD: + case AMDGPU::FLAT_LOAD_DWORD_SADDR: + case AMDGPU::FLAT_STORE_DWORD_SADDR: return 1; case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM: @@ -367,6 +371,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: case AMDGPU::FLAT_LOAD_DWORDX2: case AMDGPU::FLAT_STORE_DWORDX2: + case AMDGPU::FLAT_LOAD_DWORDX2_SADDR: + case AMDGPU::FLAT_STORE_DWORDX2_SADDR: return 2; case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM: @@ -380,6 +386,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: case AMDGPU::FLAT_LOAD_DWORDX3: case AMDGPU::FLAT_STORE_DWORDX3: + case AMDGPU::FLAT_LOAD_DWORDX3_SADDR: + case AMDGPU::FLAT_STORE_DWORDX3_SADDR: return 3; case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM: @@ -393,6 +401,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: case AMDGPU::FLAT_LOAD_DWORDX4: case AMDGPU::FLAT_STORE_DWORDX4: + case AMDGPU::FLAT_LOAD_DWORDX4_SADDR: + case AMDGPU::FLAT_STORE_DWORDX4_SADDR: return 4; case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM: @@ -575,6 +585,16 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) { case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: return GLOBAL_STORE_SADDR; + case AMDGPU::FLAT_LOAD_DWORD_SADDR: + case AMDGPU::FLAT_LOAD_DWORDX2_SADDR: + case AMDGPU::FLAT_LOAD_DWORDX3_SADDR: + case AMDGPU::FLAT_LOAD_DWORDX4_SADDR: + return FLAT_LOAD_SADDR; + case AMDGPU::FLAT_STORE_DWORD_SADDR: + case AMDGPU::FLAT_STORE_DWORDX2_SADDR: + case AMDGPU::FLAT_STORE_DWORDX3_SADDR: + case AMDGPU::FLAT_STORE_DWORDX4_SADDR: + return FLAT_STORE_SADDR; } } @@ -661,6 +681,16 @@ static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) { case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: return AMDGPU::GLOBAL_STORE_DWORD_SADDR; + case AMDGPU::FLAT_LOAD_DWORD_SADDR: + case AMDGPU::FLAT_LOAD_DWORDX2_SADDR: + case AMDGPU::FLAT_LOAD_DWORDX3_SADDR: + case AMDGPU::FLAT_LOAD_DWORDX4_SADDR: + return AMDGPU::FLAT_LOAD_DWORD_SADDR; + case AMDGPU::FLAT_STORE_DWORD_SADDR: + case AMDGPU::FLAT_STORE_DWORDX2_SADDR: + case AMDGPU::FLAT_STORE_DWORDX3_SADDR: + case AMDGPU::FLAT_STORE_DWORDX4_SADDR: + return AMDGPU::FLAT_STORE_DWORD_SADDR; } } @@ -776,6 +806,14 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) { case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: + case 
AMDGPU::FLAT_LOAD_DWORD_SADDR: + case AMDGPU::FLAT_LOAD_DWORDX2_SADDR: + case AMDGPU::FLAT_LOAD_DWORDX3_SADDR: + case AMDGPU::FLAT_LOAD_DWORDX4_SADDR: + case AMDGPU::FLAT_STORE_DWORD_SADDR: + case AMDGPU::FLAT_STORE_DWORDX2_SADDR: + case AMDGPU::FLAT_STORE_DWORDX3_SADDR: + case AMDGPU::FLAT_STORE_DWORDX4_SADDR: Result.SAddr = true; [[fallthrough]]; case AMDGPU::GLOBAL_LOAD_DWORD: @@ -1875,6 +1913,28 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI, case 4: return AMDGPU::FLAT_STORE_DWORDX4; } + case FLAT_LOAD_SADDR: + switch (Width) { + default: + return 0; + case 2: + return AMDGPU::FLAT_LOAD_DWORDX2_SADDR; + case 3: + return AMDGPU::FLAT_LOAD_DWORDX3_SADDR; + case 4: + return AMDGPU::FLAT_LOAD_DWORDX4_SADDR; + } + case FLAT_STORE_SADDR: + switch (Width) { + default: + return 0; + case 2: + return AMDGPU::FLAT_STORE_DWORDX2_SADDR; + case 3: + return AMDGPU::FLAT_STORE_DWORDX3_SADDR; + case 4: + return AMDGPU::FLAT_STORE_DWORDX4_SADDR; + } case MIMG: assert(((unsigned)llvm::popcount(CI.DMask | Paired.DMask) == Width) && "No overlaps"); @@ -2508,12 +2568,14 @@ SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr( OptimizeListAgain |= CI.Width + Paired.Width < 4; break; case FLAT_LOAD: + case FLAT_LOAD_SADDR: case GLOBAL_LOAD: case GLOBAL_LOAD_SADDR: NewMI = mergeFlatLoadPair(CI, Paired, Where->I); OptimizeListAgain |= CI.Width + Paired.Width < 4; break; case FLAT_STORE: + case FLAT_STORE_SADDR: case GLOBAL_STORE: case GLOBAL_STORE_SADDR: NewMI = mergeFlatStorePair(CI, Paired, Where->I); diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp index f8878f3..e97536d 100644 --- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -57,6 +57,7 @@ #include "llvm/CodeGen/LiveVariables.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/Target/TargetMachine.h" using namespace llvm; @@ -76,10 +77,11 @@ private: LiveIntervals *LIS = nullptr; LiveVariables *LV = nullptr; MachineDominatorTree *MDT = nullptr; + MachinePostDominatorTree *PDT = nullptr; MachineRegisterInfo *MRI = nullptr; SetVector<MachineInstr*> LoweredEndCf; DenseSet<Register> LoweredIf; - SmallSet<MachineBasicBlock *, 4> KillBlocks; + SmallPtrSet<MachineBasicBlock *, 4> KillBlocks; SmallSet<Register, 8> RecomputeRegs; const TargetRegisterClass *BoolRC = nullptr; @@ -138,8 +140,8 @@ private: public: SILowerControlFlow(LiveIntervals *LIS, LiveVariables *LV, - MachineDominatorTree *MDT) - : LIS(LIS), LV(LV), MDT(MDT) {} + MachineDominatorTree *MDT, MachinePostDominatorTree *PDT) + : LIS(LIS), LV(LV), MDT(MDT), PDT(PDT) {} bool run(MachineFunction &MF); }; @@ -159,6 +161,7 @@ public: AU.addUsedIfAvailable<LiveIntervalsWrapperPass>(); // Should preserve the same set that TwoAddressInstructions does. 
AU.addPreserved<MachineDominatorTreeWrapperPass>(); + AU.addPreserved<MachinePostDominatorTreeWrapperPass>(); AU.addPreserved<SlotIndexesWrapperPass>(); AU.addPreserved<LiveIntervalsWrapperPass>(); AU.addPreserved<LiveVariablesWrapperPass>(); @@ -457,7 +460,7 @@ MachineBasicBlock::iterator SILowerControlFlow::skipIgnoreExecInstsTrivialSucc( MachineBasicBlock &MBB, MachineBasicBlock::iterator It) const { - SmallSet<const MachineBasicBlock *, 4> Visited; + SmallPtrSet<const MachineBasicBlock *, 4> Visited; MachineBasicBlock *B = &MBB; do { if (!Visited.insert(B).second) @@ -506,13 +509,18 @@ MachineBasicBlock *SILowerControlFlow::emitEndCf(MachineInstr &MI) { MachineBasicBlock *SplitBB = &MBB; if (NeedBlockSplit) { SplitBB = MBB.splitAt(MI, /*UpdateLiveIns*/true, LIS); - if (MDT && SplitBB != &MBB) { - MachineDomTreeNode *MBBNode = (*MDT)[&MBB]; - SmallVector<MachineDomTreeNode *> Children(MBBNode->begin(), - MBBNode->end()); - MachineDomTreeNode *SplitBBNode = MDT->addNewBlock(SplitBB, &MBB); - for (MachineDomTreeNode *Child : Children) - MDT->changeImmediateDominator(Child, SplitBBNode); + if (SplitBB != &MBB && (MDT || PDT)) { + using DomTreeT = DomTreeBase<MachineBasicBlock>; + SmallVector<DomTreeT::UpdateType, 16> DTUpdates; + for (MachineBasicBlock *Succ : SplitBB->successors()) { + DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ}); + DTUpdates.push_back({DomTreeT::Delete, &MBB, Succ}); + } + DTUpdates.push_back({DomTreeT::Insert, &MBB, SplitBB}); + if (MDT) + MDT->applyUpdates(DTUpdates); + if (PDT) + PDT->applyUpdates(DTUpdates); } Opcode = OrTermrOpc; InsPt = MI; @@ -727,26 +735,27 @@ bool SILowerControlFlow::removeMBBifRedundant(MachineBasicBlock &MBB) { MachineBasicBlock *Succ = *MBB.succ_begin(); MachineBasicBlock *FallThrough = nullptr; + using DomTreeT = DomTreeBase<MachineBasicBlock>; + SmallVector<DomTreeT::UpdateType, 8> DTUpdates; + while (!MBB.predecessors().empty()) { MachineBasicBlock *P = *MBB.pred_begin(); if (P->getFallThrough(false) == &MBB) FallThrough = P; P->ReplaceUsesOfBlockWith(&MBB, Succ); + DTUpdates.push_back({DomTreeT::Insert, P, Succ}); + DTUpdates.push_back({DomTreeT::Delete, P, &MBB}); } MBB.removeSuccessor(Succ); if (LIS) { for (auto &I : MBB.instrs()) LIS->RemoveMachineInstrFromMaps(I); } - if (MDT) { - // If Succ, the single successor of MBB, is dominated by MBB, MDT needs - // updating by changing Succ's idom to the one of MBB; otherwise, MBB must - // be a leaf node in MDT and could be erased directly. - if (MDT->dominates(&MBB, Succ)) - MDT->changeImmediateDominator(MDT->getNode(Succ), - MDT->getNode(&MBB)->getIDom()); - MDT->eraseNode(&MBB); - } + if (MDT) + MDT->applyUpdates(DTUpdates); + if (PDT) + PDT->applyUpdates(DTUpdates); + MBB.clear(); MBB.eraseFromParent(); if (FallThrough && !FallThrough->isLayoutSuccessor(Succ)) { @@ -875,7 +884,11 @@ bool SILowerControlFlowLegacy::runOnMachineFunction(MachineFunction &MF) { LiveVariables *LV = LVWrapper ? &LVWrapper->getLV() : nullptr; auto *MDTWrapper = getAnalysisIfAvailable<MachineDominatorTreeWrapperPass>(); MachineDominatorTree *MDT = MDTWrapper ? &MDTWrapper->getDomTree() : nullptr; - return SILowerControlFlow(LIS, LV, MDT).run(MF); + auto *PDTWrapper = + getAnalysisIfAvailable<MachinePostDominatorTreeWrapperPass>(); + MachinePostDominatorTree *PDT = + PDTWrapper ? 
&PDTWrapper->getPostDomTree() : nullptr; + return SILowerControlFlow(LIS, LV, MDT, PDT).run(MF); } PreservedAnalyses @@ -885,13 +898,16 @@ SILowerControlFlowPass::run(MachineFunction &MF, LiveVariables *LV = MFAM.getCachedResult<LiveVariablesAnalysis>(MF); MachineDominatorTree *MDT = MFAM.getCachedResult<MachineDominatorTreeAnalysis>(MF); + MachinePostDominatorTree *PDT = + MFAM.getCachedResult<MachinePostDominatorTreeAnalysis>(MF); - bool Changed = SILowerControlFlow(LIS, LV, MDT).run(MF); + bool Changed = SILowerControlFlow(LIS, LV, MDT, PDT).run(MF); if (!Changed) return PreservedAnalyses::all(); auto PA = getMachineFunctionPassPreservedAnalyses(); PA.preserve<MachineDominatorTreeAnalysis>(); + PA.preserve<MachinePostDominatorTreeAnalysis>(); PA.preserve<SlotIndexesAnalysis>(); PA.preserve<LiveIntervalsAnalysis>(); PA.preserve<LiveVariablesAnalysis>(); diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp index 9509199..09b737c 100644 --- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp @@ -209,10 +209,13 @@ void SILowerSGPRSpills::calculateSaveRestoreBlocks(MachineFunction &MF) { // So set the save points for those. // Use the points found by shrink-wrapping, if any. - if (MFI.getSavePoint()) { - SaveBlocks.push_back(MFI.getSavePoint()); - assert(MFI.getRestorePoint() && "Both restore and save must be set"); - MachineBasicBlock *RestoreBlock = MFI.getRestorePoint(); + if (!MFI.getSavePoints().empty()) { + assert(MFI.getSavePoints().size() == 1 && + "Multiple save points not yet supported!"); + SaveBlocks.push_back(MFI.getSavePoints().front()); + assert(MFI.getRestorePoints().size() == 1 && + "Multiple restore points not yet supported!"); + MachineBasicBlock *RestoreBlock = MFI.getRestorePoints().front(); // If RestoreBlock does not have any successor and is not a return block // then the end point is unreachable and we do not need to insert any // epilogue. diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 9a1448f..8a11203 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -33,7 +33,7 @@ using namespace llvm; // optimal RC for Opc and Dest of MFMA. In particular, there are high RP cases // where it is better to produce the VGPR form (e.g. if there are VGPR users // of the MFMA result). -cl::opt<bool> MFMAVGPRForm( +static cl::opt<bool> MFMAVGPRForm( "amdgpu-mfma-vgpr-form", cl::Hidden, cl::desc("Whether to force use VGPR for Opc and Dest of MFMA. 
If " "unspecified, default to compiler heuristics"), @@ -728,6 +728,8 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo( MemoryBound(MFI.isMemoryBound()), WaveLimiter(MFI.needsWaveLimiter()), HasSpilledSGPRs(MFI.hasSpilledSGPRs()), HasSpilledVGPRs(MFI.hasSpilledVGPRs()), + NumWaveDispatchSGPRs(MFI.getNumWaveDispatchSGPRs()), + NumWaveDispatchVGPRs(MFI.getNumWaveDispatchVGPRs()), HighBitsOf32BitAddress(MFI.get32BitAddressHighBits()), Occupancy(MFI.getOccupancy()), ScratchRSrcReg(regToString(MFI.getScratchRSrcReg(), TRI)), @@ -784,6 +786,8 @@ bool SIMachineFunctionInfo::initializeBaseYamlFields( WaveLimiter = YamlMFI.WaveLimiter; HasSpilledSGPRs = YamlMFI.HasSpilledSGPRs; HasSpilledVGPRs = YamlMFI.HasSpilledVGPRs; + NumWaveDispatchSGPRs = YamlMFI.NumWaveDispatchSGPRs; + NumWaveDispatchVGPRs = YamlMFI.NumWaveDispatchVGPRs; BytesInStackArgArea = YamlMFI.BytesInStackArgArea; ReturnsVoid = YamlMFI.ReturnsVoid; IsWholeWaveFunction = YamlMFI.IsWholeWaveFunction; diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 08b0206..ca8f803 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -270,6 +270,8 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo { bool WaveLimiter = false; bool HasSpilledSGPRs = false; bool HasSpilledVGPRs = false; + uint16_t NumWaveDispatchSGPRs = 0; + uint16_t NumWaveDispatchVGPRs = 0; uint32_t HighBitsOf32BitAddress = 0; // TODO: 10 may be a better default since it's the maximum. @@ -327,6 +329,8 @@ template <> struct MappingTraits<SIMachineFunctionInfo> { YamlIO.mapOptional("waveLimiter", MFI.WaveLimiter, false); YamlIO.mapOptional("hasSpilledSGPRs", MFI.HasSpilledSGPRs, false); YamlIO.mapOptional("hasSpilledVGPRs", MFI.HasSpilledVGPRs, false); + YamlIO.mapOptional("numWaveDispatchSGPRs", MFI.NumWaveDispatchSGPRs, false); + YamlIO.mapOptional("numWaveDispatchVGPRs", MFI.NumWaveDispatchVGPRs, false); YamlIO.mapOptional("scratchRSrcReg", MFI.ScratchRSrcReg, StringValue("$private_rsrc_reg")); YamlIO.mapOptional("frameOffsetReg", MFI.FrameOffsetReg, @@ -465,6 +469,9 @@ private: unsigned NumUserSGPRs = 0; unsigned NumSystemSGPRs = 0; + unsigned NumWaveDispatchSGPRs = 0; + unsigned NumWaveDispatchVGPRs = 0; + bool HasSpilledSGPRs = false; bool HasSpilledVGPRs = false; bool HasNonSpillStackObjects = false; @@ -991,6 +998,14 @@ public: return UserSGPRInfo.getNumKernargPreloadSGPRs(); } + unsigned getNumWaveDispatchSGPRs() const { return NumWaveDispatchSGPRs; } + + void setNumWaveDispatchSGPRs(unsigned Count) { NumWaveDispatchSGPRs = Count; } + + unsigned getNumWaveDispatchVGPRs() const { return NumWaveDispatchVGPRs; } + + void setNumWaveDispatchVGPRs(unsigned Count) { NumWaveDispatchVGPRs = Count; } + Register getPrivateSegmentWaveByteOffsetSystemSGPR() const { return ArgInfo.PrivateSegmentWaveByteOffset.getRegister(); } diff --git a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp index 205a45a..38d9a4b 100644 --- a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp +++ b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp @@ -130,6 +130,9 @@ void SIPreAllocateWWMRegs::rewriteRegs(MachineFunction &MF) { if (VirtReg.isPhysical()) continue; + if (!VirtReg.isValid()) + continue; + if (!VRM->hasPhys(VirtReg)) continue; diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp index 5940f45..93ba0a3 100644 --- a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp +++ 
b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp @@ -73,6 +73,7 @@ void SIProgramInfo::reset(const MachineFunction &MF) { NumSGPRsForWavesPerEU = ZeroExpr; NumVGPRsForWavesPerEU = ZeroExpr; + NamedBarCnt = ZeroExpr; Occupancy = ZeroExpr; DynamicCallStack = ZeroExpr; VCCUsed = ZeroExpr; diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.h b/llvm/lib/Target/AMDGPU/SIProgramInfo.h index 79099d2..171c4a3 100644 --- a/llvm/lib/Target/AMDGPU/SIProgramInfo.h +++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.h @@ -83,6 +83,9 @@ struct LLVM_EXTERNAL_VISIBILITY SIProgramInfo { // Number of VGPRs that meets number of waves per execution unit request. const MCExpr *NumVGPRsForWavesPerEU = nullptr; + // Number of named barriers used by the kernel. + const MCExpr *NamedBarCnt = nullptr; + // Final occupancy. const MCExpr *Occupancy = nullptr; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index 81655f5..0293d40 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -1166,7 +1166,8 @@ class RegOrImmOperand <RegisterClass RegClass, string OperandTypeName> } //===----------------------------------------------------------------------===// -// SSrc_* Operands with an SGPR or a 32-bit immediate +// SSrc_* Operands with an SGPR, a 32-bit immediate, or 64-bit immediate +// if supported by target. //===----------------------------------------------------------------------===// class SrcRegOrImm9<RegisterClass regClass, string operandType> diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index 431d73b..a003a46 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -484,6 +484,24 @@ def S_BARRIER_SIGNAL_ISFIRST_M0 : SOP1_Pseudo <"s_barrier_signal_isfirst m0", (o let isConvergent = 1; } +def S_BARRIER_INIT_M0 : SOP1_Pseudo <"s_barrier_init m0", (outs), (ins), + "", []>{ + let SchedRW = [WriteBarrier]; + let isConvergent = 1; +} + +def S_BARRIER_INIT_IMM : SOP1_Pseudo <"s_barrier_init", (outs), + (ins SplitBarrier:$src0), "$src0", []>{ + let SchedRW = [WriteBarrier]; + let isConvergent = 1; +} + +def S_BARRIER_JOIN_M0 : SOP1_Pseudo <"s_barrier_join m0", (outs), (ins), + "", []>{ + let SchedRW = [WriteBarrier]; + let isConvergent = 1; +} + } // End Uses = [M0] def S_BARRIER_SIGNAL_IMM : SOP1_Pseudo <"s_barrier_signal", (outs), @@ -501,6 +519,12 @@ def S_BARRIER_SIGNAL_ISFIRST_IMM : SOP1_Pseudo <"s_barrier_signal_isfirst", (out let isConvergent = 1; } +def S_BARRIER_JOIN_IMM : SOP1_Pseudo <"s_barrier_join", (outs), + (ins SplitBarrier:$src0), "$src0", []>{ + let SchedRW = [WriteBarrier]; + let isConvergent = 1; +} + } // End has_sdst = 0 def S_GET_BARRIER_STATE_IMM : SOP1_Pseudo <"s_get_barrier_state", (outs SSrc_b32:$sdst), @@ -1588,6 +1612,17 @@ def S_BARRIER_WAIT : SOPP_Pseudo <"s_barrier_wait", (ins i16imm:$simm16), "$simm let isConvergent = 1; } +def S_BARRIER_LEAVE : SOPP_Pseudo <"s_barrier_leave", (ins)> { + let SchedRW = [WriteBarrier]; + let simm16 = 0; + let fixed_imm = 1; + let isConvergent = 1; + let Defs = [SCC]; +} + +def S_BARRIER_LEAVE_IMM : SOPP_Pseudo <"s_barrier_leave", + (ins i16imm:$simm16), "$simm16", [(int_amdgcn_s_barrier_leave timm:$simm16)]>; + def S_WAKEUP : SOPP_Pseudo <"s_wakeup", (ins) > { let SubtargetPredicate = isGFX8Plus; let simm16 = 0; @@ -1630,7 +1665,9 @@ def S_WAITCNT_lds_direct : SPseudoInstSI<(outs), (ins)> { def S_SETHALT : SOPP_Pseudo <"s_sethalt" , (ins i32imm:$simm16), "$simm16", 
[(int_amdgcn_s_sethalt timm:$simm16)]>; -def S_SETKILL : SOPP_Pseudo <"s_setkill" , (ins i16imm:$simm16), "$simm16">; +def S_SETKILL : SOPP_Pseudo <"s_setkill" , (ins i16imm:$simm16), "$simm16"> { + let SubtargetPredicate = isNotGFX1250Plus; +} // On SI the documentation says sleep for approximately 64 * low 2 // bits, consistent with the reported maximum of 448. On VI the @@ -2144,9 +2181,13 @@ defm S_SENDMSG_RTN_B64 : SOP1_Real_gfx11_gfx12<0x04d>; defm S_BARRIER_SIGNAL_M0 : SOP1_M0_Real_gfx12<0x04e>; defm S_BARRIER_SIGNAL_ISFIRST_M0 : SOP1_M0_Real_gfx12<0x04f>; defm S_GET_BARRIER_STATE_M0 : SOP1_M0_Real_gfx12<0x050>; +defm S_BARRIER_INIT_M0 : SOP1_M0_Real_gfx12<0x051>; +defm S_BARRIER_JOIN_M0 : SOP1_M0_Real_gfx12<0x052>; defm S_BARRIER_SIGNAL_IMM : SOP1_IMM_Real_gfx12<0x04e>; defm S_BARRIER_SIGNAL_ISFIRST_IMM : SOP1_IMM_Real_gfx12<0x04f>; defm S_GET_BARRIER_STATE_IMM : SOP1_IMM_Real_gfx12<0x050>; +defm S_BARRIER_INIT_IMM : SOP1_IMM_Real_gfx12<0x051>; +defm S_BARRIER_JOIN_IMM : SOP1_IMM_Real_gfx12<0x052>; defm S_ALLOC_VGPR : SOP1_Real_gfx12<0x053>; defm S_SLEEP_VAR : SOP1_IMM_Real_gfx12<0x058>; @@ -2639,6 +2680,7 @@ multiclass SOPP_Real_32_gfx12<bits<7> op, string name = !tolower(NAME)> { } defm S_BARRIER_WAIT : SOPP_Real_32_gfx12<0x014>; +defm S_BARRIER_LEAVE : SOPP_Real_32_gfx12<0x015>; defm S_WAIT_LOADCNT : SOPP_Real_32_gfx12<0x040>; defm S_WAIT_STORECNT : SOPP_Real_32_gfx12<0x041>; defm S_WAIT_SAMPLECNT : SOPP_Real_32_gfx12<0x042>; diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp index 3d9455f..c740b5e 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp @@ -106,7 +106,7 @@ static constexpr CustomOperand MsgOperands[] = { {{"MSG_GET_DDID"}, ID_GET_DDID, isGFX10}, {{"MSG_HS_TESSFACTOR"}, ID_HS_TESSFACTOR_GFX11Plus, isGFX11Plus}, {{"MSG_DEALLOC_VGPRS"}, ID_DEALLOC_VGPRS_GFX11Plus, isGFX11Plus}, - {{""}}, + {{"MSG_SAVEWAVE_HAS_TDM"}, ID_SAVEWAVE_HAS_TDM, isGFX1250}, {{"MSG_SYSMSG"}, ID_SYSMSG}, {{"MSG_RTN_GET_DOORBELL"}, ID_RTN_GET_DOORBELL, isGFX11Plus}, {{"MSG_RTN_GET_DDID"}, ID_RTN_GET_DDID, isGFX11Plus}, @@ -195,7 +195,7 @@ static constexpr CustomOperand Operands[] = { {{"HW_REG_POPS_PACKER"}, ID_POPS_PACKER, isGFX10}, {{""}}, {{"HW_REG_PERF_SNAPSHOT_DATA"}, ID_PERF_SNAPSHOT_DATA_gfx11, isGFX11}, - {{""}}, + {{"HW_REG_IB_STS2"}, ID_IB_STS2, isGFX1250}, {{"HW_REG_SHADER_CYCLES"}, ID_SHADER_CYCLES, isGFX10_3_GFX11}, {{"HW_REG_SHADER_CYCLES_HI"}, ID_SHADER_CYCLES_HI, isGFX12Plus}, {{"HW_REG_DVGPR_ALLOC_LO"}, ID_DVGPR_ALLOC_LO, isGFX12Plus}, diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 1e3e9a2..6e4e087 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -21,6 +21,7 @@ #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/IntrinsicsR600.h" #include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Metadata.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSubtargetInfo.h" @@ -1160,17 +1161,28 @@ unsigned getAddressableLocalMemorySize(const MCSubtargetInfo *STI) { return 65536; if (STI->getFeatureBits().test(FeatureAddressableLocalMemorySize163840)) return 163840; + if (STI->getFeatureBits().test(FeatureAddressableLocalMemorySize327680)) + return 327680; return 0; } unsigned getEUsPerCU(const MCSubtargetInfo *STI) { // "Per CU" really means "per whatever functional block the waves of a - // 
workgroup must share". For gfx10 in CU mode this is the CU, which contains + // workgroup must share". + + // GFX12.5 only supports CU mode, which contains four SIMDs. + if (isGFX1250(*STI)) { + assert(STI->getFeatureBits().test(FeatureCuMode)); + return 4; + } + + // For gfx10 in CU mode the functional block is the CU, which contains // two SIMDs. if (isGFX10Plus(*STI) && STI->getFeatureBits().test(FeatureCuMode)) return 2; - // Pre-gfx10 a CU contains four SIMDs. For gfx10 in WGP mode the WGP contains - // two CUs, so a total of four SIMDs. + + // Pre-gfx10 a CU contains four SIMDs. For gfx10 in WGP mode the WGP + // contains two CUs, so a total of four SIMDs. return 4; } @@ -1666,6 +1678,29 @@ getIntegerVecAttribute(const Function &F, StringRef Name, unsigned Size) { return Vals; } +bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val) { + assert((MD.getNumOperands() % 2 == 0) && "invalid number of operands!"); + for (unsigned I = 0, E = MD.getNumOperands() / 2; I != E; ++I) { + auto Low = + mdconst::extract<ConstantInt>(MD.getOperand(2 * I + 0))->getValue(); + auto High = + mdconst::extract<ConstantInt>(MD.getOperand(2 * I + 1))->getValue(); + // There are two types of [A; B) ranges: + // A < B, e.g. [4; 5) which is a range that only includes 4. + // A > B, e.g. [5; 4) which is a range that wraps around and includes + // everything except 4. + if (Low.ult(High)) { + if (Low.ule(Val) && High.ugt(Val)) + return true; + } else { + if (Low.uge(Val) && High.ult(Val)) + return true; + } + } + + return false; +} + unsigned getVmcntBitMask(const IsaVersion &Version) { return (1 << (getVmcntBitWidthLo(Version.Major) + getVmcntBitWidthHi(Version.Major))) - @@ -2406,7 +2441,11 @@ unsigned getNSAMaxSize(const MCSubtargetInfo &STI, bool HasSampler) { return 0; } -unsigned getMaxNumUserSGPRs(const MCSubtargetInfo &STI) { return 16; } +unsigned getMaxNumUserSGPRs(const MCSubtargetInfo &STI) { + if (isGFX1250(STI)) + return 32; + return 16; +} bool isSI(const MCSubtargetInfo &STI) { return STI.hasFeature(AMDGPU::FeatureSouthernIslands); @@ -2478,6 +2517,12 @@ bool isGFX1250(const MCSubtargetInfo &STI) { return STI.getFeatureBits()[AMDGPU::FeatureGFX1250Insts]; } +bool supportsWGP(const MCSubtargetInfo &STI) { + if (isGFX1250(STI)) + return false; + return isGFX10Plus(STI); +} + bool isNotGFX11Plus(const MCSubtargetInfo &STI) { return !isGFX11Plus(STI); } bool isNotGFX10Plus(const MCSubtargetInfo &STI) { @@ -3309,13 +3354,39 @@ bool hasAny64BitVGPROperands(const MCInstrDesc &OpDesc) { return false; } -bool isDPALU_DPP(const MCInstrDesc &OpDesc) { +bool isDPALU_DPP32BitOpc(unsigned Opc) { + switch (Opc) { + case AMDGPU::V_MUL_LO_U32_e64: + case AMDGPU::V_MUL_LO_U32_e64_dpp: + case AMDGPU::V_MUL_LO_U32_e64_dpp_gfx1250: + case AMDGPU::V_MUL_HI_U32_e64: + case AMDGPU::V_MUL_HI_U32_e64_dpp: + case AMDGPU::V_MUL_HI_U32_e64_dpp_gfx1250: + case AMDGPU::V_MUL_HI_I32_e64: + case AMDGPU::V_MUL_HI_I32_e64_dpp: + case AMDGPU::V_MUL_HI_I32_e64_dpp_gfx1250: + case AMDGPU::V_MAD_U32_e64: + case AMDGPU::V_MAD_U32_e64_dpp: + case AMDGPU::V_MAD_U32_e64_dpp_gfx1250: + return true; + default: + return false; + } +} + +bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCSubtargetInfo &ST) { + if (!ST.hasFeature(AMDGPU::FeatureDPALU_DPP)) + return false; + + if (isDPALU_DPP32BitOpc(OpDesc.getOpcode())) + return ST.hasFeature(AMDGPU::FeatureGFX1250Insts); + return hasAny64BitVGPROperands(OpDesc); } unsigned getLdsDwGranularity(const MCSubtargetInfo &ST) { - // Currently this is 128 for all subtargets - return 128; + 
return ST.hasFeature(AMDGPU::FeatureAddressableLocalMemorySize327680) ? 256 + : 128; } bool isPackedFP32Inst(unsigned Opc) { diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 1bcd36c..70dfb63 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -35,6 +35,7 @@ class MCInstrInfo; class MCRegisterClass; class MCRegisterInfo; class MCSubtargetInfo; +class MDNode; class StringRef; class Triple; class raw_ostream; @@ -1064,6 +1065,9 @@ SmallVector<unsigned> getIntegerVecAttribute(const Function &F, StringRef Name, std::optional<SmallVector<unsigned>> getIntegerVecAttribute(const Function &F, StringRef Name, unsigned Size); +/// Checks if \p Val is inside \p MD, a !range-like metadata. +bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val); + /// Represents the counter values to wait for in an s_waitcnt instruction. /// /// Large values (including the maximum possible integer) can be used to @@ -1549,6 +1553,7 @@ bool isGFX11Plus(const MCSubtargetInfo &STI); bool isGFX12(const MCSubtargetInfo &STI); bool isGFX12Plus(const MCSubtargetInfo &STI); bool isGFX1250(const MCSubtargetInfo &STI); +bool supportsWGP(const MCSubtargetInfo &STI); bool isNotGFX12Plus(const MCSubtargetInfo &STI); bool isNotGFX11Plus(const MCSubtargetInfo &STI); bool isGCN3Encoding(const MCSubtargetInfo &STI); @@ -1750,15 +1755,22 @@ unsigned getNumFlatOffsetBits(const MCSubtargetInfo &ST); bool isLegalSMRDImmOffset(const MCSubtargetInfo &ST, int64_t ByteOffset); LLVM_READNONE -inline bool isLegalDPALU_DPPControl(unsigned DC) { - return DC >= DPP::ROW_NEWBCAST_FIRST && DC <= DPP::ROW_NEWBCAST_LAST; +inline bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC) { + if (isGFX12(ST)) + return DC >= DPP::ROW_SHARE_FIRST && DC <= DPP::ROW_SHARE_LAST; + if (isGFX90A(ST)) + return DC >= DPP::ROW_NEWBCAST_FIRST && DC <= DPP::ROW_NEWBCAST_LAST; + return false; } /// \returns true if an instruction may have a 64-bit VGPR operand. bool hasAny64BitVGPROperands(const MCInstrDesc &OpDesc); +/// \returns true if an instruction is a DP ALU DPP without any 64-bit operands. +bool isDPALU_DPP32BitOpc(unsigned Opc); + /// \returns true if an instruction is a DP ALU DPP. -bool isDPALU_DPP(const MCInstrDesc &OpDesc); +bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCSubtargetInfo &ST); /// \returns true if the intrinsic is divergent bool isIntrinsicSourceOfDivergence(unsigned IntrID); diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index b128207..11c7275 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -706,7 +706,6 @@ def V_CVT_F16_F8_Fake16_Profile : VOP3_Profile_Fake16<V_CVT_F16_F8_Profile>; let SubtargetPredicate = isGFX12Plus, OtherPredicates = [HasFP8ConversionInsts], mayRaiseFPException = 0, SchedRW = [WriteFloatCvt] in { - // FIXME: This differs from downstream due to changes that haven't been upstreamed yet. let SubtargetPredicate = isGFX12PlusNot12_50 in defm V_CVT_F32_FP8_OP_SEL : VOP1Inst<"v_cvt_f32_fp8_op_sel", VOPProfile_Base_CVT_F_F8_ByteSel<f32>>; let SubtargetPredicate = isGFX125xOnly in @@ -731,7 +730,6 @@ class Cvt_F_F8_Pat_ByteSel<SDPatternOperator node, VOP3_Pseudo inst, bit HasOpSe >; let OtherPredicates = [HasFP8ConversionInsts] in { - // FIXME: This differs from downstream due to changes that haven't been upstreamed yet. 
let SubtargetPredicate = isGFX12PlusNot12_50 in def : Cvt_F_F8_Pat_ByteSel<int_amdgcn_cvt_f32_fp8, V_CVT_F32_FP8_OP_SEL_e64>; let SubtargetPredicate = isGFX125xOnly in { @@ -740,7 +738,6 @@ let OtherPredicates = [HasFP8ConversionInsts] in { def : GCNPat<(int_amdgcn_cvt_f32_fp8_e5m3 i32:$src0, timm:$byte_sel), (V_CVT_F32_FP8_gfx1250_e64 $src0, DSTCLAMP.ENABLE, (as_i32timm $byte_sel))>; } - // FIXME: This differs from downstream due to changes that haven't been upstreamed yet. let SubtargetPredicate = isGFX12Plus in def : Cvt_F_F8_Pat_ByteSel<int_amdgcn_cvt_f32_bf8, V_CVT_F32_BF8_OP_SEL_e64>; } @@ -1058,11 +1055,6 @@ multiclass VOP1Only_Real_gfx11_gfx12<bits<9> op> : multiclass VOP1_Real_FULL_gfx11_gfx12<bits<9> op> : VOP1_Real_FULL<GFX11Gen, op>, VOP1_Real_FULL<GFX12Gen, op>; -multiclass VOP1_Real_NO_DPP_OP_SEL_with_name<GFXGen Gen, bits<9> op, - string opName, string asmName> : - VOP1_Real_e32_with_name<Gen, op, opName, asmName>, - VOP3_Real_with_name<Gen, {0, 1, 1, op{6-0}}, opName, asmName>; - multiclass VOP1_Real_FULL_t16_and_fake16_gfx1250< bits<9> op, string asmName = !tolower(NAME), string opName = NAME> { defm opName#"_t16" : diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index f4b6af6..329d003 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -2084,6 +2084,9 @@ multiclass VOP3_Realtriple_gfx11_gfx12<bits<10> op> : multiclass VOP3_Real_Base_gfx11_gfx12<bits<10> op> : VOP3_Real_Base<GFX11Gen, op>, VOP3_Real_Base<GFX12Gen, op>; +multiclass VOP3_Real_Base_gfx11_gfx12_not_gfx1250<bits<10> op> : + VOP3_Real_Base<GFX11Gen, op>, VOP3_Real_Base<GFX12Not12_50Gen, op>; + multiclass VOP3_Realtriple_with_name_gfx11_gfx12<bits<10> op, string opName, string asmName> : VOP3_Realtriple_with_name<GFX11Gen, op, opName, asmName>, @@ -2211,9 +2214,9 @@ defm V_MUL_F64 : VOP3_Real_Base_gfx11<0x328>; defm V_MIN_F64 : VOP3_Real_Base_gfx11<0x329>; defm V_MAX_F64 : VOP3_Real_Base_gfx11<0x32a>; defm V_LDEXP_F64 : VOP3_Real_Base_gfx11_gfx12<0x32b>; -defm V_MUL_LO_U32 : VOP3_Real_Base_gfx11_gfx12<0x32c>; -defm V_MUL_HI_U32 : VOP3_Real_Base_gfx11_gfx12<0x32d>; -defm V_MUL_HI_I32 : VOP3_Real_Base_gfx11_gfx12<0x32e>; +defm V_MUL_LO_U32 : VOP3_Real_Base_gfx11_gfx12_not_gfx1250<0x32c>; +defm V_MUL_HI_U32 : VOP3_Real_Base_gfx11_gfx12_not_gfx1250<0x32d>; +defm V_MUL_HI_I32 : VOP3_Real_Base_gfx11_gfx12_not_gfx1250<0x32e>; defm V_TRIG_PREOP_F64 : VOP3_Real_Base_gfx11_gfx12<0x32f>; defm V_LSHLREV_B16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x338, "v_lshlrev_b16">; defm V_LSHRREV_B16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x339, "v_lshrrev_b16">; @@ -2242,6 +2245,10 @@ let AssemblerPredicate = isGFX11Plus in { } // These instructions differ from GFX12 variant by supporting DPP: +defm V_MUL_LO_U32 : VOP3Only_Realtriple_gfx1250<0x32c>; +defm V_MUL_HI_U32 : VOP3Only_Realtriple_gfx1250<0x32d>; +defm V_MUL_HI_I32 : VOP3Only_Realtriple_gfx1250<0x32e>; + defm V_PERM_PK16_B4_U4 : VOP3Only_Real_Base_gfx1250<0x23f>; defm V_PERM_PK16_B6_U4 : VOP3Only_Real_Base_gfx1250<0x242>; defm V_PERM_PK16_B8_U4 : VOP3Only_Real_Base_gfx1250<0x243>; diff --git a/llvm/lib/Target/ARM/ARMCallingConv.h b/llvm/lib/Target/ARM/ARMCallingConv.h index 7c692f0..b6b2d59 100644 --- a/llvm/lib/Target/ARM/ARMCallingConv.h +++ b/llvm/lib/Target/ARM/ARMCallingConv.h @@ -19,34 +19,35 @@ namespace llvm { bool CC_ARM_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &State); + 
Type *OrigTy, CCState &State); bool CC_ARM_AAPCS_VFP(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &State); + Type *OrigTy, CCState &State); bool CC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &State); + Type *OrigTy, CCState &State); bool CC_ARM_APCS_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &State); + Type *OrigTy, CCState &State); bool FastCC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &State); + Type *OrigTy, CCState &State); bool CC_ARM_Win32_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State); + ISD::ArgFlagsTy ArgFlags, Type *OrigTy, + CCState &State); bool RetCC_ARM_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &State); + Type *OrigTy, CCState &State); bool RetCC_ARM_AAPCS_VFP(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &State); + Type *OrigTy, CCState &State); bool RetCC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &State); + Type *OrigTy, CCState &State); bool RetFastCC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &State); + Type *OrigTy, CCState &State); } // namespace llvm diff --git a/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp b/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp index ef69083..c53e215 100644 --- a/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp +++ b/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp @@ -109,7 +109,7 @@ namespace { /// NewWaterList - The subset of WaterList that was created since the /// previous iteration by inserting unconditional branches. - SmallSet<MachineBasicBlock*, 4> NewWaterList; + SmallPtrSet<MachineBasicBlock *, 4> NewWaterList; using water_iterator = std::vector<MachineBasicBlock *>::iterator; diff --git a/llvm/lib/Target/ARM/ARMFastISel.cpp b/llvm/lib/Target/ARM/ARMFastISel.cpp index 7ba2487..14e1160 100644 --- a/llvm/lib/Target/ARM/ARMFastISel.cpp +++ b/llvm/lib/Target/ARM/ARMFastISel.cpp @@ -1943,8 +1943,11 @@ bool ARMFastISel::ProcessCallArgs(SmallVectorImpl<Value*> &Args, unsigned &NumBytes, bool isVarArg) { SmallVector<CCValAssign, 16> ArgLocs; + SmallVector<Type *, 16> OrigTys; + for (Value *Arg : Args) + OrigTys.push_back(Arg->getType()); CCState CCInfo(CC, isVarArg, *FuncInfo.MF, ArgLocs, *Context); - CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, + CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, OrigTys, CCAssignFnForCall(CC, false, isVarArg)); // Check that we can handle all of the arguments. If we can't, then bail out @@ -2093,7 +2096,8 @@ bool ARMFastISel::FinishCall(MVT RetVT, SmallVectorImpl<Register> &UsedRegs, if (RetVT != MVT::isVoid) { SmallVector<CCValAssign, 16> RVLocs; CCState CCInfo(CC, isVarArg, *FuncInfo.MF, RVLocs, *Context); - CCInfo.AnalyzeCallResult(RetVT, CCAssignFnForCall(CC, true, isVarArg)); + CCInfo.AnalyzeCallResult(RetVT, I->getType(), + CCAssignFnForCall(CC, true, isVarArg)); // Copy all of the result registers out of their specified physreg. 
if (RVLocs.size() == 2 && RetVT == MVT::f64) { @@ -2278,7 +2282,7 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) { if (RetVT != MVT::isVoid && RetVT != MVT::i32) { SmallVector<CCValAssign, 16> RVLocs; CCState CCInfo(CC, false, *FuncInfo.MF, RVLocs, *Context); - CCInfo.AnalyzeCallResult(RetVT, CCAssignFnForCall(CC, true, false)); + CCInfo.AnalyzeCallResult(RetVT, RetTy, CCAssignFnForCall(CC, true, false)); if (RVLocs.size() >= 2 && RetVT != MVT::f64) return false; } @@ -2389,7 +2393,8 @@ bool ARMFastISel::SelectCall(const Instruction *I, RetVT != MVT::i16 && RetVT != MVT::i32) { SmallVector<CCValAssign, 16> RVLocs; CCState CCInfo(CC, isVarArg, *FuncInfo.MF, RVLocs, *Context); - CCInfo.AnalyzeCallResult(RetVT, CCAssignFnForCall(CC, true, isVarArg)); + CCInfo.AnalyzeCallResult(RetVT, RetTy, + CCAssignFnForCall(CC, true, isVarArg)); if (RVLocs.size() >= 2 && RetVT != MVT::f64) return false; } @@ -2499,6 +2504,7 @@ bool ARMFastISel::SelectCall(const Instruction *I, // Set all unused physreg defs as dead. static_cast<MachineInstr *>(MIB)->setPhysRegsDeadExcept(UsedRegs, TRI); + diagnoseDontCall(*CI); return true; } diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index ea99cc4..8301563 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -587,167 +587,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_, } } - // RTLIB - if (TM.isAAPCS_ABI() && (TT.isTargetAEABI() || TT.isTargetGNUAEABI() || - TT.isTargetMuslAEABI() || TT.isAndroid())) { - // FIXME: This does not depend on the subtarget and should go directly into - // RuntimeLibcalls. This is only here because of missing support for setting - // the calling convention of an implementation. - // clang-format off - static const struct { - const RTLIB::Libcall Op; - const RTLIB::LibcallImpl Impl; - } LibraryCalls[] = { - // Double-precision floating-point arithmetic helper functions - // RTABI chapter 4.1.2, Table 2 - { RTLIB::ADD_F64, RTLIB::__aeabi_dadd }, - { RTLIB::DIV_F64, RTLIB::__aeabi_ddiv }, - { RTLIB::MUL_F64, RTLIB::__aeabi_dmul }, - { RTLIB::SUB_F64, RTLIB::__aeabi_dsub }, - - // Double-precision floating-point comparison helper functions - // RTABI chapter 4.1.2, Table 3 - { RTLIB::OEQ_F64, RTLIB::__aeabi_dcmpeq__oeq }, - { RTLIB::UNE_F64, RTLIB::__aeabi_dcmpeq__une }, - { RTLIB::OLT_F64, RTLIB::__aeabi_dcmplt }, - { RTLIB::OLE_F64, RTLIB::__aeabi_dcmple }, - { RTLIB::OGE_F64, RTLIB::__aeabi_dcmpge }, - { RTLIB::OGT_F64, RTLIB::__aeabi_dcmpgt }, - { RTLIB::UO_F64, RTLIB::__aeabi_dcmpun }, - - // Single-precision floating-point arithmetic helper functions - // RTABI chapter 4.1.2, Table 4 - { RTLIB::ADD_F32, RTLIB::__aeabi_fadd }, - { RTLIB::DIV_F32, RTLIB::__aeabi_fdiv }, - { RTLIB::MUL_F32, RTLIB::__aeabi_fmul }, - { RTLIB::SUB_F32, RTLIB::__aeabi_fsub }, - - // Single-precision floating-point comparison helper functions - // RTABI chapter 4.1.2, Table 5 - { RTLIB::OEQ_F32, RTLIB::__aeabi_fcmpeq__oeq }, - { RTLIB::UNE_F32, RTLIB::__aeabi_fcmpeq__une }, - { RTLIB::OLT_F32, RTLIB::__aeabi_fcmplt}, - { RTLIB::OLE_F32, RTLIB::__aeabi_fcmple }, - { RTLIB::OGE_F32, RTLIB::__aeabi_fcmpge }, - { RTLIB::OGT_F32, RTLIB::__aeabi_fcmpgt }, - { RTLIB::UO_F32, RTLIB::__aeabi_fcmpun }, - - // Floating-point to integer conversions. 
- // RTABI chapter 4.1.2, Table 6 - { RTLIB::FPTOSINT_F64_I32, RTLIB::__aeabi_d2iz }, - { RTLIB::FPTOUINT_F64_I32, RTLIB::__aeabi_d2uiz }, - { RTLIB::FPTOSINT_F64_I64, RTLIB::__aeabi_d2lz }, - { RTLIB::FPTOUINT_F64_I64, RTLIB::__aeabi_d2ulz }, - { RTLIB::FPTOSINT_F32_I32, RTLIB::__aeabi_f2iz }, - { RTLIB::FPTOUINT_F32_I32, RTLIB::__aeabi_f2uiz }, - { RTLIB::FPTOSINT_F32_I64, RTLIB::__aeabi_f2lz }, - { RTLIB::FPTOUINT_F32_I64, RTLIB::__aeabi_f2ulz }, - - // Conversions between floating types. - // RTABI chapter 4.1.2, Table 7 - { RTLIB::FPROUND_F64_F32, RTLIB::__aeabi_d2f }, - { RTLIB::FPROUND_F64_F16, RTLIB::__aeabi_d2h }, - { RTLIB::FPEXT_F32_F64, RTLIB::__aeabi_f2d }, - - // Integer to floating-point conversions. - // RTABI chapter 4.1.2, Table 8 - { RTLIB::SINTTOFP_I32_F64, RTLIB::__aeabi_i2d }, - { RTLIB::UINTTOFP_I32_F64, RTLIB::__aeabi_ui2d }, - { RTLIB::SINTTOFP_I64_F64, RTLIB::__aeabi_l2d }, - { RTLIB::UINTTOFP_I64_F64, RTLIB::__aeabi_ul2d }, - { RTLIB::SINTTOFP_I32_F32, RTLIB::__aeabi_i2f }, - { RTLIB::UINTTOFP_I32_F32, RTLIB::__aeabi_ui2f }, - { RTLIB::SINTTOFP_I64_F32, RTLIB::__aeabi_l2f }, - { RTLIB::UINTTOFP_I64_F32, RTLIB::__aeabi_ul2f }, - - // Long long helper functions - // RTABI chapter 4.2, Table 9 - { RTLIB::MUL_I64, RTLIB::__aeabi_lmul }, - { RTLIB::SHL_I64, RTLIB::__aeabi_llsl }, - { RTLIB::SRL_I64, RTLIB::__aeabi_llsr }, - { RTLIB::SRA_I64, RTLIB::__aeabi_lasr }, - - // Integer division functions - // RTABI chapter 4.3.1 - { RTLIB::SDIV_I32, RTLIB::__aeabi_idiv }, - { RTLIB::SDIV_I64, RTLIB::__aeabi_ldivmod }, - { RTLIB::UDIV_I32, RTLIB::__aeabi_uidiv }, - { RTLIB::UDIV_I64, RTLIB::__aeabi_uldivmod }, - }; - // clang-format on - - for (const auto &LC : LibraryCalls) - setLibcallImpl(LC.Op, LC.Impl); - - // EABI dependent RTLIB - if (TM.Options.EABIVersion == EABI::EABI4 || - TM.Options.EABIVersion == EABI::EABI5) { - static const struct { - const RTLIB::Libcall Op; - const RTLIB::LibcallImpl Impl; - } MemOpsLibraryCalls[] = { - // Memory operations - // RTABI chapter 4.3.4 - {RTLIB::MEMCPY, RTLIB::__aeabi_memcpy}, - {RTLIB::MEMMOVE, RTLIB::__aeabi_memmove}, - {RTLIB::MEMSET, RTLIB::__aeabi_memset}, - {RTLIB::AEABI_MEMCPY4, RTLIB::__aeabi_memcpy4}, - {RTLIB::AEABI_MEMCPY8, RTLIB::__aeabi_memcpy8}, - {RTLIB::AEABI_MEMMOVE4, RTLIB::__aeabi_memmove4}, - {RTLIB::AEABI_MEMMOVE8, RTLIB::__aeabi_memmove8}, - {RTLIB::AEABI_MEMSET4, RTLIB::__aeabi_memset4}, - {RTLIB::AEABI_MEMSET8, RTLIB::__aeabi_memset8}, - {RTLIB::AEABI_MEMCLR, RTLIB::__aeabi_memclr}, - {RTLIB::AEABI_MEMCLR4, RTLIB::__aeabi_memclr4}, - {RTLIB::AEABI_MEMCLR8, RTLIB::__aeabi_memclr8}, - }; - - for (const auto &LC : MemOpsLibraryCalls) - setLibcallImpl(LC.Op, LC.Impl); - } - } - - // The half <-> float conversion functions are always soft-float on - // non-watchos platforms, but are needed for some targets which use a - // hard-float calling convention by default. 
- if (!TT.isWatchABI()) { - if (TM.isAAPCS_ABI()) { - setLibcallImplCallingConv(RTLIB::__truncsfhf2, CallingConv::ARM_AAPCS); - setLibcallImplCallingConv(RTLIB::__truncdfhf2, CallingConv::ARM_AAPCS); - setLibcallImplCallingConv(RTLIB::__extendhfsf2, CallingConv::ARM_AAPCS); - setLibcallImplCallingConv(RTLIB::__gnu_h2f_ieee, CallingConv::ARM_AAPCS); - setLibcallImplCallingConv(RTLIB::__gnu_f2h_ieee, CallingConv::ARM_AAPCS); - } else { - setLibcallImplCallingConv(RTLIB::__truncsfhf2, CallingConv::ARM_APCS); - setLibcallImplCallingConv(RTLIB::__truncdfhf2, CallingConv::ARM_APCS); - setLibcallImplCallingConv(RTLIB::__extendhfsf2, CallingConv::ARM_APCS); - setLibcallImplCallingConv(RTLIB::__gnu_h2f_ieee, CallingConv::ARM_APCS); - setLibcallImplCallingConv(RTLIB::__gnu_f2h_ieee, CallingConv::ARM_APCS); - } - } - - // In EABI, these functions have an __aeabi_ prefix, but in GNUEABI they have - // a __gnu_ prefix (which is the default). - if (TT.isTargetAEABI()) { - // FIXME: This does not depend on the subtarget and should go directly into - // RuntimeLibcalls. This is only here because of missing support for setting - // the calling convention of an implementation. - static const struct { - const RTLIB::Libcall Op; - const RTLIB::LibcallImpl Impl; - } LibraryCalls[] = { - {RTLIB::FPROUND_F32_F16, RTLIB::__aeabi_f2h}, - {RTLIB::FPEXT_F16_F32, RTLIB::__aeabi_h2f}, - }; - - for (const auto &LC : LibraryCalls) { - setLibcallImpl(LC.Op, LC.Impl); - } - } else if (!TT.isOSBinFormatMachO()) { - setLibcallImpl(RTLIB::FPROUND_F32_F16, RTLIB::__gnu_f2h_ieee); - setLibcallImpl(RTLIB::FPEXT_F16_F32, RTLIB::__gnu_h2f_ieee); - } - if (Subtarget->isThumb1Only()) addRegisterClass(MVT::i32, &ARM::tGPRRegClass); else @@ -802,6 +641,12 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_, setOperationAction(ISD::BSWAP, VT, Expand); } + if (!Subtarget->isThumb1Only() && !Subtarget->hasV8_1MMainlineOps()) + setOperationAction(ISD::SCMP, MVT::i32, Custom); + + if (!Subtarget->hasV8_1MMainlineOps()) + setOperationAction(ISD::UCMP, MVT::i32, Custom); + setOperationAction(ISD::ConstantFP, MVT::f32, Custom); setOperationAction(ISD::ConstantFP, MVT::f64, Custom); @@ -1634,6 +1479,10 @@ bool ARMTargetLowering::useSoftFloat() const { return Subtarget->useSoftFloat(); } +bool ARMTargetLowering::shouldExpandCmpUsingSelects(EVT VT) const { + return !Subtarget->isThumb1Only() && VT.getSizeInBits() <= 32; +} + // FIXME: It might make sense to define the representative register class as the // nearest super-register that has a non-null superset. For example, DPR_VFP2 is // a super-register of SPR, and DPR is a superset if DPR_VFP2. Consequently, @@ -3769,10 +3618,7 @@ ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, // call __tls_get_addr. ArgListTy Args; - ArgListEntry Entry; - Entry.Node = Argument; - Entry.Ty = (Type *) Type::getInt32Ty(*DAG.getContext()); - Args.push_back(Entry); + Args.emplace_back(Argument, Type::getInt32Ty(*DAG.getContext())); // FIXME: is there useful debug info available here? 
TargetLowering::CallLoweringInfo CLI(DAG); @@ -7396,7 +7242,7 @@ static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { return false; unsigned NumElts = VT.getVectorNumElements(); - if (M.size() != NumElts && M.size() != NumElts*2) + if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0) return false; // If the mask is twice as long as the input vector then we need to check the @@ -7428,7 +7274,7 @@ static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){ return false; unsigned NumElts = VT.getVectorNumElements(); - if (M.size() != NumElts && M.size() != NumElts*2) + if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0) return false; for (unsigned i = 0; i < M.size(); i += NumElts) { @@ -7531,7 +7377,7 @@ static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { return false; unsigned NumElts = VT.getVectorNumElements(); - if (M.size() != NumElts && M.size() != NumElts*2) + if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0) return false; for (unsigned i = 0; i < M.size(); i += NumElts) { @@ -7564,7 +7410,7 @@ static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){ return false; unsigned NumElts = VT.getVectorNumElements(); - if (M.size() != NumElts && M.size() != NumElts*2) + if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0) return false; for (unsigned i = 0; i < M.size(); i += NumElts) { @@ -9991,9 +9837,7 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false); SRet = DAG.getFrameIndex(FrameIdx, getPointerTy(DL)); - ArgListEntry Entry; - Entry.Node = SRet; - Entry.Ty = PointerType::getUnqual(RetTy->getContext()); + ArgListEntry Entry(SRet, PointerType::getUnqual(RetTy->getContext())); Entry.IsSExt = false; Entry.IsZExt = false; Entry.IsSRet = true; @@ -10001,12 +9845,7 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { RetTy = Type::getVoidTy(*DAG.getContext()); } - ArgListEntry Entry; - Entry.Node = Arg; - Entry.Ty = ArgTy; - Entry.IsSExt = false; - Entry.IsZExt = false; - Args.push_back(Entry); + Args.emplace_back(Arg, ArgTy); RTLIB::Libcall LC = (ArgVT == MVT::f64) ? 
RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32; @@ -10059,10 +9898,9 @@ SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG, ARMTargetLowering::ArgListTy Args; for (auto AI : {1, 0}) { - ArgListEntry Arg; - Arg.Node = Op.getOperand(AI); - Arg.Ty = Arg.Node.getValueType().getTypeForEVT(*DAG.getContext()); - Args.push_back(Arg); + SDValue Operand = Op.getOperand(AI); + Args.emplace_back(Operand, + Operand.getValueType().getTypeForEVT(*DAG.getContext())); } CallLoweringInfo CLI(DAG); @@ -10612,6 +10450,133 @@ SDValue ARMTargetLowering::LowerFP_TO_BF16(SDValue Op, return DAG.getBitcast(MVT::i32, Res); } +SDValue ARMTargetLowering::LowerCMP(SDValue Op, SelectionDAG &DAG) const { + SDLoc dl(Op); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + + // Determine if this is signed or unsigned comparison + bool IsSigned = (Op.getOpcode() == ISD::SCMP); + + // Special case for Thumb1 UCMP only + if (!IsSigned && Subtarget->isThumb1Only()) { + // For Thumb unsigned comparison, use this sequence: + // subs r2, r0, r1 ; r2 = LHS - RHS, sets flags + // sbc r2, r2 ; r2 = r2 - r2 - !carry + // cmp r1, r0 ; compare RHS with LHS + // sbc r1, r1 ; r1 = r1 - r1 - !carry + // subs r0, r2, r1 ; r0 = r2 - r1 (final result) + + // First subtraction: LHS - RHS + SDValue Sub1WithFlags = DAG.getNode( + ARMISD::SUBC, dl, DAG.getVTList(MVT::i32, FlagsVT), LHS, RHS); + SDValue Sub1Result = Sub1WithFlags.getValue(0); + SDValue Flags1 = Sub1WithFlags.getValue(1); + + // SUBE: Sub1Result - Sub1Result - !carry + // This gives 0 if LHS >= RHS (unsigned), -1 if LHS < RHS (unsigned) + SDValue Sbc1 = + DAG.getNode(ARMISD::SUBE, dl, DAG.getVTList(MVT::i32, FlagsVT), + Sub1Result, Sub1Result, Flags1); + SDValue Sbc1Result = Sbc1.getValue(0); + + // Second comparison: RHS vs LHS (reverse comparison) + SDValue CmpFlags = DAG.getNode(ARMISD::CMP, dl, FlagsVT, RHS, LHS); + + // SUBE: RHS - RHS - !carry + // This gives 0 if RHS <= LHS (unsigned), -1 if RHS > LHS (unsigned) + SDValue Sbc2 = DAG.getNode( + ARMISD::SUBE, dl, DAG.getVTList(MVT::i32, FlagsVT), RHS, RHS, CmpFlags); + SDValue Sbc2Result = Sbc2.getValue(0); + + // Final subtraction: Sbc1Result - Sbc2Result (no flags needed) + SDValue Result = + DAG.getNode(ISD::SUB, dl, MVT::i32, Sbc1Result, Sbc2Result); + if (Op.getValueType() != MVT::i32) + Result = DAG.getSExtOrTrunc(Result, dl, Op.getValueType()); + + return Result; + } + + // For the ARM assembly pattern: + // subs r0, r0, r1 ; subtract RHS from LHS and set flags + // movgt r0, #1 ; if LHS > RHS, set result to 1 (GT for signed, HI for + // unsigned) mvnlt r0, #0 ; if LHS < RHS, set result to -1 (LT for + // signed, LO for unsigned) + // ; if LHS == RHS, result remains 0 from the subs + + // Optimization: if RHS is a subtraction against 0, use ADDC instead of SUBC + unsigned Opcode = ARMISD::SUBC; + + // Check if RHS is a subtraction against 0: (0 - X) + if (RHS.getOpcode() == ISD::SUB) { + SDValue SubLHS = RHS.getOperand(0); + SDValue SubRHS = RHS.getOperand(1); + + // Check if it's 0 - X + if (isNullConstant(SubLHS)) { + bool CanUseAdd = false; + if (IsSigned) { + // For SCMP: only if X is known to never be INT_MIN (to avoid overflow) + if (RHS->getFlags().hasNoSignedWrap() || !DAG.computeKnownBits(SubRHS) + .getSignedMinValue() + .isMinSignedValue()) { + CanUseAdd = true; + } + } else { + // For UCMP: only if X is known to never be zero + if (DAG.isKnownNeverZero(SubRHS)) { + CanUseAdd = true; + } + } + + if (CanUseAdd) { + Opcode = ARMISD::ADDC; + RHS = SubRHS; // 
Replace RHS with X, so we do LHS + X instead of + // LHS - (0 - X) + } + } + } + + // Generate the operation with flags + SDValue OpWithFlags; + if (Opcode == ARMISD::ADDC) { + // Use ADDC: LHS + RHS (where RHS was 0 - X, now X) + OpWithFlags = DAG.getNode(ARMISD::ADDC, dl, + DAG.getVTList(MVT::i32, FlagsVT), LHS, RHS); + } else { + // Use ARMISD::SUBC to generate SUBS instruction (subtract with flags) + OpWithFlags = DAG.getNode(ARMISD::SUBC, dl, + DAG.getVTList(MVT::i32, FlagsVT), LHS, RHS); + } + + SDValue OpResult = OpWithFlags.getValue(0); // The operation result + SDValue Flags = OpWithFlags.getValue(1); // The flags + + // Constants for conditional moves + SDValue One = DAG.getConstant(1, dl, MVT::i32); + SDValue MinusOne = DAG.getAllOnesConstant(dl, MVT::i32); + + // Select condition codes based on signed vs unsigned + ARMCC::CondCodes GTCond = IsSigned ? ARMCC::GT : ARMCC::HI; + ARMCC::CondCodes LTCond = IsSigned ? ARMCC::LT : ARMCC::LO; + + // First conditional move: if greater than, set to 1 + SDValue GTCondValue = DAG.getConstant(GTCond, dl, MVT::i32); + SDValue Result1 = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, OpResult, One, + GTCondValue, Flags); + + // Second conditional move: if less than, set to -1 + SDValue LTCondValue = DAG.getConstant(LTCond, dl, MVT::i32); + SDValue Result2 = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, Result1, MinusOne, + LTCondValue, Flags); + + if (Op.getValueType() != MVT::i32) + Result2 = DAG.getSExtOrTrunc(Result2, dl, Op.getValueType()); + + return Result2; +} + SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { LLVM_DEBUG(dbgs() << "Lowering node: "; Op.dump()); switch (Op.getOpcode()) { @@ -10740,6 +10705,9 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::FP_TO_BF16: return LowerFP_TO_BF16(Op, DAG); case ARMISD::WIN__DBZCHK: return SDValue(); + case ISD::UCMP: + case ISD::SCMP: + return LowerCMP(Op, DAG); } } @@ -20627,12 +20595,10 @@ static TargetLowering::ArgListTy getDivRemArgList( bool isSigned = N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::SREM; TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { EVT ArgVT = N->getOperand(i).getValueType(); Type *ArgTy = ArgVT.getTypeForEVT(*Context); - Entry.Node = N->getOperand(i); - Entry.Ty = ArgTy; + TargetLowering::ArgListEntry Entry(N->getOperand(i), ArgTy); Entry.IsSExt = isSigned; Entry.IsZExt = !isSigned; Args.push_back(Entry); @@ -21605,7 +21571,7 @@ unsigned ARMTargetLowering::getMaxSupportedInterleaveFactor() const { /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1 bool ARMTargetLowering::lowerInterleavedLoad( Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles, - ArrayRef<unsigned> Indices, unsigned Factor) const { + ArrayRef<unsigned> Indices, unsigned Factor, const APInt &GapMask) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); assert(!Shuffles.empty() && "Empty shufflevector input"); @@ -21615,7 +21581,7 @@ bool ARMTargetLowering::lowerInterleavedLoad( auto *LI = dyn_cast<LoadInst>(Load); if (!LI) return false; - assert(!Mask && "Unexpected mask on a load"); + assert(!Mask && GapMask.popcount() == Factor && "Unexpected mask on a load"); auto *VecTy = cast<FixedVectorType>(Shuffles[0]->getType()); Type *EltTy = VecTy->getElementType(); diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index 
825145d..778595e 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -607,6 +607,8 @@ class VectorType; bool preferZeroCompareBranch() const override { return true; } + bool shouldExpandCmpUsingSelects(EVT VT) const override; + bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override; bool hasAndNotCompare(SDValue V) const override { @@ -683,8 +685,8 @@ class VectorType; bool lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles, - ArrayRef<unsigned> Indices, - unsigned Factor) const override; + ArrayRef<unsigned> Indices, unsigned Factor, + const APInt &GapMask) const override; bool lowerInterleavedStore(Instruction *Store, Value *Mask, ShuffleVectorInst *SVI, unsigned Factor) const override; @@ -904,6 +906,7 @@ class VectorType; void LowerLOAD(SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const; SDValue LowerFP_TO_BF16(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerCMP(SDValue Op, SelectionDAG &DAG) const; Register getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const override; diff --git a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp index b4677a8..ebfa593 100644 --- a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp +++ b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp @@ -89,19 +89,15 @@ SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall( AlignVariant = ALIGN1; TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; - Entry.Ty = DAG.getDataLayout().getIntPtrType(*DAG.getContext()); - Entry.Node = Dst; - Args.push_back(Entry); + Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext()); + Args.emplace_back(Dst, IntPtrTy); if (AEABILibcall == AEABI_MEMCLR) { - Entry.Node = Size; - Args.push_back(Entry); + Args.emplace_back(Size, IntPtrTy); } else if (AEABILibcall == AEABI_MEMSET) { // Adjust parameters for memset, EABI uses format (ptr, size, value), // GNU library uses (ptr, value, size) // See RTABI section 4.3.4 - Entry.Node = Size; - Args.push_back(Entry); + Args.emplace_back(Size, IntPtrTy); // Extend or truncate the argument to be an i32 value for the call. 
if (Src.getValueType().bitsGT(MVT::i32)) @@ -109,16 +105,13 @@ SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall( else if (Src.getValueType().bitsLT(MVT::i32)) Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Src); - Entry.Node = Src; - Entry.Ty = Type::getInt32Ty(*DAG.getContext()); + TargetLowering::ArgListEntry Entry(Src, + Type::getInt32Ty(*DAG.getContext())); Entry.IsSExt = false; Args.push_back(Entry); } else { - Entry.Node = Src; - Args.push_back(Entry); - - Entry.Node = Size; - Args.push_back(Entry); + Args.emplace_back(Src, IntPtrTy); + Args.emplace_back(Size, IntPtrTy); } static const RTLIB::Libcall FunctionImpls[4][3] = { diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp index 6f37eca..6b28541 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -1084,9 +1084,10 @@ InstructionCost ARMTTIImpl::getCmpSelInstrCost( CostKind, Op1Info, Op2Info, I); } -InstructionCost ARMTTIImpl::getAddressComputationCost(Type *Ty, - ScalarEvolution *SE, - const SCEV *Ptr) const { +InstructionCost +ARMTTIImpl::getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, + const SCEV *Ptr, + TTI::TargetCostKind CostKind) const { // Address computations in vectorized code with non-consecutive addresses will // likely result in more instructions compared to scalar code where the // computation can more often be merged into the index mode. The resulting @@ -1095,7 +1096,7 @@ InstructionCost ARMTTIImpl::getAddressComputationCost(Type *Ty, int MaxMergeDistance = 64; if (ST->hasNEON()) { - if (Ty->isVectorTy() && SE && + if (PtrTy->isVectorTy() && SE && !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1)) return NumVectorInstToHideOverhead; @@ -1103,7 +1104,7 @@ InstructionCost ARMTTIImpl::getAddressComputationCost(Type *Ty, // addressing mode. return 1; } - return BaseT::getAddressComputationCost(Ty, SE, Ptr); + return BaseT::getAddressComputationCost(PtrTy, SE, Ptr, CostKind); } bool ARMTTIImpl::isProfitableLSRChainElement(Instruction *I) const { @@ -1335,6 +1336,39 @@ InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, if (!Mask.empty()) { std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy); + // Check for LD2/LD4 instructions, which are represented in llvm IR as + // deinterleaving-shuffle(load). The shuffle cost could potentially be + // free, but we model it with a cost of LT.first so that LD2/LD4 have a + // higher cost than just the load. + if (Args.size() >= 1 && isa<LoadInst>(Args[0]) && + (LT.second.getScalarSizeInBits() == 8 || + LT.second.getScalarSizeInBits() == 16 || + LT.second.getScalarSizeInBits() == 32) && + LT.second.getSizeInBits() == 128 && + ((TLI->getMaxSupportedInterleaveFactor() >= 2 && + ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, 2)) || + (TLI->getMaxSupportedInterleaveFactor() == 4 && + ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, 4)))) + return ST->getMVEVectorCostFactor(CostKind) * + std::max<InstructionCost>(1, LT.first / 4); + + // Check for ST2/ST4 instructions, which are represented in llvm IR as + // store(interleaving-shuffle). The shuffle cost could potentially be + // free, but we model it with a cost of LT.first so that ST2/ST4 have a + // higher cost than just the store. 
+ if (CxtI && CxtI->hasOneUse() && isa<StoreInst>(*CxtI->user_begin()) && + (LT.second.getScalarSizeInBits() == 8 || + LT.second.getScalarSizeInBits() == 16 || + LT.second.getScalarSizeInBits() == 32) && + LT.second.getSizeInBits() == 128 && + ((TLI->getMaxSupportedInterleaveFactor() >= 2 && + ShuffleVectorInst::isInterleaveMask( + Mask, 2, SrcTy->getElementCount().getKnownMinValue() * 2)) || + (TLI->getMaxSupportedInterleaveFactor() == 4 && + ShuffleVectorInst::isInterleaveMask( + Mask, 4, SrcTy->getElementCount().getKnownMinValue() * 2)))) + return ST->getMVEVectorCostFactor(CostKind) * LT.first; + if (LT.second.isVector() && Mask.size() <= LT.second.getVectorNumElements() && (isVREVMask(Mask, LT.second, 16) || isVREVMask(Mask, LT.second, 32) || diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h index 522c235..cdd8bcb 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -257,8 +257,9 @@ public: unsigned Index, const Value *Op0, const Value *Op1) const override; - InstructionCost getAddressComputationCost(Type *Val, ScalarEvolution *SE, - const SCEV *Ptr) const override; + InstructionCost + getAddressComputationCost(Type *Val, ScalarEvolution *SE, const SCEV *Ptr, + TTI::TargetCostKind CostKind) const override; InstructionCost getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, diff --git a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index ece6c10..0e97483 100644 --- a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -3373,12 +3373,12 @@ public: void addMSRMaskOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::createImm(unsigned(getMSRMask()))); + Inst.addOperand(MCOperand::createImm(getMSRMask())); } void addBankedRegOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::createImm(unsigned(getBankedReg()))); + Inst.addOperand(MCOperand::createImm(getBankedReg())); } void addProcIFlagsOperands(MCInst &Inst, unsigned N) const { diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp index 8ee3a2d..a5266a9 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp @@ -20,7 +20,6 @@ #include "llvm/MC/MCSection.h" #include "llvm/MC/MCSymbolMachO.h" #include "llvm/MC/MCValue.h" -#include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" using namespace llvm; diff --git a/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp b/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp index 0b4e7df..5eeb4fe 100644 --- a/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp +++ b/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp @@ -922,7 +922,7 @@ bool MVETPAndVPTOptimisations::ReplaceConstByVPNOTs(MachineBasicBlock &MBB, // the function. unsigned LastVPTImm = 0; Register LastVPTReg = 0; - SmallSet<MachineInstr *, 4> DeadInstructions; + SmallPtrSet<MachineInstr *, 4> DeadInstructions; for (MachineInstr &Instr : MBB.instrs()) { // Look for predicated MVE instructions. 
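A minimal reference sketch (not part of any patch in this diff) of the Thumb1 ISD::UCMP lowering added to ARMISelLowering.cpp above: each SBC turns the borrow of the preceding compare into 0 or -1, and the final subtraction combines the two borrows into the -1/0/+1 result. The helper name below is hypothetical.

#include <cstdint>

// Reference model of the subs/sbc/cmp/sbc/subs sequence, assuming ordinary
// unsigned 32-bit comparisons.
static int32_t thumb1_ucmp_ref(uint32_t lhs, uint32_t rhs) {
  int32_t lt = lhs < rhs ? -1 : 0; // sbc after "subs r2, r0, r1"
  int32_t gt = rhs < lhs ? -1 : 0; // sbc after "cmp r1, r0"
  return lt - gt;                  // -1 if lhs < rhs, 0 if equal, +1 if lhs > rhs
}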
diff --git a/llvm/lib/Target/AVR/AVRISelLowering.cpp b/llvm/lib/Target/AVR/AVRISelLowering.cpp index 25ad9ec..545bc3a 100644 --- a/llvm/lib/Target/AVR/AVRISelLowering.cpp +++ b/llvm/lib/Target/AVR/AVRISelLowering.cpp @@ -505,10 +505,9 @@ SDValue AVRTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const { SDValue InChain = DAG.getEntryNode(); TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; for (SDValue const &Value : Op->op_values()) { - Entry.Node = Value; - Entry.Ty = Value.getValueType().getTypeForEVT(*DAG.getContext()); + TargetLowering::ArgListEntry Entry( + Value, Value.getValueType().getTypeForEVT(*DAG.getContext())); Entry.IsSExt = IsSigned; Entry.IsZExt = !IsSigned; Args.push_back(Entry); diff --git a/llvm/lib/Target/AVR/AVRISelLowering.h b/llvm/lib/Target/AVR/AVRISelLowering.h index 2ae22b2..301ce9c 100644 --- a/llvm/lib/Target/AVR/AVRISelLowering.h +++ b/llvm/lib/Target/AVR/AVRISelLowering.h @@ -94,6 +94,8 @@ public: return ShiftLegalizationStrategy::LowerToLibcall; } + bool softPromoteHalfType() const override { return true; } + private: SDValue getAVRCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue &AVRcc, SelectionDAG &DAG, SDLoc dl) const; diff --git a/llvm/lib/Target/AVR/AVRTargetMachine.cpp b/llvm/lib/Target/AVR/AVRTargetMachine.cpp index b75417a..fbd1484 100644 --- a/llvm/lib/Target/AVR/AVRTargetMachine.cpp +++ b/llvm/lib/Target/AVR/AVRTargetMachine.cpp @@ -20,6 +20,7 @@ #include "AVR.h" #include "AVRMachineFunctionInfo.h" #include "AVRTargetObjectFile.h" +#include "AVRTargetTransformInfo.h" #include "MCTargetDesc/AVRMCTargetDesc.h" #include "TargetInfo/AVRTargetInfo.h" @@ -28,7 +29,7 @@ namespace llvm { static const char *AVRDataLayout = - "e-P1-p:16:8-i8:8-i16:8-i32:8-i64:8-f32:8-f64:8-n8-a:8"; + "e-P1-p:16:8-i8:8-i16:8-i32:8-i64:8-f32:8-f64:8-n8:16-a:8"; /// Processes a CPU name. 
static StringRef getCPU(StringRef CPU) { @@ -62,7 +63,9 @@ namespace { class AVRPassConfig : public TargetPassConfig { public: AVRPassConfig(AVRTargetMachine &TM, PassManagerBase &PM) - : TargetPassConfig(TM, PM) {} + : TargetPassConfig(TM, PM) { + EnableLoopTermFold = true; + } AVRTargetMachine &getAVRTargetMachine() const { return getTM<AVRTargetMachine>(); @@ -107,6 +110,11 @@ const AVRSubtarget *AVRTargetMachine::getSubtargetImpl(const Function &) const { return &SubTarget; } +TargetTransformInfo +AVRTargetMachine::getTargetTransformInfo(const Function &F) const { + return TargetTransformInfo(std::make_unique<AVRTTIImpl>(this, F)); +} + MachineFunctionInfo *AVRTargetMachine::createMachineFunctionInfo( BumpPtrAllocator &Allocator, const Function &F, const TargetSubtargetInfo *STI) const { diff --git a/llvm/lib/Target/AVR/AVRTargetMachine.h b/llvm/lib/Target/AVR/AVRTargetMachine.h index 167d007..9452b3d 100644 --- a/llvm/lib/Target/AVR/AVRTargetMachine.h +++ b/llvm/lib/Target/AVR/AVRTargetMachine.h @@ -48,6 +48,8 @@ public: createMachineFunctionInfo(BumpPtrAllocator &Allocator, const Function &F, const TargetSubtargetInfo *STI) const override; + TargetTransformInfo getTargetTransformInfo(const Function &F) const override; + bool isNoopAddrSpaceCast(unsigned SrcAs, unsigned DestAs) const override { // While AVR has different address spaces, they are all represented by // 16-bit pointers that can be freely casted between (of course, a pointer diff --git a/llvm/lib/Target/AVR/AVRTargetTransformInfo.cpp b/llvm/lib/Target/AVR/AVRTargetTransformInfo.cpp new file mode 100644 index 0000000..b1ef380 --- /dev/null +++ b/llvm/lib/Target/AVR/AVRTargetTransformInfo.cpp @@ -0,0 +1,24 @@ +//===-- AVRTargetTransformInfo.cpp - AVR specific TTI ---------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "AVRTargetTransformInfo.h" + +using namespace llvm; + +bool AVRTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1, + const TargetTransformInfo::LSRCost &C2) const { + // The AVR-specific rule here is that the instruction count has first priority. + // If we need to emit adds inside the loop to add up base registers, then + // we need at least one extra temporary register. + unsigned C1NumRegs = C1.NumRegs + (C1.NumBaseAdds != 0); + unsigned C2NumRegs = C2.NumRegs + (C2.NumBaseAdds != 0); + return std::tie(C1.Insns, C1NumRegs, C1.AddRecCost, C1.NumIVMuls, + C1.NumBaseAdds, C1.ScaleCost, C1.ImmCost, C1.SetupCost) < + std::tie(C2.Insns, C2NumRegs, C2.AddRecCost, C2.NumIVMuls, + C2.NumBaseAdds, C2.ScaleCost, C2.ImmCost, C2.SetupCost); +} diff --git a/llvm/lib/Target/AVR/AVRTargetTransformInfo.h b/llvm/lib/Target/AVR/AVRTargetTransformInfo.h new file mode 100644 index 0000000..0daeeb8 --- /dev/null +++ b/llvm/lib/Target/AVR/AVRTargetTransformInfo.h @@ -0,0 +1,51 @@ +//===- AVRTargetTransformInfo.h - AVR specific TTI --------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// This file defines a TargetTransformInfoImplBase conforming object specific +/// to the AVR target machine. It uses the target's detailed information to +/// provide more precise answers to certain TTI queries, while letting the +/// target independent and default TTI implementations handle the rest. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AVR_AVRTARGETTRANSFORMINFO_H +#define LLVM_LIB_TARGET_AVR_AVRTARGETTRANSFORMINFO_H + +#include "AVRSubtarget.h" +#include "AVRTargetMachine.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/CodeGen/BasicTTIImpl.h" +#include "llvm/IR/Function.h" +#include <optional> + +namespace llvm { + +class AVRTTIImpl final : public BasicTTIImplBase<AVRTTIImpl> { + using BaseT = BasicTTIImplBase<AVRTTIImpl>; + using TTI = TargetTransformInfo; + + friend BaseT; + + const AVRSubtarget *ST; + const AVRTargetLowering *TLI; + + const AVRSubtarget *getST() const { return ST; } + const AVRTargetLowering *getTLI() const { return TLI; } + +public: + explicit AVRTTIImpl(const AVRTargetMachine *TM, const Function &F) + : BaseT(TM, F.getDataLayout()), ST(TM->getSubtargetImpl(F)), + TLI(ST->getTargetLowering()) {} + + bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, + const TargetTransformInfo::LSRCost &C2) const override; +}; + +} // end namespace llvm + +#endif // LLVM_LIB_TARGET_AVR_AVRTARGETTRANSFORMINFO_H diff --git a/llvm/lib/Target/AVR/CMakeLists.txt b/llvm/lib/Target/AVR/CMakeLists.txt index 781dac0..a31c545 100644 --- a/llvm/lib/Target/AVR/CMakeLists.txt +++ b/llvm/lib/Target/AVR/CMakeLists.txt @@ -29,11 +29,13 @@ add_llvm_target(AVRCodeGen AVRSubtarget.cpp AVRTargetMachine.cpp AVRTargetObjectFile.cpp + AVRTargetTransformInfo.cpp DEPENDS intrinsics_gen LINK_COMPONENTS + Analysis AVRDesc AVRInfo AsmPrinter @@ -44,6 +46,8 @@ add_llvm_target(AVRCodeGen SelectionDAG Support Target + TargetParser + TransformUtils ADD_TO_COMPONENT AVR diff --git a/llvm/lib/Target/CSKY/CSKYConstantIslandPass.cpp b/llvm/lib/Target/CSKY/CSKYConstantIslandPass.cpp index e55d9b2..7885d93 100644 --- a/llvm/lib/Target/CSKY/CSKYConstantIslandPass.cpp +++ b/llvm/lib/Target/CSKY/CSKYConstantIslandPass.cpp @@ -116,7 +116,7 @@ class CSKYConstantIslands : public MachineFunctionPass { /// NewWaterList - The subset of WaterList that was created since the /// previous iteration by inserting unconditional branches. - SmallSet<MachineBasicBlock *, 4> NewWaterList; + SmallPtrSet<MachineBasicBlock *, 4> NewWaterList; using water_iterator = std::vector<MachineBasicBlock *>::iterator; diff --git a/llvm/lib/Target/CSKY/CSKYISelLowering.cpp b/llvm/lib/Target/CSKY/CSKYISelLowering.cpp index 7070171..e5b4f6e 100644 --- a/llvm/lib/Target/CSKY/CSKYISelLowering.cpp +++ b/llvm/lib/Target/CSKY/CSKYISelLowering.cpp @@ -1329,10 +1329,7 @@ SDValue CSKYTargetLowering::getDynamicTLSAddr(GlobalAddressSDNode *N, // Prepare argument list to generate call. ArgListTy Args; - ArgListEntry Entry; - Entry.Node = Load; - Entry.Ty = CallTy; - Args.push_back(Entry); + Args.emplace_back(Load, CallTy); // Setup call to __tls_get_addr. 
TargetLowering::CallLoweringInfo CLI(DAG); diff --git a/llvm/lib/Target/DirectX/CMakeLists.txt b/llvm/lib/Target/DirectX/CMakeLists.txt index c7c09ca..8100f94 100644 --- a/llvm/lib/Target/DirectX/CMakeLists.txt +++ b/llvm/lib/Target/DirectX/CMakeLists.txt @@ -49,6 +49,7 @@ add_llvm_target(DirectXCodeGen DirectXInfo DirectXPointerTypeAnalysis FrontendHLSL + IPO MC ScalarOpts SelectionDAG diff --git a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp index 26a113d..a1ef257 100644 --- a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp +++ b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp @@ -162,8 +162,7 @@ void DXContainerGlobals::addRootSignature(Module &M, auto &RSA = getAnalysis<RootSignatureAnalysisWrapper>().getRSInfo(); const Function *EntryFunction = MMI.EntryPropertyVec[0].Entry; - const std::optional<mcdxbc::RootSignatureDesc> &RS = - RSA.getDescForFunction(EntryFunction); + const mcdxbc::RootSignatureDesc *RS = RSA.getDescForFunction(EntryFunction); if (!RS) return; diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td index 492e078..c65ead4 100644 --- a/llvm/lib/Target/DirectX/DXIL.td +++ b/llvm/lib/Target/DirectX/DXIL.td @@ -1108,11 +1108,11 @@ def RawBufferStore : DXILOp<140, rawBufferStore> { def Dot2AddHalf : DXILOp<162, dot2AddHalf> { let Doc = "2D half dot product with accumulate to float"; let intrinsics = [IntrinSelect<int_dx_dot2add>]; - let arguments = [FloatTy, HalfTy, HalfTy, HalfTy, HalfTy]; - let result = FloatTy; - let overloads = [Overloads<DXIL1_0, []>]; - let stages = [Stages<DXIL1_0, [all_stages]>]; - let attributes = [Attributes<DXIL1_0, [ReadNone]>]; + let arguments = [OverloadTy, HalfTy, HalfTy, HalfTy, HalfTy]; + let result = OverloadTy; + let overloads = [Overloads<DXIL1_4, [FloatTy]>]; + let stages = [Stages<DXIL1_4, [all_stages]>]; + let attributes = [Attributes<DXIL1_4, [ReadNone]>]; } def Dot4AddI8Packed : DXILOp<163, dot4AddPacked> { diff --git a/llvm/lib/Target/DirectX/DXILFinalizeLinkage.cpp b/llvm/lib/Target/DirectX/DXILFinalizeLinkage.cpp index 5f331db..13e3408 100644 --- a/llvm/lib/Target/DirectX/DXILFinalizeLinkage.cpp +++ b/llvm/lib/Target/DirectX/DXILFinalizeLinkage.cpp @@ -20,13 +20,13 @@ using namespace llvm; static bool finalizeLinkage(Module &M) { bool MadeChange = false; - // Convert private global variables to internal linkage. - for (GlobalVariable &GV : M.globals()) { - if (GV.hasPrivateLinkage()) { + // Convert private globals and external globals with no usage to internal + // linkage. 
+ for (GlobalVariable &GV : M.globals()) + if (GV.hasPrivateLinkage() || (GV.hasExternalLinkage() && GV.use_empty())) { GV.setLinkage(GlobalValue::InternalLinkage); MadeChange = true; } - } SmallVector<Function *> Funcs; diff --git a/llvm/lib/Target/DirectX/DXILForwardHandleAccesses.cpp b/llvm/lib/Target/DirectX/DXILForwardHandleAccesses.cpp index 306db6a..695eacb 100644 --- a/llvm/lib/Target/DirectX/DXILForwardHandleAccesses.cpp +++ b/llvm/lib/Target/DirectX/DXILForwardHandleAccesses.cpp @@ -9,10 +9,13 @@ #include "DXILForwardHandleAccesses.h" #include "DXILShaderFlags.h" #include "DirectX.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/Analysis/DXILResource.h" #include "llvm/Analysis/Loads.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Dominators.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicsDirectX.h" @@ -70,6 +73,7 @@ static bool forwardHandleAccesses(Function &F, DominatorTree &DT) { DenseMap<GlobalVariable *, IntrinsicInst *> HandleMap; SmallVector<LoadInst *> LoadsToProcess; + DenseMap<AllocaInst *, SmallVector<IntrinsicInst *>> LifeTimeIntrinsicMap; for (BasicBlock &BB : F) for (Instruction &Inst : BB) if (auto *II = dyn_cast<IntrinsicInst>(&Inst)) { @@ -78,6 +82,14 @@ static bool forwardHandleAccesses(Function &F, DominatorTree &DT) { case Intrinsic::dx_resource_handlefromimplicitbinding: processHandle(II, HandleMap); break; + case Intrinsic::lifetime_start: + case Intrinsic::lifetime_end: + if (II->arg_size() >= 1) { + Value *Ptr = II->getArgOperand(0); + if (auto *Alloca = dyn_cast<AllocaInst>(Ptr)) + LifeTimeIntrinsicMap[Alloca].push_back(II); + } + break; default: continue; } @@ -98,8 +110,16 @@ static bool forwardHandleAccesses(Function &F, DominatorTree &DT) { NestedLI, NestedLI->getParent(), BBI, 0, nullptr, nullptr); GV = dyn_cast_or_null<GlobalVariable>(Loaded); } else if (auto *NestedAlloca = dyn_cast<AllocaInst>(V)) { - for (auto &Use : NestedAlloca->uses()) { - auto *Store = dyn_cast<StoreInst>(Use.getUser()); + + if (auto It = LifeTimeIntrinsicMap.find(NestedAlloca); + It != LifeTimeIntrinsicMap.end()) { + llvm::for_each(It->second, + [](IntrinsicInst *II) { II->eraseFromParent(); }); + LifeTimeIntrinsicMap.erase(It); + } + + for (auto *User : NestedAlloca->users()) { + auto *Store = dyn_cast<StoreInst>(User); if (!Store) continue; diff --git a/llvm/lib/Target/DirectX/DXILOpLowering.cpp b/llvm/lib/Target/DirectX/DXILOpLowering.cpp index 0ec15a6..bd421771 100644 --- a/llvm/lib/Target/DirectX/DXILOpLowering.cpp +++ b/llvm/lib/Target/DirectX/DXILOpLowering.cpp @@ -9,6 +9,7 @@ #include "DXILOpLowering.h" #include "DXILConstants.h" #include "DXILOpBuilder.h" +#include "DXILRootSignature.h" #include "DXILShaderFlags.h" #include "DirectX.h" #include "llvm/ADT/SmallVector.h" @@ -746,7 +747,7 @@ public: IRBuilder<> &IRB = OpBuilder.getIRB(); return replaceFunction(F, [&](CallInst *CI) -> Error { IRB.SetInsertPoint(CI); - Value *Ptr = CI->getArgOperand(1); + Value *Ptr = CI->getArgOperand(0); assert(Ptr->getType()->isPointerTy() && "Expected operand of lifetime intrinsic to be a pointer"); @@ -918,6 +919,7 @@ PreservedAnalyses DXILOpLowering::run(Module &M, ModuleAnalysisManager &MAM) { PA.preserve<DXILResourceAnalysis>(); PA.preserve<DXILMetadataAnalysis>(); PA.preserve<ShaderFlagsAnalysis>(); + PA.preserve<RootSignatureAnalysis>(); return PA; } @@ -945,6 +947,7 @@ public: AU.addPreserved<DXILResourceWrapperPass>(); 
AU.addPreserved<DXILMetadataAnalysisWrapperPass>(); AU.addPreserved<ShaderFlagsAnalysisWrapper>(); + AU.addPreserved<RootSignatureAnalysisWrapper>(); } }; char DXILOpLoweringLegacy::ID = 0; diff --git a/llvm/lib/Target/DirectX/DXILPostOptimizationValidation.cpp b/llvm/lib/Target/DirectX/DXILPostOptimizationValidation.cpp index 398dcbb..be2c7d1 100644 --- a/llvm/lib/Target/DirectX/DXILPostOptimizationValidation.cpp +++ b/llvm/lib/Target/DirectX/DXILPostOptimizationValidation.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "DXILPostOptimizationValidation.h" +#include "DXILRootSignature.h" #include "DXILShaderFlags.h" #include "DirectX.h" #include "llvm/ADT/SmallString.h" @@ -17,13 +18,44 @@ #include "llvm/IR/IntrinsicsDirectX.h" #include "llvm/IR/Module.h" #include "llvm/InitializePasses.h" +#include "llvm/Support/DXILABI.h" #define DEBUG_TYPE "dxil-post-optimization-validation" using namespace llvm; using namespace llvm::dxil; -namespace { +static ResourceClass toResourceClass(dxbc::DescriptorRangeType RangeType) { + using namespace dxbc; + switch (RangeType) { + case DescriptorRangeType::SRV: + return ResourceClass::SRV; + case DescriptorRangeType::UAV: + return ResourceClass::UAV; + case DescriptorRangeType::CBV: + return ResourceClass::CBuffer; + case DescriptorRangeType::Sampler: + return ResourceClass::Sampler; + } + llvm_unreachable("Unknown DescriptorRangeType"); +} + +static ResourceClass toResourceClass(dxbc::RootParameterType Type) { + using namespace dxbc; + switch (Type) { + case RootParameterType::Constants32Bit: + return ResourceClass::CBuffer; + case RootParameterType::SRV: + return ResourceClass::SRV; + case RootParameterType::UAV: + return ResourceClass::UAV; + case RootParameterType::CBV: + return ResourceClass::CBuffer; + case dxbc::RootParameterType::DescriptorTable: + llvm_unreachable("DescriptorTable is not convertible to ResourceClass"); + } + llvm_unreachable("Unknown RootParameterType"); +} static void reportInvalidDirection(Module &M, DXILResourceMap &DRM) { for (const auto &UAV : DRM.uavs()) { @@ -63,9 +95,7 @@ static void reportOverlappingError(Module &M, ResourceInfo R1, } static void reportOverlappingBinding(Module &M, DXILResourceMap &DRM) { - if (DRM.empty()) - return; - + bool ErrorFound = false; for (const auto &ResList : {DRM.srvs(), DRM.uavs(), DRM.cbuffers(), DRM.samplers()}) { if (ResList.empty()) @@ -77,15 +107,136 @@ static void reportOverlappingBinding(Module &M, DXILResourceMap &DRM) { while (RI != ResList.end() && PrevRI->getBinding().overlapsWith(RI->getBinding())) { reportOverlappingError(M, *PrevRI, *RI); + ErrorFound = true; RI++; } PrevRI = CurrentRI; } } + assert(ErrorFound && "this function should be called only when if " + "DXILResourceBindingInfo::hasOverlapingBinding() is " + "true, yet no overlapping binding was found"); +} + +static void +reportOverlappingRegisters(Module &M, + const llvm::hlsl::BindingInfoBuilder::Binding &R1, + const llvm::hlsl::BindingInfoBuilder::Binding &R2) { + SmallString<128> Message; + + raw_svector_ostream OS(Message); + OS << "resource " << getResourceClassName(R1.RC) << " (space=" << R1.Space + << ", registers=[" << R1.LowerBound << ", " << R1.UpperBound + << "]) overlaps with resource " << getResourceClassName(R2.RC) + << " (space=" << R2.Space << ", registers=[" << R2.LowerBound << ", " + << R2.UpperBound << "])"; + M.getContext().diagnose(DiagnosticInfoGeneric(Message)); +} + +static dxbc::ShaderVisibility 
+tripleToVisibility(llvm::Triple::EnvironmentType ET) { + switch (ET) { + case Triple::Pixel: + return dxbc::ShaderVisibility::Pixel; + case Triple::Vertex: + return dxbc::ShaderVisibility::Vertex; + case Triple::Geometry: + return dxbc::ShaderVisibility::Geometry; + case Triple::Hull: + return dxbc::ShaderVisibility::Hull; + case Triple::Domain: + return dxbc::ShaderVisibility::Domain; + case Triple::Mesh: + return dxbc::ShaderVisibility::Mesh; + case Triple::Compute: + return dxbc::ShaderVisibility::All; + default: + llvm_unreachable("Invalid triple to shader stage conversion"); + } +} + +static void validateRootSignature(Module &M, + const mcdxbc::RootSignatureDesc &RSD, + dxil::ModuleMetadataInfo &MMI) { + + hlsl::BindingInfoBuilder Builder; + dxbc::ShaderVisibility Visibility = tripleToVisibility(MMI.ShaderProfile); + + for (const mcdxbc::RootParameterInfo &ParamInfo : RSD.ParametersContainer) { + dxbc::ShaderVisibility ParamVisibility = + static_cast<dxbc::ShaderVisibility>(ParamInfo.Header.ShaderVisibility); + if (ParamVisibility != dxbc::ShaderVisibility::All && + ParamVisibility != Visibility) + continue; + dxbc::RootParameterType ParamType = + static_cast<dxbc::RootParameterType>(ParamInfo.Header.ParameterType); + switch (ParamType) { + case dxbc::RootParameterType::Constants32Bit: { + dxbc::RTS0::v1::RootConstants Const = + RSD.ParametersContainer.getConstant(ParamInfo.Location); + Builder.trackBinding(dxil::ResourceClass::CBuffer, Const.RegisterSpace, + Const.ShaderRegister, Const.ShaderRegister, + &ParamInfo); + break; + } + + case dxbc::RootParameterType::SRV: + case dxbc::RootParameterType::UAV: + case dxbc::RootParameterType::CBV: { + dxbc::RTS0::v2::RootDescriptor Desc = + RSD.ParametersContainer.getRootDescriptor(ParamInfo.Location); + Builder.trackBinding(toResourceClass(static_cast<dxbc::RootParameterType>( + ParamInfo.Header.ParameterType)), + Desc.RegisterSpace, Desc.ShaderRegister, + Desc.ShaderRegister, &ParamInfo); + + break; + } + case dxbc::RootParameterType::DescriptorTable: { + const mcdxbc::DescriptorTable &Table = + RSD.ParametersContainer.getDescriptorTable(ParamInfo.Location); + + for (const dxbc::RTS0::v2::DescriptorRange &Range : Table.Ranges) { + uint32_t UpperBound = + Range.NumDescriptors == ~0U + ? 
Range.BaseShaderRegister + : Range.BaseShaderRegister + Range.NumDescriptors - 1; + Builder.trackBinding( + toResourceClass( + static_cast<dxbc::DescriptorRangeType>(Range.RangeType)), + Range.RegisterSpace, Range.BaseShaderRegister, UpperBound, + &ParamInfo); + } + break; + } + } + } + + for (const dxbc::RTS0::v1::StaticSampler &S : RSD.StaticSamplers) + Builder.trackBinding(dxil::ResourceClass::Sampler, S.RegisterSpace, + S.ShaderRegister, S.ShaderRegister, &S); + + Builder.calculateBindingInfo( + [&M](const llvm::hlsl::BindingInfoBuilder &Builder, + const llvm::hlsl::BindingInfoBuilder::Binding &ReportedBinding) { + const llvm::hlsl::BindingInfoBuilder::Binding &Overlaping = + Builder.findOverlapping(ReportedBinding); + reportOverlappingRegisters(M, ReportedBinding, Overlaping); + }); +} + +static mcdxbc::RootSignatureDesc * +getRootSignature(RootSignatureBindingInfo &RSBI, + dxil::ModuleMetadataInfo &MMI) { + if (MMI.EntryPropertyVec.size() == 0) + return nullptr; + return RSBI.getDescForFunction(MMI.EntryPropertyVec[0].Entry); } static void reportErrors(Module &M, DXILResourceMap &DRM, - DXILResourceBindingInfo &DRBI) { + DXILResourceBindingInfo &DRBI, + RootSignatureBindingInfo &RSBI, + dxil::ModuleMetadataInfo &MMI) { if (DRM.hasInvalidCounterDirection()) reportInvalidDirection(M, DRM); @@ -94,14 +245,19 @@ static void reportErrors(Module &M, DXILResourceMap &DRM, assert(!DRBI.hasImplicitBinding() && "implicit bindings should be handled in " "DXILResourceImplicitBinding pass"); + + if (mcdxbc::RootSignatureDesc *RSD = getRootSignature(RSBI, MMI)) + validateRootSignature(M, *RSD, MMI); } -} // namespace PreservedAnalyses DXILPostOptimizationValidation::run(Module &M, ModuleAnalysisManager &MAM) { DXILResourceMap &DRM = MAM.getResult<DXILResourceAnalysis>(M); DXILResourceBindingInfo &DRBI = MAM.getResult<DXILResourceBindingAnalysis>(M); - reportErrors(M, DRM, DRBI); + RootSignatureBindingInfo &RSBI = MAM.getResult<RootSignatureAnalysis>(M); + ModuleMetadataInfo &MMI = MAM.getResult<DXILMetadataAnalysis>(M); + + reportErrors(M, DRM, DRBI, RSBI, MMI); return PreservedAnalyses::all(); } @@ -113,7 +269,12 @@ public: getAnalysis<DXILResourceWrapperPass>().getResourceMap(); DXILResourceBindingInfo &DRBI = getAnalysis<DXILResourceBindingWrapperPass>().getBindingInfo(); - reportErrors(M, DRM, DRBI); + RootSignatureBindingInfo &RSBI = + getAnalysis<RootSignatureAnalysisWrapper>().getRSInfo(); + dxil::ModuleMetadataInfo &MMI = + getAnalysis<DXILMetadataAnalysisWrapperPass>().getModuleMetadata(); + + reportErrors(M, DRM, DRBI, RSBI, MMI); return false; } StringRef getPassName() const override { @@ -125,10 +286,13 @@ public: void getAnalysisUsage(llvm::AnalysisUsage &AU) const override { AU.addRequired<DXILResourceWrapperPass>(); AU.addRequired<DXILResourceBindingWrapperPass>(); + AU.addRequired<DXILMetadataAnalysisWrapperPass>(); + AU.addRequired<RootSignatureAnalysisWrapper>(); AU.addPreserved<DXILResourceWrapperPass>(); AU.addPreserved<DXILResourceBindingWrapperPass>(); AU.addPreserved<DXILMetadataAnalysisWrapperPass>(); AU.addPreserved<ShaderFlagsAnalysisWrapper>(); + AU.addPreserved<RootSignatureAnalysisWrapper>(); } }; char DXILPostOptimizationValidationLegacy::ID = 0; @@ -139,6 +303,8 @@ INITIALIZE_PASS_BEGIN(DXILPostOptimizationValidationLegacy, DEBUG_TYPE, INITIALIZE_PASS_DEPENDENCY(DXILResourceBindingWrapperPass) INITIALIZE_PASS_DEPENDENCY(DXILResourceTypeWrapperPass) INITIALIZE_PASS_DEPENDENCY(DXILResourceWrapperPass) +INITIALIZE_PASS_DEPENDENCY(DXILMetadataAnalysisWrapperPass) 
+INITIALIZE_PASS_DEPENDENCY(RootSignatureAnalysisWrapper) INITIALIZE_PASS_END(DXILPostOptimizationValidationLegacy, DEBUG_TYPE, "DXIL Post Optimization Validation", false, false) diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.h b/llvm/lib/Target/DirectX/DXILRootSignature.h index 254b7ff..b990b6c 100644 --- a/llvm/lib/Target/DirectX/DXILRootSignature.h +++ b/llvm/lib/Target/DirectX/DXILRootSignature.h @@ -43,13 +43,11 @@ public: iterator end() { return FuncToRsMap.end(); } - std::optional<mcdxbc::RootSignatureDesc> - getDescForFunction(const Function *F) { + mcdxbc::RootSignatureDesc *getDescForFunction(const Function *F) { const auto FuncRs = find(F); if (FuncRs == end()) - return std::nullopt; - - return FuncRs->second; + return nullptr; + return &FuncRs->second; } }; diff --git a/llvm/lib/Target/DirectX/DXILWriter/DXILWriterPass.cpp b/llvm/lib/Target/DirectX/DXILWriter/DXILWriterPass.cpp index 1bd5dd7..1eb03bf 100644 --- a/llvm/lib/Target/DirectX/DXILWriter/DXILWriterPass.cpp +++ b/llvm/lib/Target/DirectX/DXILWriter/DXILWriterPass.cpp @@ -13,11 +13,15 @@ #include "DXILWriterPass.h" #include "DXILBitcodeWriter.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/ModuleSummaryAnalysis.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" #include "llvm/InitializePasses.h" @@ -54,49 +58,81 @@ public: }; static void legalizeLifetimeIntrinsics(Module &M) { - for (Function &F : M) { - Intrinsic::ID IID = F.getIntrinsicID(); - if (IID != Intrinsic::lifetime_start && IID != Intrinsic::lifetime_end) + LLVMContext &Ctx = M.getContext(); + Type *I64Ty = IntegerType::get(Ctx, 64); + Type *PtrTy = PointerType::get(Ctx, 0); + Intrinsic::ID LifetimeIIDs[2] = {Intrinsic::lifetime_start, + Intrinsic::lifetime_end}; + for (Intrinsic::ID &IID : LifetimeIIDs) { + Function *F = M.getFunction(Intrinsic::getName(IID, {PtrTy}, &M)); + if (!F) continue; - // Lifetime intrinsics in LLVM 3.7 do not have the memory FnAttr - F.removeFnAttr(Attribute::Memory); - - // Lifetime intrinsics in LLVM 3.7 do not have mangled names - F.setName(Intrinsic::getBaseName(IID)); - - // LLVM 3.7 Lifetime intrinics require an i8* operand, so we insert bitcasts - // to ensure that is the case - for (auto *User : make_early_inc_range(F.users())) { - CallInst *CI = dyn_cast<CallInst>(User); - assert(CI && "Expected user of a lifetime intrinsic function to be a " - "lifetime intrinsic call"); - Value *PtrOperand = CI->getArgOperand(1); - PointerType *PtrTy = cast<PointerType>(PtrOperand->getType()); + // Get or insert an LLVM 3.7-compliant lifetime intrinsic function of the + // form `void @llvm.lifetime.[start/end](i64, ptr)` with the NoUnwind + // attribute + AttributeList Attr; + Attr = Attr.addFnAttribute(Ctx, Attribute::NoUnwind); + FunctionCallee LifetimeCallee = M.getOrInsertFunction( + Intrinsic::getBaseName(IID), Attr, Type::getVoidTy(Ctx), I64Ty, PtrTy); + + // Replace all calls to lifetime intrinsics with calls to the + // LLVM 3.7-compliant version of the lifetime intrinsic + for (User *U : make_early_inc_range(F->users())) { + CallInst *CI = dyn_cast<CallInst>(U); + assert(CI && + "Expected user of a lifetime intrinsic function to be a CallInst"); + + // LLVM 3.7 lifetime intrinsics require an i8* operand, so we insert + // a bitcast to 
ensure that is the case + Value *PtrOperand = CI->getArgOperand(0); + PointerType *PtrOpPtrTy = cast<PointerType>(PtrOperand->getType()); Value *NoOpBitCast = CastInst::Create(Instruction::BitCast, PtrOperand, - PtrTy, "", CI->getIterator()); - CI->setArgOperand(1, NoOpBitCast); + PtrOpPtrTy, "", CI->getIterator()); + + // LLVM 3.7 lifetime intrinsics have an explicit size operand, whose value + // we can obtain from the pointer operand which must be an AllocaInst (as + // of https://github.com/llvm/llvm-project/pull/149310) + AllocaInst *AI = dyn_cast<AllocaInst>(PtrOperand); + assert(AI && + "The pointer operand of a lifetime intrinsic call must be an " + "AllocaInst"); + std::optional<TypeSize> AllocSize = + AI->getAllocationSize(CI->getDataLayout()); + assert(AllocSize.has_value() && + "Expected the allocation size of AllocaInst to be known"); + CallInst *NewCI = CallInst::Create( + LifetimeCallee, + {ConstantInt::get(I64Ty, AllocSize.value().getFixedValue()), + NoOpBitCast}, + "", CI->getIterator()); + for (Attribute ParamAttr : CI->getParamAttributes(0)) + NewCI->addParamAttr(1, ParamAttr); + + CI->eraseFromParent(); } + + F->eraseFromParent(); } } static void removeLifetimeIntrinsics(Module &M) { - for (Function &F : make_early_inc_range(M)) { - if (Intrinsic::ID IID = F.getIntrinsicID(); - IID != Intrinsic::lifetime_start && IID != Intrinsic::lifetime_end) + Intrinsic::ID LifetimeIIDs[2] = {Intrinsic::lifetime_start, + Intrinsic::lifetime_end}; + for (Intrinsic::ID &IID : LifetimeIIDs) { + Function *F = M.getFunction(Intrinsic::getBaseName(IID)); + if (!F) continue; - for (User *U : make_early_inc_range(F.users())) { - LifetimeIntrinsic *LI = dyn_cast<LifetimeIntrinsic>(U); - assert(LI && "Expected user of lifetime intrinsic function to be " - "a LifetimeIntrinsic instruction"); - BitCastInst *BCI = dyn_cast<BitCastInst>(LI->getArgOperand(1)); - assert(BCI && "Expected pointer operand of LifetimeIntrinsic to be a " - "BitCastInst"); - LI->eraseFromParent(); + for (User *U : make_early_inc_range(F->users())) { + CallInst *CI = dyn_cast<CallInst>(U); + assert(CI && "Expected user of lifetime function to be a CallInst"); + BitCastInst *BCI = dyn_cast<BitCastInst>(CI->getArgOperand(1)); + assert(BCI && "Expected pointer operand of CallInst to be a BitCastInst"); + CI->eraseFromParent(); BCI->eraseFromParent(); } - F.eraseFromParent(); + F->eraseFromParent(); } } diff --git a/llvm/lib/Target/DirectX/DirectXPassRegistry.def b/llvm/lib/Target/DirectX/DirectXPassRegistry.def index d506954..b4b48a16 100644 --- a/llvm/lib/Target/DirectX/DirectXPassRegistry.def +++ b/llvm/lib/Target/DirectX/DirectXPassRegistry.def @@ -24,6 +24,7 @@ MODULE_ANALYSIS("dxil-root-signature-analysis", dxil::RootSignatureAnalysis()) #define MODULE_PASS(NAME, CREATE_PASS) #endif MODULE_PASS("dxil-cbuffer-access", DXILCBufferAccess()) +MODULE_PASS("dxil-finalize-linkage", DXILFinalizeLinkage()) MODULE_PASS("dxil-data-scalarization", DXILDataScalarization()) MODULE_PASS("dxil-flatten-arrays", DXILFlattenArrays()) MODULE_PASS("dxil-intrinsic-expansion", DXILIntrinsicExpansion()) diff --git a/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp b/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp index 84751d2..f5d5a73 100644 --- a/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp +++ b/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp @@ -14,6 +14,7 @@ #include "DirectXTargetMachine.h" #include "DXILCBufferAccess.h" #include "DXILDataScalarization.h" +#include "DXILFinalizeLinkage.h" #include "DXILFlattenArrays.h" #include 
"DXILForwardHandleAccesses.h" #include "DXILIntrinsicExpansion.h" @@ -45,6 +46,8 @@ #include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Target/TargetLoweringObjectFile.h" +#include "llvm/Transforms/IPO/GlobalDCE.h" +#include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/Scalarizer.h" #include <optional> @@ -62,6 +65,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeDirectXTarget() { initializeEmbedDXILPassPass(*PR); initializeWriteDXILPassPass(*PR); initializeDXContainerGlobalsPass(*PR); + initializeGlobalDCELegacyPassPass(*PR); initializeDXILOpLoweringLegacyPass(*PR); initializeDXILResourceAccessLegacyPass(*PR); initializeDXILResourceImplicitBindingLegacyPass(*PR); @@ -72,6 +76,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeDirectXTarget() { initializeDXILFinalizeLinkageLegacyPass(*PR); initializeDXILPrettyPrinterLegacyPass(*PR); initializeDXILForwardHandleAccessesLegacyPass(*PR); + initializeDSELegacyPassPass(*PR); initializeDXILCBufferAccessLegacyPass(*PR); } @@ -103,6 +108,7 @@ public: FunctionPass *createTargetRegisterAllocator(bool) override { return nullptr; } void addCodeGenPrepare() override { addPass(createDXILFinalizeLinkageLegacyPass()); + addPass(createGlobalDCEPass()); addPass(createDXILResourceAccessLegacyPass()); addPass(createDXILIntrinsicExpansionLegacyPass()); addPass(createDXILCBufferAccessLegacyPass()); @@ -112,6 +118,7 @@ public: addPass(createScalarizerPass(DxilScalarOptions)); addPass(createDXILFlattenArraysLegacyPass()); addPass(createDXILForwardHandleAccessesLegacyPass()); + addPass(createDeadStoreEliminationPass()); addPass(createDXILLegalizeLegacyPass()); addPass(createDXILResourceImplicitBindingLegacyPass()); addPass(createDXILTranslateMetadataLegacyPass()); diff --git a/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp b/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp index 22cff7c..bcddb54 100644 --- a/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp +++ b/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp @@ -526,6 +526,9 @@ DecodeStatus HexagonDisassembler::getSingleInstruction(MCInst &MI, MCInst &MCB, MI.insert(MI.begin() + 1, MCOperand::createExpr(MCConstantExpr::create(-1, getContext()))); break; + case Hexagon::Y4_crswap10: + MI.addOperand(MCOperand::createReg(Hexagon::SGP1_0)); + break; default: break; } diff --git a/llvm/lib/Target/Hexagon/HexagonCallingConv.td b/llvm/lib/Target/Hexagon/HexagonCallingConv.td index e0302b8..fd6d873 100644 --- a/llvm/lib/Target/Hexagon/HexagonCallingConv.td +++ b/llvm/lib/Target/Hexagon/HexagonCallingConv.td @@ -6,11 +6,6 @@ // //===----------------------------------------------------------------------===// -class CCIfArgIsVarArg<CCAction A> - : CCIf<"State.isVarArg() && " - "ValNo >= static_cast<HexagonCCState&>(State)" - ".getNumNamedVarArgParams()", A>; - def CC_HexagonStack: CallingConv<[ CCIfType<[i32,v2i16,v4i8], CCAssignToStack<4,4>>, @@ -28,7 +23,7 @@ def CC_Hexagon_Legacy: CallingConv<[ CCIfByVal< CCPassByVal<8,8>>, - CCIfArgIsVarArg< + CCIfArgVarArg< CCDelegateTo<CC_HexagonStack>>, // Pass split values in pairs, allocate odd register if necessary. @@ -58,7 +53,7 @@ def CC_Hexagon: CallingConv<[ CCIfByVal< CCPassByVal<8,1>>, - CCIfArgIsVarArg< + CCIfArgVarArg< CCDelegateTo<CC_HexagonStack>>, // Pass split values in pairs, allocate odd register if necessary. 
diff --git a/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp b/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp index a920146..b2218ab 100644 --- a/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp +++ b/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp @@ -1273,7 +1273,7 @@ void HexagonGenInsert::selectCandidates() { for (unsigned R = AllRMs.find_first(); R; R = AllRMs.find_next(R)) { using use_iterator = MachineRegisterInfo::use_nodbg_iterator; - using InstrSet = SmallSet<const MachineInstr *, 16>; + using InstrSet = SmallPtrSet<const MachineInstr *, 16>; InstrSet UIs; // Count as the number of instructions in which R is used, not the diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp index facea64..c54b67c 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -116,23 +116,6 @@ static cl::opt<bool> cl::desc("Disable minimum alignment of 1 for " "arguments passed by value on stack")); -namespace { - - class HexagonCCState : public CCState { - unsigned NumNamedVarArgParams = 0; - - public: - HexagonCCState(CallingConv::ID CC, bool IsVarArg, MachineFunction &MF, - SmallVectorImpl<CCValAssign> &locs, LLVMContext &C, - unsigned NumNamedArgs) - : CCState(CC, IsVarArg, MF, locs, C), - NumNamedVarArgParams(NumNamedArgs) {} - unsigned getNumNamedVarArgParams() const { return NumNamedVarArgParams; } - }; - -} // end anonymous namespace - - // Implement calling convention for Hexagon. static bool CC_SkipOdd(unsigned &ValNo, MVT &ValVT, MVT &LocVT, @@ -497,7 +480,6 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, MachineFrameInfo &MFI = MF.getFrameInfo(); auto PtrVT = getPointerTy(MF.getDataLayout()); - unsigned NumParams = CLI.CB ? CLI.CB->getFunctionType()->getNumParams() : 0; if (GlobalAddressSDNode *GAN = dyn_cast<GlobalAddressSDNode>(Callee)) Callee = DAG.getTargetGlobalAddress(GAN->getGlobal(), dl, MVT::i32); @@ -506,8 +488,7 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Analyze operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ArgLocs; - HexagonCCState CCInfo(CallConv, TreatAsVarArg, MF, ArgLocs, *DAG.getContext(), - NumParams); + CCState CCInfo(CallConv, TreatAsVarArg, MF, ArgLocs, *DAG.getContext()); if (Subtarget.useHVXOps()) CCInfo.AnalyzeCallOperands(Outs, CC_Hexagon_HVX); @@ -880,9 +861,7 @@ SDValue HexagonTargetLowering::LowerFormalArguments( // Assign locations to all of the incoming arguments. SmallVector<CCValAssign, 16> ArgLocs; - HexagonCCState CCInfo(CallConv, TreatAsVarArg, MF, ArgLocs, - *DAG.getContext(), - MF.getFunction().getFunctionType()->getNumParams()); + CCState CCInfo(CallConv, TreatAsVarArg, MF, ArgLocs, *DAG.getContext()); if (Subtarget.useHVXOps()) CCInfo.AnalyzeFormalArguments(Ins, CC_Hexagon_HVX); diff --git a/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp b/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp index c34eecd..a3717bb 100644 --- a/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp +++ b/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp @@ -2289,7 +2289,7 @@ CleanupAndExit: // the instructions in Insts are removed. 
bool HexagonLoopIdiomRecognize::coverLoop(Loop *L, SmallVectorImpl<Instruction*> &Insts) const { - SmallSet<BasicBlock*,8> LoopBlocks; + SmallPtrSet<BasicBlock *, 8> LoopBlocks; LoopBlocks.insert_range(L->blocks()); SetVector<Instruction *> Worklist(llvm::from_range, Insts); diff --git a/llvm/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp b/llvm/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp index 610a81f..33aa6e4 100644 --- a/llvm/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp +++ b/llvm/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp @@ -32,14 +32,10 @@ SDValue HexagonSelectionDAGInfo::EmitTargetCodeForMemcpy( // const TargetLowering &TLI = *DAG.getSubtarget().getTargetLowering(); TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; - Entry.Ty = DAG.getDataLayout().getIntPtrType(*DAG.getContext()); - Entry.Node = Dst; - Args.push_back(Entry); - Entry.Node = Src; - Args.push_back(Entry); - Entry.Node = Size; - Args.push_back(Entry); + Type *ArgTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext()); + Args.emplace_back(Dst, ArgTy); + Args.emplace_back(Src, ArgTy); + Args.emplace_back(Size, ArgTy); const char *SpecialMemcpyName = TLI.getLibcallName( RTLIB::HEXAGON_MEMCPY_LIKELY_ALIGNED_MIN32BYTES_MULT8BYTES); diff --git a/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp b/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp index ecc1b5d..6a05b5a 100644 --- a/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp +++ b/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp @@ -445,8 +445,8 @@ void HexagonSubtarget::adjustSchedDependency( const HexagonInstrInfo *QII = getInstrInfo(); // Instructions with .new operands have zero latency. - SmallSet<SUnit *, 4> ExclSrc; - SmallSet<SUnit *, 4> ExclDst; + SmallPtrSet<SUnit *, 4> ExclSrc; + SmallPtrSet<SUnit *, 4> ExclDst; if (QII->canExecuteInBundle(*SrcInst, *DstInst) && isBestZeroLatency(Src, Dst, QII, ExclSrc, ExclDst)) { Dep.setLatency(0); @@ -630,9 +630,9 @@ static SUnit *getZeroLatency(SUnit *N, SmallVector<SDep, 4> &Deps) { // together with a zero latency. Only one dependence should have a zero // latency. If there are multiple choices, choose the best, and change // the others, if needed. 
-bool HexagonSubtarget::isBestZeroLatency(SUnit *Src, SUnit *Dst, - const HexagonInstrInfo *TII, SmallSet<SUnit*, 4> &ExclSrc, - SmallSet<SUnit*, 4> &ExclDst) const { +bool HexagonSubtarget::isBestZeroLatency( + SUnit *Src, SUnit *Dst, const HexagonInstrInfo *TII, + SmallPtrSet<SUnit *, 4> &ExclSrc, SmallPtrSet<SUnit *, 4> &ExclDst) const { MachineInstr &SrcInst = *Src->getInstr(); MachineInstr &DstInst = *Dst->getInstr(); diff --git a/llvm/lib/Target/Hexagon/HexagonSubtarget.h b/llvm/lib/Target/Hexagon/HexagonSubtarget.h index 41555db..b111471 100644 --- a/llvm/lib/Target/Hexagon/HexagonSubtarget.h +++ b/llvm/lib/Target/Hexagon/HexagonSubtarget.h @@ -366,7 +366,8 @@ private: void restoreLatency(SUnit *Src, SUnit *Dst) const; void changeLatency(SUnit *Src, SUnit *Dst, unsigned Lat) const; bool isBestZeroLatency(SUnit *Src, SUnit *Dst, const HexagonInstrInfo *TII, - SmallSet<SUnit*, 4> &ExclSrc, SmallSet<SUnit*, 4> &ExclDst) const; + SmallPtrSet<SUnit *, 4> &ExclSrc, + SmallPtrSet<SUnit *, 4> &ExclDst) const; }; } // end namespace llvm diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp index 9fb7d47..171e294 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp +++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp @@ -156,9 +156,10 @@ HexagonTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, return BaseT::getIntrinsicInstrCost(ICA, CostKind); } -InstructionCost HexagonTTIImpl::getAddressComputationCost(Type *Tp, - ScalarEvolution *SE, - const SCEV *S) const { +InstructionCost +HexagonTTIImpl::getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, + const SCEV *S, + TTI::TargetCostKind CostKind) const { return 0; } diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h index af8dede7..dbf16c9 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h +++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h @@ -111,8 +111,9 @@ public: InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override; - InstructionCost getAddressComputationCost(Type *Tp, ScalarEvolution *SE, - const SCEV *S) const override; + InstructionCost + getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *S, + TTI::TargetCostKind CostKind) const override; InstructionCost getMemoryOpCost( unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, diff --git a/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp b/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp index 3de6df5..87d052b 100644 --- a/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp +++ b/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp @@ -1677,9 +1677,9 @@ auto HvxIdioms::matchFxpMul(Instruction &In) const -> std::optional<FxpOp> { return m_CombineOr(m_LShr(V, S), m_AShr(V, S)); }; - const APInt *Qn = nullptr; - if (Value * T; match(Exp, m_Shr(m_Value(T), m_APInt(Qn)))) { - Op.Frac = Qn->getZExtValue(); + uint64_t Qn = 0; + if (Value *T; match(Exp, m_Shr(m_Value(T), m_ConstantInt(Qn)))) { + Op.Frac = Qn; Exp = T; } else { Op.Frac = 0; @@ -1689,9 +1689,9 @@ auto HvxIdioms::matchFxpMul(Instruction &In) const -> std::optional<FxpOp> { return std::nullopt; // Check if there is rounding added. 
- const APInt *C = nullptr; - if (Value * T; Op.Frac > 0 && match(Exp, m_Add(m_Value(T), m_APInt(C)))) { - uint64_t CV = C->getZExtValue(); + uint64_t CV; + if (Value *T; + Op.Frac > 0 && match(Exp, m_Add(m_Value(T), m_ConstantInt(CV)))) { if (CV != 0 && !isPowerOf2_64(CV)) return std::nullopt; if (CV != 0) diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp index 039ef4f..6b8d7f1 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp @@ -32,7 +32,6 @@ #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCSymbolELF.h" -#include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/HexagonAttributes.h" diff --git a/llvm/lib/Target/Lanai/LanaiISelLowering.cpp b/llvm/lib/Target/Lanai/LanaiISelLowering.cpp index d23c5f4..7a0a510 100644 --- a/llvm/lib/Target/Lanai/LanaiISelLowering.cpp +++ b/llvm/lib/Target/Lanai/LanaiISelLowering.cpp @@ -356,12 +356,13 @@ void LanaiTargetLowering::LowerAsmOperandForConstraint( static unsigned NumFixedArgs; static bool CC_Lanai32_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State) { + ISD::ArgFlagsTy ArgFlags, Type *OrigTy, + CCState &State) { // Handle fixed arguments with default CC. // Note: Both the default and fast CC handle VarArg the same and hence the // calling convention of the function is not considered here. if (ValNo < NumFixedArgs) { - return CC_Lanai32(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State); + return CC_Lanai32(ValNo, ValVT, LocVT, LocInfo, ArgFlags, OrigTy, State); } // Promote i8/i16 args to i32 diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index 6583a0f..5b2d185 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -22,6 +22,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/CodeGen/ISDOpcodes.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/RuntimeLibcallUtil.h" #include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/IR/IRBuilder.h" @@ -2786,7 +2787,7 @@ SDValue LoongArchTargetLowering::lowerUINT_TO_FP(SDValue Op, EVT RetVT = Op.getValueType(); RTLIB::Libcall LC = RTLIB::getUINTTOFP(OpVT, RetVT); MakeLibCallOptions CallOptions; - CallOptions.setTypeListBeforeSoften(OpVT, RetVT, true); + CallOptions.setTypeListBeforeSoften(OpVT, RetVT); SDValue Chain = SDValue(); SDValue Result; std::tie(Result, Chain) = @@ -2811,7 +2812,7 @@ SDValue LoongArchTargetLowering::lowerSINT_TO_FP(SDValue Op, EVT RetVT = Op.getValueType(); RTLIB::Libcall LC = RTLIB::getSINTTOFP(OpVT, RetVT); MakeLibCallOptions CallOptions; - CallOptions.setTypeListBeforeSoften(OpVT, RetVT, true); + CallOptions.setTypeListBeforeSoften(OpVT, RetVT); SDValue Chain = SDValue(); SDValue Result; std::tie(Result, Chain) = @@ -3037,10 +3038,7 @@ SDValue LoongArchTargetLowering::getDynamicTLSAddr(GlobalAddressSDNode *N, // Prepare argument list to generate call. ArgListTy Args; - ArgListEntry Entry; - Entry.Node = Load; - Entry.Ty = CallTy; - Args.push_back(Entry); + Args.emplace_back(Load, CallTy); // Setup call to __tls_get_addr. 
TargetLowering::CallLoweringInfo CLI(DAG); @@ -4107,7 +4105,7 @@ void LoongArchTargetLowering::ReplaceNodeResults( LC = RTLIB::getFPTOSINT(Src.getValueType(), VT); MakeLibCallOptions CallOptions; EVT OpVT = Src.getValueType(); - CallOptions.setTypeListBeforeSoften(OpVT, VT, true); + CallOptions.setTypeListBeforeSoften(OpVT, VT); SDValue Chain = SDValue(); SDValue Result; std::tie(Result, Chain) = @@ -4360,7 +4358,7 @@ void LoongArchTargetLowering::ReplaceNodeResults( RTLIB::Libcall LC = OpVT == MVT::f64 ? RTLIB::LROUND_F64 : RTLIB::LROUND_F32; MakeLibCallOptions CallOptions; - CallOptions.setTypeListBeforeSoften(OpVT, MVT::i64, true); + CallOptions.setTypeListBeforeSoften(OpVT, MVT::i64); SDValue Result = makeLibCall(DAG, LC, MVT::i64, Op0, CallOptions, DL).first; Result = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Result); Results.push_back(Result); @@ -6042,17 +6040,20 @@ static MachineBasicBlock * emitPseudoXVINSGR2VR(MachineInstr &MI, MachineBasicBlock *BB, const LoongArchSubtarget &Subtarget) { unsigned InsOp; + unsigned BroadcastOp; unsigned HalfSize; switch (MI.getOpcode()) { default: llvm_unreachable("Unexpected opcode"); case LoongArch::PseudoXVINSGR2VR_B: HalfSize = 16; - InsOp = LoongArch::VINSGR2VR_B; + BroadcastOp = LoongArch::XVREPLGR2VR_B; + InsOp = LoongArch::XVEXTRINS_B; break; case LoongArch::PseudoXVINSGR2VR_H: HalfSize = 8; - InsOp = LoongArch::VINSGR2VR_H; + BroadcastOp = LoongArch::XVREPLGR2VR_H; + InsOp = LoongArch::XVEXTRINS_H; break; } const TargetInstrInfo *TII = Subtarget.getInstrInfo(); @@ -6066,37 +6067,41 @@ emitPseudoXVINSGR2VR(MachineInstr &MI, MachineBasicBlock *BB, Register Elt = MI.getOperand(2).getReg(); unsigned Idx = MI.getOperand(3).getImm(); - Register ScratchReg1 = XSrc; - if (Idx >= HalfSize) { - ScratchReg1 = MRI.createVirtualRegister(RC); - BuildMI(*BB, MI, DL, TII->get(LoongArch::XVPERMI_D), ScratchReg1) - .addReg(XSrc) - .addImm(14); - } + if (XSrc.isVirtual() && MRI.getVRegDef(XSrc)->isImplicitDef() && + Idx < HalfSize) { + Register ScratchSubReg1 = MRI.createVirtualRegister(SubRC); + Register ScratchSubReg2 = MRI.createVirtualRegister(SubRC); - Register ScratchSubReg1 = MRI.createVirtualRegister(SubRC); - Register ScratchSubReg2 = MRI.createVirtualRegister(SubRC); - BuildMI(*BB, MI, DL, TII->get(LoongArch::COPY), ScratchSubReg1) - .addReg(ScratchReg1, 0, LoongArch::sub_128); - BuildMI(*BB, MI, DL, TII->get(InsOp), ScratchSubReg2) - .addReg(ScratchSubReg1) - .addReg(Elt) - .addImm(Idx >= HalfSize ? Idx - HalfSize : Idx); + BuildMI(*BB, MI, DL, TII->get(LoongArch::COPY), ScratchSubReg1) + .addReg(XSrc, 0, LoongArch::sub_128); + BuildMI(*BB, MI, DL, + TII->get(HalfSize == 8 ? LoongArch::VINSGR2VR_H + : LoongArch::VINSGR2VR_B), + ScratchSubReg2) + .addReg(ScratchSubReg1) + .addReg(Elt) + .addImm(Idx); + + BuildMI(*BB, MI, DL, TII->get(LoongArch::SUBREG_TO_REG), XDst) + .addImm(0) + .addReg(ScratchSubReg2) + .addImm(LoongArch::sub_128); + } else { + Register ScratchReg1 = MRI.createVirtualRegister(RC); + Register ScratchReg2 = MRI.createVirtualRegister(RC); - Register ScratchReg2 = XDst; - if (Idx >= HalfSize) - ScratchReg2 = MRI.createVirtualRegister(RC); + BuildMI(*BB, MI, DL, TII->get(BroadcastOp), ScratchReg1).addReg(Elt); - BuildMI(*BB, MI, DL, TII->get(LoongArch::SUBREG_TO_REG), ScratchReg2) - .addImm(0) - .addReg(ScratchSubReg2) - .addImm(LoongArch::sub_128); + BuildMI(*BB, MI, DL, TII->get(LoongArch::XVPERMI_Q), ScratchReg2) + .addReg(ScratchReg1) + .addReg(XSrc) + .addImm(Idx >= HalfSize ? 
48 : 18); - if (Idx >= HalfSize) - BuildMI(*BB, MI, DL, TII->get(LoongArch::XVPERMI_Q), XDst) + BuildMI(*BB, MI, DL, TII->get(InsOp), XDst) .addReg(XSrc) .addReg(ScratchReg2) - .addImm(2); + .addImm((Idx >= HalfSize ? Idx - HalfSize : Idx) * 17); + } MI.eraseFromParent(); return BB; @@ -7073,7 +7078,8 @@ static SDValue convertValVTToLocVT(SelectionDAG &DAG, SDValue Val, static bool CC_LoongArch_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State) { + ISD::ArgFlagsTy ArgFlags, Type *OrigTy, + CCState &State) { if (LocVT == MVT::i32 || LocVT == MVT::i64) { // Pass in STG registers: Base, Sp, Hp, R1, R2, R3, R4, R5, SpLim // s0 s1 s2 s3 s4 s5 s6 s7 s8 diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td index d8bb16f..0696b11 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td @@ -1640,6 +1640,24 @@ defm : PairInsertExtractPatV8<v8f32, f32>; defm : PairInsertExtractPatV4<v4i64, GRLenVT>; defm : PairInsertExtractPatV4<v4f64, f64>; +def : Pat<(vector_insert v8i32:$xd, (GRLenVT(vector_extract v8i32:$xj, 0)), + uimm3:$imm), + (XVINSVE0_W v8i32:$xd, v8i32:$xj, uimm3:$imm)>; + +def : Pat<(vector_insert v4i64:$xd, (GRLenVT(vector_extract v4i64:$xj, 0)), + uimm2:$imm), + (XVINSVE0_D v4i64:$xd, v4i64:$xj, uimm2:$imm)>; + +def : Pat<(vector_insert v8i32:$xd, + (GRLenVT(vector_extract v8i32:$xj, uimm3:$imm1)), uimm3:$imm2), + (XVINSVE0_W v8i32:$xd, (XVPICKVE_W v8i32:$xj, uimm3:$imm1), + uimm3:$imm2)>; + +def : Pat<(vector_insert v4i64:$xd, + (GRLenVT(vector_extract v4i64:$xj, uimm2:$imm1)), uimm2:$imm2), + (XVINSVE0_D v4i64:$xd, (XVPICKVE_D v4i64:$xj, uimm2:$imm1), + uimm2:$imm2)>; + // PseudoXVINSGR2VR_{B/H} def : Pat<(vector_insert v32i8:$xd, GRLenVT:$rj, uimm5:$imm), (PseudoXVINSGR2VR_B v32i8:$xd, GRLenVT:$rj, uimm5:$imm)>; diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp index ca5d27d..3b38ac9 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp @@ -143,8 +143,6 @@ static void fixupLeb128(MCContext &Ctx, const MCFixup &Fixup, uint8_t *Data, void LoongArchAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, const MCValue &Target, uint8_t *Data, uint64_t Value, bool IsResolved) { - if (IsResolved && shouldForceRelocation(Fixup, Target)) - IsResolved = false; IsResolved = addReloc(F, Fixup, Target, Value, IsResolved); if (!Value) return; // Doesn't change encoding. @@ -176,20 +174,6 @@ void LoongArchAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, } } -bool LoongArchAsmBackend::shouldForceRelocation(const MCFixup &Fixup, - const MCValue &Target) { - switch (Fixup.getKind()) { - default: - return STI.hasFeature(LoongArch::FeatureRelax); - case FK_Data_1: - case FK_Data_2: - case FK_Data_4: - case FK_Data_8: - case FK_Data_leb128: - return !Target.isAbsolute(); - } -} - static inline std::pair<MCFixupKind, MCFixupKind> getRelocPairForSize(unsigned Size) { switch (Size) { @@ -216,10 +200,19 @@ getRelocPairForSize(unsigned Size) { // size, the fixup encodes MaxBytesToEmit in the higher bits and references a // per-section marker symbol. 
bool LoongArchAsmBackend::relaxAlign(MCFragment &F, unsigned &Size) { + // Alignments before the first linker-relaxable instruction have fixed sizes + // and do not require relocations. Alignments after a linker-relaxable + // instruction require a relocation, even if the STI specifies norelax. + // + // firstLinkerRelaxable is the layout order within the subsection, which may + // be smaller than the section's order. Therefore, alignments in a + // lower-numbered subsection may be unnecessarily treated as linker-relaxable. + auto *Sec = F.getParent(); + if (F.getLayoutOrder() <= Sec->firstLinkerRelaxable()) + return false; + // Use default handling unless linker relaxation is enabled and the // MaxBytesToEmit >= the nop size. - if (!F.getSubtargetInfo()->hasFeature(LoongArch::FeatureRelax)) - return false; const unsigned MinNopLen = 4; unsigned MaxBytesToEmit = F.getAlignMaxBytesToEmit(); if (MaxBytesToEmit < MinNopLen) @@ -254,8 +247,6 @@ bool LoongArchAsmBackend::relaxAlign(MCFragment &F, unsigned &Size) { MCFixup::create(0, Expr, FirstLiteralRelocationKind + ELF::R_LARCH_ALIGN); F.setVarFixups({Fixup}); F.setLinkerRelaxable(); - if (!F.getParent()->isLinkerRelaxable()) - F.getParent()->setFirstLinkerRelaxable(F.getLayoutOrder()); return true; } @@ -448,10 +439,10 @@ bool LoongArchAsmBackend::addReloc(const MCFragment &F, const MCFixup &Fixup, isPCRelFixupResolved(Target.getSubSym(), F)) return Fallback(); - // In SecA == SecB case. If the linker relaxation is disabled, the + // In SecA == SecB case. If the section is not linker-relaxable, the // FixedValue has already been calculated out in evaluateFixup, // return true and avoid record relocations. - if (&SecA == &SecB && !STI.hasFeature(LoongArch::FeatureRelax)) + if (&SecA == &SecB && !SecA.isLinkerRelaxable()) return true; } @@ -484,9 +475,16 @@ bool LoongArchAsmBackend::addReloc(const MCFragment &F, const MCFixup &Fixup, return false; } - IsResolved = Fallback(); // If linker relaxation is enabled and supported by the current relocation, - // append a RELAX relocation. + // generate a relocation and then append a RELAX. 
+ if (Fixup.isLinkerRelaxable()) + IsResolved = false; + if (IsResolved && Fixup.isPCRel()) + IsResolved = isPCRelFixupResolved(Target.getAddSym(), F); + + if (!IsResolved) + Asm->getWriter().recordRelocation(F, Fixup, Target, FixedValue); + if (Fixup.isLinkerRelaxable()) { auto FA = MCFixup::create(Fixup.getOffset(), nullptr, ELF::R_LARCH_RELAX); Asm->getWriter().recordRelocation(F, FA, MCValue::get(nullptr), @@ -498,8 +496,7 @@ bool LoongArchAsmBackend::addReloc(const MCFragment &F, const MCFixup &Fixup, std::unique_ptr<MCObjectTargetWriter> LoongArchAsmBackend::createObjectTargetWriter() const { - return createLoongArchELFObjectWriter( - OSABI, Is64Bit, STI.hasFeature(LoongArch::FeatureRelax)); + return createLoongArchELFObjectWriter(OSABI, Is64Bit); } MCAsmBackend *llvm::createLoongArchAsmBackend(const Target &T, diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h index 1f13601..f79d3aa 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h @@ -44,8 +44,6 @@ public: void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target, uint8_t *Data, uint64_t Value, bool IsResolved) override; - bool shouldForceRelocation(const MCFixup &Fixup, const MCValue &Target); - std::optional<MCFixupKind> getFixupKind(StringRef Name) const override; MCFixupKindInfo getFixupKindInfo(MCFixupKind Kind) const override; diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp index 7e021e4..7d54565 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp @@ -21,26 +21,23 @@ using namespace llvm; namespace { class LoongArchELFObjectWriter : public MCELFObjectTargetWriter { public: - LoongArchELFObjectWriter(uint8_t OSABI, bool Is64Bit, bool EnableRelax); + LoongArchELFObjectWriter(uint8_t OSABI, bool Is64Bit); ~LoongArchELFObjectWriter() override; bool needsRelocateWithSymbol(const MCValue &, unsigned Type) const override { - return EnableRelax; + return true; } protected: unsigned getRelocType(const MCFixup &, const MCValue &, bool IsPCRel) const override; - bool EnableRelax; }; } // end namespace -LoongArchELFObjectWriter::LoongArchELFObjectWriter(uint8_t OSABI, bool Is64Bit, - bool EnableRelax) +LoongArchELFObjectWriter::LoongArchELFObjectWriter(uint8_t OSABI, bool Is64Bit) : MCELFObjectTargetWriter(Is64Bit, OSABI, ELF::EM_LOONGARCH, - /*HasRelocationAddend=*/true), - EnableRelax(EnableRelax) {} + /*HasRelocationAddend=*/true) {} LoongArchELFObjectWriter::~LoongArchELFObjectWriter() {} @@ -103,6 +100,6 @@ unsigned LoongArchELFObjectWriter::getRelocType(const MCFixup &Fixup, } std::unique_ptr<MCObjectTargetWriter> -llvm::createLoongArchELFObjectWriter(uint8_t OSABI, bool Is64Bit, bool Relax) { - return std::make_unique<LoongArchELFObjectWriter>(OSABI, Is64Bit, Relax); +llvm::createLoongArchELFObjectWriter(uint8_t OSABI, bool Is64Bit) { + return std::make_unique<LoongArchELFObjectWriter>(OSABI, Is64Bit); } diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.h index bb05baa..ab35a00 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.h +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.h @@ -36,7 +36,7 @@ MCAsmBackend 
*createLoongArchAsmBackend(const Target &T, const MCTargetOptions &Options); std::unique_ptr<MCObjectTargetWriter> -createLoongArchELFObjectWriter(uint8_t OSABI, bool Is64Bit, bool Relax); +createLoongArchELFObjectWriter(uint8_t OSABI, bool Is64Bit); } // end namespace llvm diff --git a/llvm/lib/Target/M68k/M68kISelLowering.cpp b/llvm/lib/Target/M68k/M68kISelLowering.cpp index 594ea9f..12c6e1e 100644 --- a/llvm/lib/Target/M68k/M68kISelLowering.cpp +++ b/llvm/lib/Target/M68k/M68kISelLowering.cpp @@ -51,7 +51,9 @@ M68kTargetLowering::M68kTargetLowering(const M68kTargetMachine &TM, MVT PtrVT = MVT::i32; - setBooleanContents(ZeroOrOneBooleanContent); + // This is based on M68k SetCC (scc) setting the destination byte to all 1s. + // See also getSetCCResultType(). + setBooleanContents(ZeroOrNegativeOneBooleanContent); auto *RegInfo = Subtarget.getRegisterInfo(); setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister()); @@ -1454,10 +1456,7 @@ SDValue M68kTargetLowering::getTLSGetAddr(GlobalAddressSDNode *GA, PointerType *PtrTy = PointerType::get(*DAG.getContext(), 0); ArgListTy Args; - ArgListEntry Entry; - Entry.Node = Arg; - Entry.Ty = PtrTy; - Args.push_back(Entry); + Args.emplace_back(Arg, PtrTy); return LowerExternalSymbolCall(DAG, SDLoc(GA), "__tls_get_addr", std::move(Args)); } diff --git a/llvm/lib/Target/M68k/M68kInstrArithmetic.td b/llvm/lib/Target/M68k/M68kInstrArithmetic.td index e2d4e49..56b71db 100644 --- a/llvm/lib/Target/M68k/M68kInstrArithmetic.td +++ b/llvm/lib/Target/M68k/M68kInstrArithmetic.td @@ -835,7 +835,7 @@ def : Pat<(MxSub 0, i8 :$src), (NEG8d MxDRD8 :$src)>; def : Pat<(MxSub 0, i16:$src), (NEG16d MxDRD16:$src)>; def : Pat<(MxSub 0, i32:$src), (NEG32d MxDRD32:$src)>; // SExt of i1 values. -// Although we specify `ZeroOrOneBooleanContent` for boolean content, +// Although we specify `ZeroOrNegativeOneBooleanContent` for boolean content, // we're still adding an AND here as we don't know the origin of the i1 value. 
def : Pat<(sext_inreg i8:$src, i1), (NEG8d (AND8di MxDRD8:$src, 1))>; def : Pat<(sext_inreg i16:$src, i1), (NEG16d (AND16di MxDRD16:$src, 1))>; diff --git a/llvm/lib/Target/M68k/M68kInstrData.td b/llvm/lib/Target/M68k/M68kInstrData.td index f4ed627..c5b7ae3 100644 --- a/llvm/lib/Target/M68k/M68kInstrData.td +++ b/llvm/lib/Target/M68k/M68kInstrData.td @@ -701,18 +701,22 @@ def: Pat<(MxExtLoadi16i8 MxCP_ARID:$src), (EXTRACT_SUBREG (MOVZXd32p8 MxARID8:$src), MxSubRegIndex16Lo)>; def: Pat<(MxExtLoadi16i8 MxCP_ARII:$src), (EXTRACT_SUBREG (MOVZXd32f8 MxARII8:$src), MxSubRegIndex16Lo)>; +def: Pat<(MxExtLoadi16i8 MxCP_PCD:$src), + (EXTRACT_SUBREG (MOVZXd32q8 MxPCD8:$src), MxSubRegIndex16Lo)>; // i32 <- anyext i8 def: Pat<(i32 (anyext i8:$src)), (MOVZXd32d8 MxDRD8:$src)>; def: Pat<(MxExtLoadi32i8 MxCP_ARI :$src), (MOVZXd32j8 MxARI8 :$src)>; def: Pat<(MxExtLoadi32i8 MxCP_ARID:$src), (MOVZXd32p8 MxARID8:$src)>; def: Pat<(MxExtLoadi32i8 MxCP_ARII:$src), (MOVZXd32f8 MxARII8:$src)>; +def: Pat<(MxExtLoadi32i8 MxCP_PCD:$src), (MOVZXd32q8 MxPCD8:$src)>; // i32 <- anyext i16 def: Pat<(i32 (anyext i16:$src)), (MOVZXd32d16 MxDRD16:$src)>; def: Pat<(MxExtLoadi32i16 MxCP_ARI :$src), (MOVZXd32j16 MxARI16 :$src)>; def: Pat<(MxExtLoadi32i16 MxCP_ARID:$src), (MOVZXd32p16 MxARID16:$src)>; def: Pat<(MxExtLoadi32i16 MxCP_ARII:$src), (MOVZXd32f16 MxARII16:$src)>; +def: Pat<(MxExtLoadi32i16 MxCP_PCD:$src), (MOVZXd32q16 MxPCD16:$src)>; // trunc patterns def : Pat<(i16 (trunc i32:$src)), diff --git a/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp b/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp index d23504c..6da5e66 100644 --- a/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp +++ b/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp @@ -377,6 +377,7 @@ static void AnalyzeArguments(CCState &State, for (unsigned i = 0, e = ArgsParts.size(); i != e; i++) { MVT ArgVT = Args[ValNo].VT; ISD::ArgFlagsTy ArgFlags = Args[ValNo].Flags; + Type *OrigTy = Args[ValNo].OrigTy; MVT LocVT = ArgVT; CCValAssign::LocInfo LocInfo = CCValAssign::Full; @@ -411,7 +412,8 @@ static void AnalyzeArguments(CCState &State, RegsLeft -= 1; UsedStack = true; - CC_MSP430_AssignStack(ValNo++, ArgVT, LocVT, LocInfo, ArgFlags, State); + CC_MSP430_AssignStack(ValNo++, ArgVT, LocVT, LocInfo, ArgFlags, OrigTy, + State); } else if (Parts <= RegsLeft) { for (unsigned j = 0; j < Parts; j++) { MCRegister Reg = State.AllocateReg(RegList); @@ -421,7 +423,8 @@ static void AnalyzeArguments(CCState &State, } else { UsedStack = true; for (unsigned j = 0; j < Parts; j++) - CC_MSP430_AssignStack(ValNo++, ArgVT, LocVT, LocInfo, ArgFlags, State); + CC_MSP430_AssignStack(ValNo++, ArgVT, LocVT, LocInfo, ArgFlags, OrigTy, + State); } } } diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp index 16247bd..680d279 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp @@ -17,7 +17,6 @@ #include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCSymbolELF.h" #include "llvm/MC/MCValue.h" -#include "llvm/Support/Casting.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp index feeadc5e..a8b7c9e 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp @@ -18,7 +18,6 @@ #include 
"llvm/MC/MCInst.h" #include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCSymbolELF.h" -#include "llvm/Support/Casting.h" using namespace llvm; diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp index 5df70c4..1e1b970 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp @@ -26,7 +26,6 @@ #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbolELF.h" -#include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FormattedStream.h" diff --git a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp index 4530fc6..ae91c97 100644 --- a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp +++ b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp @@ -51,7 +51,6 @@ #include "llvm/MC/MCObjectFileInfo.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCSymbol.h" -#include "llvm/MC/MCSymbolELF.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Compiler.h" diff --git a/llvm/lib/Target/Mips/MipsCCState.cpp b/llvm/lib/Target/Mips/MipsCCState.cpp index 13237c5..86bb3e6 100644 --- a/llvm/lib/Target/Mips/MipsCCState.cpp +++ b/llvm/lib/Target/Mips/MipsCCState.cpp @@ -12,59 +12,6 @@ using namespace llvm; -bool MipsCCState::isF128SoftLibCall(const char *CallSym) { - const char *const LibCalls[] = { - "__addtf3", "__divtf3", "__eqtf2", "__extenddftf2", - "__extendsftf2", "__fixtfdi", "__fixtfsi", "__fixtfti", - "__fixunstfdi", "__fixunstfsi", "__fixunstfti", "__floatditf", - "__floatsitf", "__floattitf", "__floatunditf", "__floatunsitf", - "__floatuntitf", "__getf2", "__gttf2", "__letf2", - "__lttf2", "__multf3", "__netf2", "__powitf2", - "__subtf3", "__trunctfdf2", "__trunctfsf2", "__unordtf2", - "ceill", "copysignl", "cosl", "exp2l", - "expl", "floorl", "fmal", "fmaxl", - "fmodl", "log10l", "log2l", "logl", - "nearbyintl", "powl", "rintl", "roundl", - "sinl", "sqrtl", "truncl"}; - - // Check that LibCalls is sorted alphabetically. - auto Comp = [](const char *S1, const char *S2) { return strcmp(S1, S2) < 0; }; - assert(llvm::is_sorted(LibCalls, Comp)); - return llvm::binary_search(LibCalls, CallSym, Comp); -} - -/// This function returns true if Ty is fp128, {f128} or i128 which was -/// originally a fp128. -bool MipsCCState::originalTypeIsF128(const Type *Ty, const char *Func) { - if (Ty->isFP128Ty()) - return true; - - if (Ty->isStructTy() && Ty->getStructNumElements() == 1 && - Ty->getStructElementType(0)->isFP128Ty()) - return true; - - // If the Ty is i128 and the function being called is a long double emulation - // routine, then the original type is f128. - // FIXME: This is unsound because these functions could be indirectly called - return (Func && Ty->isIntegerTy(128) && isF128SoftLibCall(Func)); -} - -/// Return true if the original type was vXfXX. -bool MipsCCState::originalEVTTypeIsVectorFloat(EVT Ty) { - if (Ty.isVector() && Ty.getVectorElementType().isFloatingPoint()) - return true; - - return false; -} - -/// Return true if the original type was vXfXX / vXfXX. 
-bool MipsCCState::originalTypeIsVectorFloat(const Type *Ty) { - if (Ty->isVectorTy() && Ty->isFPOrFPVectorTy()) - return true; - - return false; -} - MipsCCState::SpecialCallingConvType MipsCCState::getSpecialCallingConvForCallee(const SDNode *Callee, const MipsSubtarget &Subtarget) { @@ -81,123 +28,3 @@ MipsCCState::getSpecialCallingConvForCallee(const SDNode *Callee, } return SpecialCallingConv; } - -void MipsCCState::PreAnalyzeCallResultForF128( - const SmallVectorImpl<ISD::InputArg> &Ins, - const Type *RetTy, const char *Call) { - for (unsigned i = 0; i < Ins.size(); ++i) { - OriginalArgWasF128.push_back( - originalTypeIsF128(RetTy, Call)); - OriginalArgWasFloat.push_back(RetTy->isFloatingPointTy()); - } -} - -/// Identify lowered values that originated from f128 or float arguments and -/// record this for use by RetCC_MipsN. -void MipsCCState::PreAnalyzeCallReturnForF128( - const SmallVectorImpl<ISD::OutputArg> &Outs, const Type *RetTy) { - for (unsigned i = 0; i < Outs.size(); ++i) { - OriginalArgWasF128.push_back( - originalTypeIsF128(RetTy, nullptr)); - OriginalArgWasFloat.push_back( - RetTy->isFloatingPointTy()); - } -} - -/// Identify lower values that originated from vXfXX and record -/// this. -void MipsCCState::PreAnalyzeCallResultForVectorFloat( - const SmallVectorImpl<ISD::InputArg> &Ins, const Type *RetTy) { - for (unsigned i = 0; i < Ins.size(); ++i) { - OriginalRetWasFloatVector.push_back(originalTypeIsVectorFloat(RetTy)); - } -} - -/// Identify lowered values that originated from vXfXX arguments and record -/// this. -void MipsCCState::PreAnalyzeReturnForVectorFloat( - const SmallVectorImpl<ISD::OutputArg> &Outs) { - for (unsigned i = 0; i < Outs.size(); ++i) { - ISD::OutputArg Out = Outs[i]; - OriginalRetWasFloatVector.push_back( - originalEVTTypeIsVectorFloat(Out.ArgVT)); - } -} - -void MipsCCState::PreAnalyzeReturnValue(EVT ArgVT) { - OriginalRetWasFloatVector.push_back(originalEVTTypeIsVectorFloat(ArgVT)); -} - -void MipsCCState::PreAnalyzeCallOperand(const Type *ArgTy, const char *Func) { - OriginalArgWasF128.push_back(originalTypeIsF128(ArgTy, Func)); - OriginalArgWasFloat.push_back(ArgTy->isFloatingPointTy()); - OriginalArgWasFloatVector.push_back(ArgTy->isVectorTy()); -} - -/// Identify lowered values that originated from f128, float and sret to vXfXX -/// arguments and record this. -void MipsCCState::PreAnalyzeCallOperands( - const SmallVectorImpl<ISD::OutputArg> &Outs, - std::vector<TargetLowering::ArgListEntry> &FuncArgs, - const char *Func) { - for (unsigned i = 0; i < Outs.size(); ++i) { - TargetLowering::ArgListEntry FuncArg = FuncArgs[Outs[i].OrigArgIndex]; - - OriginalArgWasF128.push_back(originalTypeIsF128(FuncArg.Ty, Func)); - OriginalArgWasFloat.push_back(FuncArg.Ty->isFloatingPointTy()); - OriginalArgWasFloatVector.push_back(FuncArg.Ty->isVectorTy()); - } -} - -void MipsCCState::PreAnalyzeFormalArgument(const Type *ArgTy, - ISD::ArgFlagsTy Flags) { - // SRet arguments cannot originate from f128 or {f128} returns so we just - // push false. We have to handle this specially since SRet arguments - // aren't mapped to an original argument. 
- if (Flags.isSRet()) { - OriginalArgWasF128.push_back(false); - OriginalArgWasFloat.push_back(false); - OriginalArgWasFloatVector.push_back(false); - return; - } - - OriginalArgWasF128.push_back(originalTypeIsF128(ArgTy, nullptr)); - OriginalArgWasFloat.push_back(ArgTy->isFloatingPointTy()); - - // The MIPS vector ABI exhibits a corner case of sorts or quirk; if the - // first argument is actually an SRet pointer to a vector, then the next - // argument slot is $a2. - OriginalArgWasFloatVector.push_back(ArgTy->isVectorTy()); -} - -/// Identify lowered values that originated from f128, float and vXfXX arguments -/// and record this. -void MipsCCState::PreAnalyzeFormalArgumentsForF128( - const SmallVectorImpl<ISD::InputArg> &Ins) { - const MachineFunction &MF = getMachineFunction(); - for (unsigned i = 0; i < Ins.size(); ++i) { - Function::const_arg_iterator FuncArg = MF.getFunction().arg_begin(); - - // SRet arguments cannot originate from f128 or {f128} returns so we just - // push false. We have to handle this specially since SRet arguments - // aren't mapped to an original argument. - if (Ins[i].Flags.isSRet()) { - OriginalArgWasF128.push_back(false); - OriginalArgWasFloat.push_back(false); - OriginalArgWasFloatVector.push_back(false); - continue; - } - - assert(Ins[i].getOrigArgIndex() < MF.getFunction().arg_size()); - std::advance(FuncArg, Ins[i].getOrigArgIndex()); - - OriginalArgWasF128.push_back( - originalTypeIsF128(FuncArg->getType(), nullptr)); - OriginalArgWasFloat.push_back(FuncArg->getType()->isFloatingPointTy()); - - // The MIPS vector ABI exhibits a corner case of sorts or quirk; if the - // first argument is actually an SRet pointer to a vector, then the next - // argument slot is $a2. - OriginalArgWasFloatVector.push_back(FuncArg->getType()->isVectorTy()); - } -} diff --git a/llvm/lib/Target/Mips/MipsCCState.h b/llvm/lib/Target/Mips/MipsCCState.h index 30b68e8..4c36d42 100644 --- a/llvm/lib/Target/Mips/MipsCCState.h +++ b/llvm/lib/Target/Mips/MipsCCState.h @@ -26,66 +26,7 @@ public: getSpecialCallingConvForCallee(const SDNode *Callee, const MipsSubtarget &Subtarget); - /// This function returns true if CallSym is a long double emulation routine. - /// - /// FIXME: Changing the ABI based on the callee name is unsound. The lib func - /// address could be captured. - static bool isF128SoftLibCall(const char *CallSym); - - static bool originalTypeIsF128(const Type *Ty, const char *Func); - static bool originalEVTTypeIsVectorFloat(EVT Ty); - static bool originalTypeIsVectorFloat(const Type *Ty); - - void PreAnalyzeCallOperand(const Type *ArgTy, const char *Func); - - void PreAnalyzeFormalArgument(const Type *ArgTy, ISD::ArgFlagsTy Flags); - void PreAnalyzeReturnValue(EVT ArgVT); - private: - /// Identify lowered values that originated from f128 arguments and record - /// this for use by RetCC_MipsN. - void PreAnalyzeCallResultForF128(const SmallVectorImpl<ISD::InputArg> &Ins, - const Type *RetTy, const char * Func); - - /// Identify lowered values that originated from f128 arguments and record - /// this for use by RetCC_MipsN. - void PreAnalyzeCallReturnForF128(const SmallVectorImpl<ISD::OutputArg> &Outs, const Type *RetTy); - - /// Identify lowered values that originated from f128 arguments and record - /// this. - void - PreAnalyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs, - std::vector<TargetLowering::ArgListEntry> &FuncArgs, - const char *Func); - - /// Identify lowered values that originated from f128 arguments and record - /// this for use by RetCC_MipsN. 
- void - PreAnalyzeFormalArgumentsForF128(const SmallVectorImpl<ISD::InputArg> &Ins); - - void - PreAnalyzeCallResultForVectorFloat(const SmallVectorImpl<ISD::InputArg> &Ins, - const Type *RetTy); - - void PreAnalyzeFormalArgumentsForVectorFloat( - const SmallVectorImpl<ISD::InputArg> &Ins); - - void - PreAnalyzeReturnForVectorFloat(const SmallVectorImpl<ISD::OutputArg> &Outs); - - /// Records whether the value has been lowered from an f128. - SmallVector<bool, 4> OriginalArgWasF128; - - /// Records whether the value has been lowered from float. - SmallVector<bool, 4> OriginalArgWasFloat; - - /// Records whether the value has been lowered from a floating point vector. - SmallVector<bool, 4> OriginalArgWasFloatVector; - - /// Records whether the return value has been lowered from a floating point - /// vector. - SmallVector<bool, 4> OriginalRetWasFloatVector; - // Used to handle MIPS16-specific calling convention tweaks. // FIXME: This should probably be a fully fledged calling convention. SpecialCallingConvType SpecialCallingConv; @@ -96,118 +37,6 @@ public: SpecialCallingConvType SpecialCC = NoSpecialCallingConv) : CCState(CC, isVarArg, MF, locs, C), SpecialCallingConv(SpecialCC) {} - void PreAnalyzeCallOperands( - const SmallVectorImpl<ISD::OutputArg> &Outs, CCAssignFn Fn, - std::vector<TargetLowering::ArgListEntry> &FuncArgs, const char *Func) { - OriginalArgWasF128.clear(); - OriginalArgWasFloat.clear(); - OriginalArgWasFloatVector.clear(); - PreAnalyzeCallOperands(Outs, FuncArgs, Func); - } - - void - AnalyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs, - CCAssignFn Fn, - std::vector<TargetLowering::ArgListEntry> &FuncArgs, - const char *Func) { - PreAnalyzeCallOperands(Outs, Fn, FuncArgs, Func); - CCState::AnalyzeCallOperands(Outs, Fn); - } - - // The AnalyzeCallOperands in the base class is not usable since we must - // provide a means of accessing ArgListEntry::IsFixed. Delete them from this - // class. This doesn't stop them being used via the base class though. 
- void AnalyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs, - CCAssignFn Fn) = delete; - void AnalyzeCallOperands(const SmallVectorImpl<MVT> &Outs, - SmallVectorImpl<ISD::ArgFlagsTy> &Flags, - CCAssignFn Fn) = delete; - - void PreAnalyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Ins, - CCAssignFn Fn) { - OriginalArgWasFloat.clear(); - OriginalArgWasF128.clear(); - OriginalArgWasFloatVector.clear(); - PreAnalyzeFormalArgumentsForF128(Ins); - } - - void AnalyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Ins, - CCAssignFn Fn) { - PreAnalyzeFormalArguments(Ins, Fn); - CCState::AnalyzeFormalArguments(Ins, Fn); - } - - void PreAnalyzeCallResult(const Type *RetTy, const char *Func) { - OriginalArgWasF128.push_back(originalTypeIsF128(RetTy, Func)); - OriginalArgWasFloat.push_back(RetTy->isFloatingPointTy()); - OriginalRetWasFloatVector.push_back(originalTypeIsVectorFloat(RetTy)); - } - - void PreAnalyzeCallResult(const SmallVectorImpl<ISD::InputArg> &Ins, - CCAssignFn Fn, const Type *RetTy, - const char *Func) { - OriginalArgWasFloat.clear(); - OriginalArgWasF128.clear(); - OriginalArgWasFloatVector.clear(); - PreAnalyzeCallResultForF128(Ins, RetTy, Func); - PreAnalyzeCallResultForVectorFloat(Ins, RetTy); - } - - void AnalyzeCallResult(const SmallVectorImpl<ISD::InputArg> &Ins, - CCAssignFn Fn, const Type *RetTy, - const char *Func) { - PreAnalyzeCallResult(Ins, Fn, RetTy, Func); - CCState::AnalyzeCallResult(Ins, Fn); - } - - void PreAnalyzeReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, - CCAssignFn Fn) { - const MachineFunction &MF = getMachineFunction(); - OriginalArgWasFloat.clear(); - OriginalArgWasF128.clear(); - OriginalArgWasFloatVector.clear(); - PreAnalyzeCallReturnForF128(Outs, MF.getFunction().getReturnType()); - PreAnalyzeReturnForVectorFloat(Outs); - } - - void AnalyzeReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, - CCAssignFn Fn) { - PreAnalyzeReturn(Outs, Fn); - CCState::AnalyzeReturn(Outs, Fn); - } - - bool CheckReturn(const SmallVectorImpl<ISD::OutputArg> &ArgsFlags, - CCAssignFn Fn) { - const MachineFunction &MF = getMachineFunction(); - PreAnalyzeCallReturnForF128(ArgsFlags, MF.getFunction().getReturnType()); - PreAnalyzeReturnForVectorFloat(ArgsFlags); - bool Return = CCState::CheckReturn(ArgsFlags, Fn); - OriginalArgWasFloat.clear(); - OriginalArgWasF128.clear(); - OriginalArgWasFloatVector.clear(); - return Return; - } - - bool CheckCallReturn(const SmallVectorImpl<ISD::OutputArg> &ArgsFlags, - CCAssignFn Fn, const Type *RetTy) { - PreAnalyzeCallReturnForF128(ArgsFlags, RetTy); - PreAnalyzeReturnForVectorFloat(ArgsFlags); - bool Return = CCState::CheckReturn(ArgsFlags, Fn); - OriginalArgWasFloat.clear(); - OriginalArgWasF128.clear(); - OriginalArgWasFloatVector.clear(); - return Return; - } - bool WasOriginalArgF128(unsigned ValNo) { return OriginalArgWasF128[ValNo]; } - bool WasOriginalArgFloat(unsigned ValNo) { - return OriginalArgWasFloat[ValNo]; - } - bool WasOriginalArgVectorFloat(unsigned ValNo) const { - return OriginalArgWasFloatVector[ValNo]; - } - bool WasOriginalRetVectorFloat(unsigned ValNo) const { - return OriginalRetWasFloatVector[ValNo]; - } SpecialCallingConvType getSpecialCallingConv() { return SpecialCallingConv; } }; } diff --git a/llvm/lib/Target/Mips/MipsCallLowering.cpp b/llvm/lib/Target/Mips/MipsCallLowering.cpp index fa49108..35194e7 100644 --- a/llvm/lib/Target/Mips/MipsCallLowering.cpp +++ b/llvm/lib/Target/Mips/MipsCallLowering.cpp @@ -26,62 +26,6 @@ MipsCallLowering::MipsCallLowering(const 
MipsTargetLowering &TLI) : CallLowering(&TLI) {} namespace { -struct MipsOutgoingValueAssigner : public CallLowering::OutgoingValueAssigner { - /// This is the name of the function being called - /// FIXME: Relying on this is unsound - const char *Func = nullptr; - - /// Is this a return value, or an outgoing call operand. - bool IsReturn; - - MipsOutgoingValueAssigner(CCAssignFn *AssignFn_, const char *Func, - bool IsReturn) - : OutgoingValueAssigner(AssignFn_), Func(Func), IsReturn(IsReturn) {} - - bool assignArg(unsigned ValNo, EVT OrigVT, MVT ValVT, MVT LocVT, - CCValAssign::LocInfo LocInfo, - const CallLowering::ArgInfo &Info, ISD::ArgFlagsTy Flags, - CCState &State_) override { - MipsCCState &State = static_cast<MipsCCState &>(State_); - - if (IsReturn) - State.PreAnalyzeReturnValue(EVT::getEVT(Info.Ty)); - else - State.PreAnalyzeCallOperand(Info.Ty, Func); - - return CallLowering::OutgoingValueAssigner::assignArg( - ValNo, OrigVT, ValVT, LocVT, LocInfo, Info, Flags, State); - } -}; - -struct MipsIncomingValueAssigner : public CallLowering::IncomingValueAssigner { - /// This is the name of the function being called - /// FIXME: Relying on this is unsound - const char *Func = nullptr; - - /// Is this a call return value, or an incoming function argument. - bool IsReturn; - - MipsIncomingValueAssigner(CCAssignFn *AssignFn_, const char *Func, - bool IsReturn) - : IncomingValueAssigner(AssignFn_), Func(Func), IsReturn(IsReturn) {} - - bool assignArg(unsigned ValNo, EVT OrigVT, MVT ValVT, MVT LocVT, - CCValAssign::LocInfo LocInfo, - const CallLowering::ArgInfo &Info, ISD::ArgFlagsTy Flags, - CCState &State_) override { - MipsCCState &State = static_cast<MipsCCState &>(State_); - - if (IsReturn) - State.PreAnalyzeCallResult(Info.Ty, Func); - else - State.PreAnalyzeFormalArgument(Info.Ty, Flags); - - return CallLowering::IncomingValueAssigner::assignArg( - ValNo, OrigVT, ValVT, LocVT, LocInfo, Info, Flags, State); - } -}; - class MipsIncomingValueHandler : public CallLowering::IncomingValueHandler { const MipsSubtarget &STI; @@ -339,9 +283,7 @@ bool MipsCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, F.getContext()); MipsOutgoingValueHandler RetHandler(MIRBuilder, MF.getRegInfo(), Ret); - std::string FuncName = F.getName().str(); - MipsOutgoingValueAssigner Assigner(TLI.CCAssignFnForReturn(), - FuncName.c_str(), /*IsReturn*/ true); + OutgoingValueAssigner Assigner(TLI.CCAssignFnForReturn()); if (!determineAssignments(Assigner, RetInfos, CCInfo)) return false; @@ -392,9 +334,7 @@ bool MipsCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, CCInfo.AllocateStack(ABI.GetCalleeAllocdArgSizeInBytes(F.getCallingConv()), Align(1)); - const std::string FuncName = F.getName().str(); - MipsIncomingValueAssigner Assigner(TLI.CCAssignFnForCall(), FuncName.c_str(), - /*IsReturn*/ false); + IncomingValueAssigner Assigner(TLI.CCAssignFnForCall()); if (!determineAssignments(Assigner, ArgInfos, CCInfo)) return false; @@ -510,11 +450,7 @@ bool MipsCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, CCInfo.AllocateStack(ABI.GetCalleeAllocdArgSizeInBytes(Info.CallConv), Align(1)); - const char *Call = - Info.Callee.isSymbol() ? 
Info.Callee.getSymbolName() : nullptr; - - MipsOutgoingValueAssigner Assigner(TLI.CCAssignFnForCall(), Call, - /*IsReturn*/ false); + OutgoingValueAssigner Assigner(TLI.CCAssignFnForCall()); if (!determineAssignments(Assigner, ArgInfos, CCInfo)) return false; @@ -550,11 +486,8 @@ bool MipsCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, CallLowering::splitToValueTypes(Info.OrigRet, ArgInfos, DL, F.getCallingConv()); - const std::string FuncName = F.getName().str(); SmallVector<CCValAssign, 8> ArgLocs; - MipsIncomingValueAssigner Assigner(TLI.CCAssignFnForReturn(), - FuncName.c_str(), - /*IsReturn*/ true); + IncomingValueAssigner Assigner(TLI.CCAssignFnForReturn()); CallReturnHandler RetHandler(MIRBuilder, MF.getRegInfo(), MIB); MipsCCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, diff --git a/llvm/lib/Target/Mips/MipsCallingConv.td b/llvm/lib/Target/Mips/MipsCallingConv.td index 0e5c16c..3501f9fb 100644 --- a/llvm/lib/Target/Mips/MipsCallingConv.td +++ b/llvm/lib/Target/Mips/MipsCallingConv.td @@ -20,19 +20,15 @@ class CCIfSubtargetNot<string F, CCAction A> : CCIfSubtarget<F, A, "!">; /// Match if the original argument (before lowering) was a float. /// For example, this is true for i32's that were lowered from soft-float. -class CCIfOrigArgWasFloat<CCAction A> - : CCIf<"static_cast<MipsCCState *>(&State)->WasOriginalArgFloat(ValNo)", - A>; +class CCIfOrigArgWasFloat<CCAction A> : CCIf<"OrigTy->isFloatingPointTy()", A>; /// Match if the original argument (before lowering) was a 128-bit float (i.e. /// long double). -class CCIfOrigArgWasF128<CCAction A> - : CCIf<"static_cast<MipsCCState *>(&State)->WasOriginalArgF128(ValNo)", A>; +class CCIfOrigArgWasF128<CCAction A> : CCIf<"OrigTy->isFP128Ty()", A>; -/// Match if the return was a floating point vector. +/// Match if the return was not a floating point vector. class CCIfOrigArgWasNotVectorFloat<CCAction A> - : CCIf<"!static_cast<MipsCCState *>(&State)" - "->WasOriginalRetVectorFloat(ValNo)", A>; + : CCIf<"!OrigTy->isVectorTy() || !OrigTy->isFPOrFPVectorTy()", A>; /// Match if the special calling conv is the specified value. class CCIfSpecialCallingConv<string CC, CCAction A> diff --git a/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp b/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp index 8067dbc..2a2ccf7 100644 --- a/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp +++ b/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp @@ -232,7 +232,7 @@ namespace { /// NewWaterList - The subset of WaterList that was created since the /// previous iteration by inserting unconditional branches. 
- SmallSet<MachineBasicBlock*, 4> NewWaterList; + SmallPtrSet<MachineBasicBlock *, 4> NewWaterList; using water_iterator = std::vector<MachineBasicBlock *>::iterator; diff --git a/llvm/lib/Target/Mips/MipsFastISel.cpp b/llvm/lib/Target/Mips/MipsFastISel.cpp index f3812d1..1ce8d7e3 100644 --- a/llvm/lib/Target/Mips/MipsFastISel.cpp +++ b/llvm/lib/Target/Mips/MipsFastISel.cpp @@ -266,17 +266,19 @@ public: static bool CC_Mips(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &State) LLVM_ATTRIBUTE_UNUSED; + Type *OrigTy, CCState &State) LLVM_ATTRIBUTE_UNUSED; static bool CC_MipsO32_FP32(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State) { + ISD::ArgFlagsTy ArgFlags, Type *OrigTy, + CCState &State) { llvm_unreachable("should not be called"); } static bool CC_MipsO32_FP64(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State) { + ISD::ArgFlagsTy ArgFlags, Type *OrigTy, + CCState &State) { llvm_unreachable("should not be called"); } @@ -1144,8 +1146,12 @@ bool MipsFastISel::processCallArgs(CallLoweringInfo &CLI, unsigned &NumBytes) { CallingConv::ID CC = CLI.CallConv; SmallVector<CCValAssign, 16> ArgLocs; + SmallVector<Type *, 16> ArgTys; + for (const ArgListEntry &Arg : CLI.Args) + ArgTys.push_back(Arg.Val->getType()); CCState CCInfo(CC, false, *FuncInfo.MF, ArgLocs, *Context); - CCInfo.AnalyzeCallOperands(OutVTs, CLI.OutFlags, CCAssignFnForCall(CC)); + CCInfo.AnalyzeCallOperands(OutVTs, CLI.OutFlags, ArgTys, + CCAssignFnForCall(CC)); // Get a count of how many bytes are to be pushed on the stack. NumBytes = CCInfo.getStackSize(); // This is the minimum argument area used for A0-A3. @@ -1287,9 +1293,7 @@ bool MipsFastISel::finishCall(CallLoweringInfo &CLI, MVT RetVT, SmallVector<CCValAssign, 16> RVLocs; MipsCCState CCInfo(CC, false, *FuncInfo.MF, RVLocs, *Context); - CCInfo.AnalyzeCallResult(CLI.Ins, RetCC_Mips, CLI.RetTy, - CLI.Symbol ? CLI.Symbol->getName().data() - : nullptr); + CCInfo.AnalyzeCallResult(CLI.Ins, RetCC_Mips); // Only handle a single return value. 
if (RVLocs.size() != 1) diff --git a/llvm/lib/Target/Mips/MipsISelLowering.cpp b/llvm/lib/Target/Mips/MipsISelLowering.cpp index 881ba8e..1491300 100644 --- a/llvm/lib/Target/Mips/MipsISelLowering.cpp +++ b/llvm/lib/Target/Mips/MipsISelLowering.cpp @@ -2325,10 +2325,7 @@ lowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const SDValue TlsGetAddr = DAG.getExternalSymbol("__tls_get_addr", PtrVT); ArgListTy Args; - ArgListEntry Entry; - Entry.Node = Argument; - Entry.Ty = PtrTy; - Args.push_back(Entry); + Args.emplace_back(Argument, PtrTy); TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(DL) @@ -3040,14 +3037,13 @@ SDValue MipsTargetLowering::lowerFP_TO_SINT(SDValue Op, static bool CC_MipsO32(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &State, ArrayRef<MCPhysReg> F64Regs) { + Type *OrigTy, CCState &State, + ArrayRef<MCPhysReg> F64Regs) { const MipsSubtarget &Subtarget = static_cast<const MipsSubtarget &>( State.getMachineFunction().getSubtarget()); static const MCPhysReg IntRegs[] = { Mips::A0, Mips::A1, Mips::A2, Mips::A3 }; - const MipsCCState * MipsState = static_cast<MipsCCState *>(&State); - static const MCPhysReg F32Regs[] = { Mips::F12, Mips::F14 }; static const MCPhysReg FloatVectorIntRegs[] = { Mips::A0, Mips::A2 }; @@ -3089,7 +3085,7 @@ static bool CC_MipsO32(unsigned ValNo, MVT ValVT, MVT LocVT, State.getFirstUnallocated(F32Regs) != ValNo; Align OrigAlign = ArgFlags.getNonZeroOrigAlign(); bool isI64 = (ValVT == MVT::i32 && OrigAlign == Align(8)); - bool isVectorFloat = MipsState->WasOriginalArgVectorFloat(ValNo); + bool isVectorFloat = OrigTy->isVectorTy() && OrigTy->isFPOrFPVectorTy(); // The MIPS vector ABI for floats passes them in a pair of registers if (ValVT == MVT::i32 && isVectorFloat) { @@ -3160,25 +3156,29 @@ static bool CC_MipsO32(unsigned ValNo, MVT ValVT, MVT LocVT, return false; } -static bool CC_MipsO32_FP32(unsigned ValNo, MVT ValVT, - MVT LocVT, CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State) { +static bool CC_MipsO32_FP32(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, Type *OrigTy, + CCState &State) { static const MCPhysReg F64Regs[] = { Mips::D6, Mips::D7 }; - return CC_MipsO32(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State, F64Regs); + return CC_MipsO32(ValNo, ValVT, LocVT, LocInfo, ArgFlags, OrigTy, State, + F64Regs); } -static bool CC_MipsO32_FP64(unsigned ValNo, MVT ValVT, - MVT LocVT, CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State) { +static bool CC_MipsO32_FP64(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, Type *OrigTy, + CCState &State) { static const MCPhysReg F64Regs[] = { Mips::D12_64, Mips::D14_64 }; - return CC_MipsO32(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State, F64Regs); + return CC_MipsO32(ValNo, ValVT, LocVT, LocInfo, ArgFlags, OrigTy, State, + F64Regs); } static bool CC_MipsO32(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &State) LLVM_ATTRIBUTE_UNUSED; + Type *OrigTy, CCState &State) LLVM_ATTRIBUTE_UNUSED; #include "MipsGenCallingConv.inc" @@ -3392,8 +3392,7 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, MemcpyInByVal ? 0 : ABI.GetCalleeAllocdArgSizeInBytes(CallConv); CCInfo.AllocateStack(ReservedArgArea, Align(1)); - CCInfo.AnalyzeCallOperands(Outs, CC_Mips, CLI.getArgs(), - ES ? 
ES->getSymbol() : nullptr); + CCInfo.AnalyzeCallOperands(Outs, CC_Mips); // Get a count of how many bytes are to be pushed on the stack. unsigned StackSize = CCInfo.getStackSize(); @@ -3688,10 +3687,7 @@ SDValue MipsTargetLowering::LowerCallResult( MipsCCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs, *DAG.getContext()); - const ExternalSymbolSDNode *ES = - dyn_cast_or_null<const ExternalSymbolSDNode>(CLI.Callee.getNode()); - CCInfo.AnalyzeCallResult(Ins, RetCC_Mips, CLI.RetTy, - ES ? ES->getSymbol() : nullptr); + CCInfo.AnalyzeCallResult(Ins, RetCC_Mips); // Copy all of the result registers out of their specified physreg. for (unsigned i = 0; i != RVLocs.size(); ++i) { @@ -3969,7 +3965,7 @@ MipsTargetLowering::CanLowerReturn(CallingConv::ID CallConv, LLVMContext &Context, const Type *RetTy) const { SmallVector<CCValAssign, 16> RVLocs; MipsCCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context); - return CCInfo.CheckCallReturn(Outs, RetCC_Mips, RetTy); + return CCInfo.CheckReturn(Outs, RetCC_Mips); } bool MipsTargetLowering::shouldSignExtendTypeInLibCall(Type *Ty, @@ -4408,7 +4404,7 @@ void MipsTargetLowering::LowerAsmOperandForConstraint(SDValue Op, case 'K': // unsigned 16 bit immediate if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { EVT Type = Op.getValueType(); - uint64_t Val = (uint64_t)C->getZExtValue(); + uint64_t Val = C->getZExtValue(); if (isUInt<16>(Val)) { Result = DAG.getTargetConstant(Val, DL, Type); break; diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp index 38912a7..0c581dcc 100644 --- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -1458,7 +1458,6 @@ void NVPTXAsmPrinter::setAndEmitFunctionVirtualRegisters( // Map the global virtual register number to a register class specific // virtual register number starting from 1 with that class. const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); - //unsigned numRegClasses = TRI->getNumRegClasses(); // Emit the Fake Stack Object const MachineFrameInfo &MFI = MF.getFrameInfo(); @@ -1479,13 +1478,12 @@ void NVPTXAsmPrinter::setAndEmitFunctionVirtualRegisters( // global virtual // register number and the per class virtual register number. // We use the per class virtual register number in the ptx output. 
- unsigned int numVRs = MRI->getNumVirtRegs(); - for (unsigned i = 0; i < numVRs; i++) { - Register vr = Register::index2VirtReg(i); - const TargetRegisterClass *RC = MRI->getRegClass(vr); - DenseMap<unsigned, unsigned> ®map = VRegMapping[RC]; - int n = regmap.size(); - regmap.insert(std::make_pair(vr, n + 1)); + for (unsigned I : llvm::seq(MRI->getNumVirtRegs())) { + Register VR = Register::index2VirtReg(I); + if (MRI->use_empty(VR) && MRI->def_empty(VR)) + continue; + auto &RCRegMap = VRegMapping[MRI->getRegClass(VR)]; + RCRegMap[VR] = RCRegMap.size() + 1; } // Emit declaration of the virtual registers or 'physical' registers for diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 6068035..520ce4d 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -70,7 +70,7 @@ NVPTXDAGToDAGISel::getDivF32Level(const SDNode *N) const { } bool NVPTXDAGToDAGISel::usePrecSqrtF32(const SDNode *N) const { - return Subtarget->getTargetLowering()->usePrecSqrtF32(*MF, N); + return Subtarget->getTargetLowering()->usePrecSqrtF32(N); } bool NVPTXDAGToDAGISel::useF32FTZ() const { @@ -82,11 +82,6 @@ bool NVPTXDAGToDAGISel::allowFMA() const { return TL->allowFMA(*MF, OptLevel); } -bool NVPTXDAGToDAGISel::allowUnsafeFPMath() const { - const NVPTXTargetLowering *TL = Subtarget->getTargetLowering(); - return TL->allowUnsafeFPMath(*MF); -} - bool NVPTXDAGToDAGISel::doRsqrtOpt() const { return EnableRsqrtOpt; } /// Select - Select instructions not customized! Used for @@ -1027,6 +1022,72 @@ pickOpcodeForVT(MVT::SimpleValueType VT, std::optional<unsigned> Opcode_i16, } } +static inline bool isAddLike(const SDValue V) { + return V.getOpcode() == ISD::ADD || + (V->getOpcode() == ISD::OR && V->getFlags().hasDisjoint()); +} + +static SDValue stripAssertAlign(SDValue N) { + if (N.getOpcode() == ISD::AssertAlign) + N = N.getOperand(0); + return N; +} + +// selectBaseADDR - Match a dag node which will serve as the base address for an +// ADDR operand pair. 
+static SDValue selectBaseADDR(SDValue N, SelectionDAG *DAG) { + N = stripAssertAlign(N); + if (const auto *GA = dyn_cast<GlobalAddressSDNode>(N)) + return DAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(N), + GA->getValueType(0), GA->getOffset(), + GA->getTargetFlags()); + if (const auto *ES = dyn_cast<ExternalSymbolSDNode>(N)) + return DAG->getTargetExternalSymbol(ES->getSymbol(), ES->getValueType(0), + ES->getTargetFlags()); + if (const auto *FIN = dyn_cast<FrameIndexSDNode>(N)) + return DAG->getTargetFrameIndex(FIN->getIndex(), FIN->getValueType(0)); + + return N; +} + +static SDValue accumulateOffset(SDValue &Addr, SDLoc DL, SelectionDAG *DAG) { + Addr = stripAssertAlign(Addr); + APInt AccumulatedOffset(64u, 0); + while (isAddLike(Addr)) { + const auto *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1)); + if (!CN) + break; + + const APInt CI = CN->getAPIntValue().sext(64); + if (!(CI + AccumulatedOffset).isSignedIntN(32)) + break; + + AccumulatedOffset += CI; + Addr = stripAssertAlign(Addr->getOperand(0)); + } + return DAG->getSignedTargetConstant(AccumulatedOffset.getSExtValue(), DL, + MVT::i32); +} + +static std::pair<SDValue, SDValue> selectADDR(SDValue Addr, SelectionDAG *DAG) { + SDValue Offset = accumulateOffset(Addr, SDLoc(Addr), DAG); + SDValue Base = selectBaseADDR(Addr, DAG); + return {Base, Offset}; +} + +// Select a pair of operands which represent a valid PTX address, this could be +// one of the following things: +// - [var] - Offset is simply set to 0 +// - [reg] - Offset is simply set to 0 +// - [reg+immOff] +// - [var+immOff] +// Note that immOff must fit into a 32-bit signed integer. +bool NVPTXDAGToDAGISel::SelectADDR(SDValue Addr, SDValue &Base, + SDValue &Offset) { + std::tie(Base, Offset) = selectADDR(Addr, CurDAG); + return true; +} + bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { MemSDNode *LD = cast<MemSDNode>(N); assert(LD->readMem() && "Expected load"); @@ -1062,8 +1123,7 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { FromTypeWidth <= 128 && "Invalid width for load"); // Create the machine instruction DAG - SDValue Offset, Base; - SelectADDR(N->getOperand(1), Base, Offset); + const auto [Base, Offset] = selectADDR(N->getOperand(1), CurDAG); SDValue Ops[] = {getI32Imm(Ordering, DL), getI32Imm(Scope, DL), getI32Imm(CodeAddrSpace, DL), @@ -1144,8 +1204,7 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { assert(isPowerOf2_32(FromTypeWidth) && FromTypeWidth >= 8 && FromTypeWidth <= 128 && TotalWidth <= 256 && "Invalid width for load"); - SDValue Offset, Base; - SelectADDR(N->getOperand(1), Base, Offset); + const auto [Base, Offset] = selectADDR(N->getOperand(1), CurDAG); SDValue Ops[] = {getI32Imm(Ordering, DL), getI32Imm(Scope, DL), getI32Imm(CodeAddrSpace, DL), @@ -1213,8 +1272,7 @@ bool NVPTXDAGToDAGISel::tryLDG(MemSDNode *LD) { assert(isPowerOf2_32(FromTypeWidth) && FromTypeWidth >= 8 && FromTypeWidth <= 128 && TotalWidth <= 256 && "Invalid width for load"); - SDValue Base, Offset; - SelectADDR(LD->getOperand(1), Base, Offset); + const auto [Base, Offset] = selectADDR(LD->getOperand(1), CurDAG); SDValue Ops[] = {getI32Imm(FromType, DL), getI32Imm(FromTypeWidth, DL), Base, Offset, LD->getChain()}; @@ -1278,8 +1336,7 @@ bool NVPTXDAGToDAGISel::tryLDU(SDNode *N) { SDValue Addr = LD->getOperand(LD->getOpcode() == ISD::INTRINSIC_W_CHAIN ? 
2 : 1); - SDValue Base, Offset; - SelectADDR(Addr, Base, Offset); + const auto [Base, Offset] = selectADDR(Addr, CurDAG); SDValue Ops[] = {getI32Imm(FromTypeWidth, DL), Base, Offset, LD->getChain()}; std::optional<unsigned> Opcode; @@ -1339,9 +1396,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { assert(isPowerOf2_32(ToTypeWidth) && ToTypeWidth >= 8 && ToTypeWidth <= 128 && "Invalid width for store"); - SDValue Offset, Base; - SelectADDR(ST->getBasePtr(), Base, Offset); - + const auto [Base, Offset] = selectADDR(ST->getBasePtr(), CurDAG); SDValue Ops[] = {selectPossiblyImm(Value), getI32Imm(Ordering, DL), getI32Imm(Scope, DL), @@ -1399,9 +1454,7 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { assert(isPowerOf2_32(ToTypeWidth) && ToTypeWidth >= 8 && ToTypeWidth <= 128 && TotalWidth <= 256 && "Invalid width for store"); - SDValue Offset, Base; - SelectADDR(Addr, Base, Offset); - + const auto [Base, Offset] = selectADDR(Addr, CurDAG); Ops.append({getI32Imm(Ordering, DL), getI32Imm(Scope, DL), getI32Imm(CodeAddrSpace, DL), getI32Imm(ToTypeWidth, DL), Base, Offset, Chain}); @@ -1708,59 +1761,6 @@ bool NVPTXDAGToDAGISel::tryBF16ArithToFMA(SDNode *N) { return true; } -static inline bool isAddLike(const SDValue V) { - return V.getOpcode() == ISD::ADD || - (V->getOpcode() == ISD::OR && V->getFlags().hasDisjoint()); -} - -// selectBaseADDR - Match a dag node which will serve as the base address for an -// ADDR operand pair. -static SDValue selectBaseADDR(SDValue N, SelectionDAG *DAG) { - if (const auto *GA = dyn_cast<GlobalAddressSDNode>(N)) - return DAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(N), - GA->getValueType(0), GA->getOffset(), - GA->getTargetFlags()); - if (const auto *ES = dyn_cast<ExternalSymbolSDNode>(N)) - return DAG->getTargetExternalSymbol(ES->getSymbol(), ES->getValueType(0), - ES->getTargetFlags()); - if (const auto *FIN = dyn_cast<FrameIndexSDNode>(N)) - return DAG->getTargetFrameIndex(FIN->getIndex(), FIN->getValueType(0)); - - return N; -} - -static SDValue accumulateOffset(SDValue &Addr, SDLoc DL, SelectionDAG *DAG) { - APInt AccumulatedOffset(64u, 0); - while (isAddLike(Addr)) { - const auto *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1)); - if (!CN) - break; - - const APInt CI = CN->getAPIntValue().sext(64); - if (!(CI + AccumulatedOffset).isSignedIntN(32)) - break; - - AccumulatedOffset += CI; - Addr = Addr->getOperand(0); - } - return DAG->getSignedTargetConstant(AccumulatedOffset.getSExtValue(), DL, - MVT::i32); -} - -// Select a pair of operands which represent a valid PTX address, this could be -// one of the following things: -// - [var] - Offset is simply set to 0 -// - [reg] - Offset is simply set to 0 -// - [reg+immOff] -// - [var+immOff] -// Note that immOff must fit into a 32-bit signed integer. 
-bool NVPTXDAGToDAGISel::SelectADDR(SDValue Addr, SDValue &Base, - SDValue &Offset) { - Offset = accumulateOffset(Addr, SDLoc(Addr), CurDAG); - Base = selectBaseADDR(Addr, CurDAG); - return true; -} - SDValue NVPTXDAGToDAGISel::selectPossiblyImm(SDValue V) { if (V.getOpcode() == ISD::BITCAST) V = V.getOperand(0); @@ -1774,37 +1774,20 @@ SDValue NVPTXDAGToDAGISel::selectPossiblyImm(SDValue V) { return V; } -bool NVPTXDAGToDAGISel::ChkMemSDNodeAddressSpace(SDNode *N, - unsigned int spN) const { - const Value *Src = nullptr; - if (MemSDNode *mN = dyn_cast<MemSDNode>(N)) { - if (spN == 0 && mN->getMemOperand()->getPseudoValue()) - return true; - Src = mN->getMemOperand()->getValue(); - } - if (!Src) - return false; - if (auto *PT = dyn_cast<PointerType>(Src->getType())) - return (PT->getAddressSpace() == spN); - return false; -} - /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for /// inline asm expressions. bool NVPTXDAGToDAGISel::SelectInlineAsmMemoryOperand( const SDValue &Op, InlineAsm::ConstraintCode ConstraintID, std::vector<SDValue> &OutOps) { - SDValue Op0, Op1; switch (ConstraintID) { default: return true; - case InlineAsm::ConstraintCode::m: // memory - if (SelectADDR(Op, Op0, Op1)) { - OutOps.push_back(Op0); - OutOps.push_back(Op1); - return false; - } - break; + case InlineAsm::ConstraintCode::m: { // memory + const auto [Base, Offset] = selectADDR(Op, CurDAG); + OutOps.push_back(Base); + OutOps.push_back(Offset); + return false; + } } return true; } diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h index 9e0f88e5..6573172 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h @@ -44,7 +44,6 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel { bool usePrecSqrtF32(const SDNode *N) const; bool useF32FTZ() const; bool allowFMA() const; - bool allowUnsafeFPMath() const; bool doRsqrtOpt() const; NVPTXScopes Scopes{}; @@ -102,8 +101,6 @@ private: SDValue getPTXCmpMode(const CondCodeSDNode &CondCode); SDValue selectPossiblyImm(SDValue V); - bool ChkMemSDNodeAddressSpace(SDNode *N, unsigned int spN) const; - // Returns the Memory Order and Scope that the PTX memory instruction should // use, and inserts appropriate fence instruction before the memory // instruction, if needed to implement the instructions memory order. 
Required diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index d4f0cc9..74e6c13 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -28,7 +28,6 @@ #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/CodeGen/MachineMemOperand.h" -#include "llvm/CodeGen/Register.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/TargetCallingConv.h" @@ -125,10 +124,6 @@ NVPTXTargetLowering::getDivF32Level(const MachineFunction &MF, if (UsePrecDivF32.getNumOccurrences() > 0) return UsePrecDivF32; - // Otherwise, use div.approx if fast math is enabled - if (allowUnsafeFPMath(MF)) - return NVPTX::DivPrecisionLevel::Approx; - const SDNodeFlags Flags = N.getFlags(); if (Flags.hasApproximateFuncs()) return NVPTX::DivPrecisionLevel::Approx; @@ -136,16 +131,11 @@ NVPTXTargetLowering::getDivF32Level(const MachineFunction &MF, return NVPTX::DivPrecisionLevel::IEEE754; } -bool NVPTXTargetLowering::usePrecSqrtF32(const MachineFunction &MF, - const SDNode *N) const { +bool NVPTXTargetLowering::usePrecSqrtF32(const SDNode *N) const { // If nvptx-prec-sqrtf32 is used on the command-line, always honor it if (UsePrecSqrtF32.getNumOccurrences() > 0) return UsePrecSqrtF32; - // Otherwise, use sqrt.approx if fast math is enabled - if (allowUnsafeFPMath(MF)) - return false; - if (N) { const SDNodeFlags Flags = N->getFlags(); if (Flags.hasApproximateFuncs()) @@ -680,6 +670,8 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, // No support for these operations with v2f32. setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Expand); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f32, Expand); + // Need custom lowering in case the index is dynamic. + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom); // Custom conversions to/from v2i8. setOperationAction(ISD::BITCAST, MVT::v2i8, Custom); @@ -1191,8 +1183,7 @@ SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, bool &UseOneConst, bool Reciprocal) const { if (!(Enabled == ReciprocalEstimate::Enabled || - (Enabled == ReciprocalEstimate::Unspecified && - !usePrecSqrtF32(DAG.getMachineFunction())))) + (Enabled == ReciprocalEstimate::Unspecified && !usePrecSqrtF32()))) return SDValue(); if (ExtraSteps == ReciprocalEstimate::Unspecified) @@ -2849,8 +2840,7 @@ static SDValue lowerROT(SDValue Op, SelectionDAG &DAG) { SDLoc(Op), Opcode, DAG); } -static SDValue lowerFREM(SDValue Op, SelectionDAG &DAG, - bool AllowUnsafeFPMath) { +static SDValue lowerFREM(SDValue Op, SelectionDAG &DAG) { // Lower (frem x, y) into (sub x, (mul (ftrunc (div x, y)) y)), // i.e. "poor man's fmod()". When y is infinite, x is returned. This matches // the semantics of LLVM's frem. 
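For reference, a standalone C++ sketch (not LLVM code) of the "poor man's fmod" expansion described in the comment above: frem(x, y) is lowered to x - trunc(x / y) * y, plus the guard that returns x when y is infinite, which the bare expansion would get wrong and which the patch now elides only when the no-infs flag is present.

#include <cmath>
#include <cstdio>
#include <limits>

static double poorMansFRem(double X, double Y) {
  double Div = X / Y;
  double Trunc = std::trunc(Div);
  double Sub = X - Trunc * Y;
  // Without no-infs information the expansion alone is wrong for Y == +/-inf:
  // trunc(X / inf) * inf evaluates to 0 * inf == NaN, while frem returns X.
  return std::isinf(Y) ? X : Sub;
}

int main() {
  std::printf("%g\n", poorMansFRem(7.5, 2.0));  // prints 1.5
  std::printf("%g\n",
              poorMansFRem(7.5, std::numeric_limits<double>::infinity()));  // prints 7.5
  return 0;
}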
@@ -2867,7 +2857,7 @@ static SDValue lowerFREM(SDValue Op, SelectionDAG &DAG, SDValue Sub = DAG.getNode(ISD::FSUB, DL, Ty, X, Mul, Flags | SDNodeFlags::AllowContract); - if (AllowUnsafeFPMath || Flags.hasNoInfs()) + if (Flags.hasNoInfs()) return Sub; // If Y is infinite, return X @@ -3012,7 +3002,7 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::CTLZ: return lowerCTLZCTPOP(Op, DAG); case ISD::FREM: - return lowerFREM(Op, DAG, allowUnsafeFPMath(DAG.getMachineFunction())); + return lowerFREM(Op, DAG); default: llvm_unreachable("Custom lowering not defined for operation"); @@ -4866,17 +4856,7 @@ bool NVPTXTargetLowering::allowFMA(MachineFunction &MF, if (MF.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast) return true; - return allowUnsafeFPMath(MF); -} - -bool NVPTXTargetLowering::allowUnsafeFPMath(const MachineFunction &MF) const { - // Honor TargetOptions flags that explicitly say unsafe math is okay. - if (MF.getTarget().Options.UnsafeFPMath) - return true; - - // Allow unsafe math if unsafe-fp-math attribute explicitly says so. - const Function &F = MF.getFunction(); - return F.getFnAttribute("unsafe-fp-math").getValueAsBool(); + return false; } static bool isConstZero(const SDValue &Operand) { diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h index 43e721a..27f099e 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h @@ -206,8 +206,7 @@ public: // Get whether we should use a precise or approximate 32-bit floating point // sqrt instruction. - bool usePrecSqrtF32(const MachineFunction &MF, - const SDNode *N = nullptr) const; + bool usePrecSqrtF32(const SDNode *N = nullptr) const; // Get whether we should use instructions that flush floating-point denormals // to sign-preserving zero. @@ -220,7 +219,6 @@ public: unsigned combineRepeatedFPDivisors() const override { return 2; } bool allowFMA(MachineFunction &MF, CodeGenOptLevel OptLevel) const; - bool allowUnsafeFPMath(const MachineFunction &MF) const; bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT) const override { diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 1ab41bf..7b13509 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -148,13 +148,16 @@ class OneUse2<SDPatternOperator operator> : PatFrag<(ops node:$A, node:$B), (operator node:$A, node:$B), [{ return N->hasOneUse(); }]>; -class fpimm_pos_inf<ValueType vt> - : FPImmLeaf<vt, [{ return Imm.isPosInfinity(); }]>; - class zeroinitializer<ValueType vt> : PatLeaf<(vt (bitconvert (!cast<ValueType>("i" # vt.Size) 0)))>; +def fpimm_pos_inf : FPImmLeaf<fAny, [{ return Imm.isPosInfinity(); }]>; +def fpimm_0 : FPImmLeaf<fAny, [{ return Imm.isZero(); }]>; +def fpimm_1 : FPImmLeaf<fAny, [{ return Imm.isExactlyValue(1.0); }]>; +def fpimm_neg_1 : FPImmLeaf<fAny, [{ return Imm.isExactlyValue(-1.0); }]>; + + // Operands which can hold a Register or an Immediate. // // Unfortunately, since most register classes can hold multiple types, we must @@ -268,7 +271,7 @@ multiclass I3Inst<string op_str, SDPatternOperator op_node, RegTyInfo t, // The instructions are named "<OpcStr><Width>" (e.g. "add.s64"). 
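The fpimm_0 / fpimm_1 / fpimm_neg_1 / fpimm_pos_inf leaves added to NVPTXInstrInfo.td above are defined over fAny because the APFloat predicates they use are independent of the float semantics, which is what lets them replace the per-type f32imm_1 / f64imm_1 / f64imm_neg1 leaves removed further down. A small stand-alone check of that property (assumes an LLVM tree to compile against; not part of the patch):

    #include "llvm/ADT/APFloat.h"

    #include <iostream>

    int main() {
      // The same predicate answers for IEEEsingle and IEEEdouble values, so one
      // fAny-based FPImmLeaf can stand in for the removed per-type leaves.
      llvm::APFloat SingleOne(1.0f); // IEEEsingle
      llvm::APFloat DoubleOne(1.0);  // IEEEdouble

      std::cout << std::boolalpha
                << SingleOne.isExactlyValue(1.0) << ' '    // true
                << DoubleOne.isExactlyValue(1.0) << ' '    // true
                << llvm::APFloat(-0.0f).isZero() << '\n';  // true
      return 0;
    }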
multiclass I3<string op_str, SDPatternOperator op_node, bit commutative> { foreach t = [I16RT, I32RT, I64RT] in - defm t.Ty# : I3Inst<op_str # t.Size, op_node, t, commutative>; + defm t.Size# : I3Inst<op_str # t.Size, op_node, t, commutative>; } class I16x2<string OpcStr, SDNode OpNode> : @@ -761,10 +764,10 @@ def fabs_oneuse : OneUse1<fabs>; def TESTINF_f32r : BasicNVPTXInst<(outs B1:$p), (ins B32:$a), "testp.infinite.f32", - [(set i1:$p, (seteq (fabs_oneuse f32:$a), fpimm_pos_inf<f32>))]>; + [(set i1:$p, (seteq (fabs_oneuse f32:$a), fpimm_pos_inf))]>; def TESTINF_f64r : BasicNVPTXInst<(outs B1:$p), (ins B64:$a), "testp.infinite.f64", - [(set i1:$p, (seteq (fabs_oneuse f64:$a), fpimm_pos_inf<f64>))]>; + [(set i1:$p, (seteq (fabs_oneuse f64:$a), fpimm_pos_inf))]>; //----------------------------------- // Integer Arithmetic @@ -787,8 +790,8 @@ defm SUBCCC : ADD_SUB_INT_CARRY<"subc.cc", sube, commutative = false>; defm MULT : I3<"mul.lo.s", mul, commutative = true>; -defm MULTHS : I3<"mul.hi.s", mulhs, commutative = true>; -defm MULTHU : I3<"mul.hi.u", mulhu, commutative = true>; +defm MUL_HI_S : I3<"mul.hi.s", mulhs, commutative = true>; +defm MUL_HI_U : I3<"mul.hi.u", mulhu, commutative = true>; defm SDIV : I3<"div.s", sdiv, commutative = false>; defm UDIV : I3<"div.u", udiv, commutative = false>; @@ -905,22 +908,6 @@ let Predicates = [hasOptEnabled] in { // Floating Point Arithmetic //----------------------------------- -// Constant 1.0f -def f32imm_1 : FPImmLeaf<f32, [{ - return &Imm.getSemantics() == &llvm::APFloat::IEEEsingle() && - Imm.convertToFloat() == 1.0f; -}]>; -// Constant 1.0 (double) -def f64imm_1 : FPImmLeaf<f64, [{ - return &Imm.getSemantics() == &llvm::APFloat::IEEEdouble() && - Imm.convertToDouble() == 1.0; -}]>; -// Constant -1.0 (double) -def f64imm_neg1 : FPImmLeaf<f64, [{ - return &Imm.getSemantics() == &llvm::APFloat::IEEEdouble() && - Imm.convertToDouble() == -1.0; -}]>; - defm FADD : F3_fma_component<"add", fadd>; defm FSUB : F3_fma_component<"sub", fsub>; defm FMUL : F3_fma_component<"mul", fmul>; @@ -994,7 +981,7 @@ def FRCP64r : BasicNVPTXInst<(outs B64:$dst), (ins B64:$b), "rcp.rn.f64", - [(set f64:$dst, (fdiv f64imm_1, f64:$b))]>; + [(set f64:$dst, (fdiv fpimm_1, f64:$b))]>; def FDIV64rr : BasicNVPTXInst<(outs B64:$dst), (ins B64:$a, B64:$b), @@ -1008,7 +995,7 @@ def FDIV64ri : // fdiv will be converted to rcp // fneg (fdiv 1.0, X) => fneg (rcp.rn X) -def : Pat<(fdiv f64imm_neg1, f64:$b), +def : Pat<(fdiv fpimm_neg_1, f64:$b), (FNEGf64 (FRCP64r $b))>; // @@ -1021,21 +1008,21 @@ def fdiv_approx : PatFrag<(ops node:$a, node:$b), }]>; -def FRCP32_approx_r : +def RCP_APPROX_F32_r : BasicFlagsNVPTXInst<(outs B32:$dst), (ins B32:$b), (ins FTZFlag:$ftz), "rcp.approx$ftz.f32", - [(set f32:$dst, (fdiv_approx f32imm_1, f32:$b))]>; + [(set f32:$dst, (fdiv_approx fpimm_1, f32:$b))]>; // // F32 Approximate division // -def FDIV32_approx_rr : +def DIV_APPROX_F32_rr : BasicFlagsNVPTXInst<(outs B32:$dst), (ins B32:$a, B32:$b), (ins FTZFlag:$ftz), "div.approx$ftz.f32", [(set f32:$dst, (fdiv_approx f32:$a, f32:$b))]>; -def FDIV32_approx_ri : +def DIV_APPROX_F32_ri : BasicFlagsNVPTXInst<(outs B32:$dst), (ins B32:$a, f32imm:$b), (ins FTZFlag:$ftz), "div.approx$ftz.f32", @@ -1052,8 +1039,8 @@ def fdiv_full : PatFrag<(ops node:$a, node:$b), }]>; -def : Pat<(fdiv_full f32imm_1, f32:$b), - (FRCP32_approx_r $b)>; +def : Pat<(fdiv_full fpimm_1, f32:$b), + (RCP_APPROX_F32_r $b)>; // // F32 Semi-accurate division @@ -1081,7 +1068,7 @@ def FRCP32r_prec : BasicFlagsNVPTXInst<(outs 
B32:$dst), (ins B32:$b), (ins FTZFlag:$ftz), "rcp.rn$ftz.f32", - [(set f32:$dst, (fdiv_ftz f32imm_1, f32:$b))]>; + [(set f32:$dst, (fdiv_ftz fpimm_1, f32:$b))]>; // // F32 Accurate division // @@ -1096,7 +1083,7 @@ def FDIV32ri_prec : "div.rn$ftz.f32", [(set f32:$dst, (fdiv_ftz f32:$a, fpimm:$b))]>; -def : Pat<(fdiv f32imm_1, f32:$b), (FRCP32r_prec $b, NoFTZ)>; +def : Pat<(fdiv fpimm_1, f32:$b), (FRCP32r_prec $b, NoFTZ)>; def : Pat<(fdiv f32:$a, f32:$b), (FDIV32rr_prec $a, $b, NoFTZ)>; def : Pat<(fdiv f32:$a, fpimm:$b), (FDIV32ri_prec $a, fpimm:$b, NoFTZ)>; @@ -1146,9 +1133,8 @@ defm FMA_F64 : FMA<F64RT, allow_ftz = false>; // sin/cos/tanh class UnaryOpAllowsApproxFn<SDPatternOperator operator> - : PatFrag<(ops node:$A), - (operator node:$A), [{ - return allowUnsafeFPMath() || N->getFlags().hasApproximateFuncs(); + : PatFrag<(ops node:$A), (operator node:$A), [{ + return N->getFlags().hasApproximateFuncs(); }]>; def SIN_APPROX_f32 : @@ -1519,23 +1505,28 @@ def MmaCode : Operand<i32> { // Get pointer to local stack. let hasSideEffects = false in { def MOV_DEPOT_ADDR : NVPTXInst<(outs B32:$d), (ins i32imm:$num), - "mov.b32 \t$d, __local_depot$num;", []>; + "mov.b32 \t$d, __local_depot$num;">; def MOV_DEPOT_ADDR_64 : NVPTXInst<(outs B64:$d), (ins i32imm:$num), - "mov.b64 \t$d, __local_depot$num;", []>; + "mov.b64 \t$d, __local_depot$num;">; } - -// copyPhysreg is hard-coded in NVPTXInstrInfo.cpp -let hasSideEffects = false, isAsCheapAsAMove = true in { - let isMoveReg = true in +let hasSideEffects = false in { + let isMoveReg = true, isAsCheapAsAMove = true in class MOVr<RegisterClass RC, string OpStr> : BasicNVPTXInst<(outs RC:$dst), (ins RC:$src), "mov." # OpStr>; - let isMoveImm = true in + let isMoveImm = true, isAsCheapAsAMove = true in class MOVi<RegTyInfo t, string suffix> : BasicNVPTXInst<(outs t.RC:$dst), (ins t.Imm:$src), "mov." # suffix, [(set t.Ty:$dst, t.ImmNode:$src)]>; + + // We don't want to set isAsCheapAsAMove to true for these instructions as + // this would prevent CSE and resulted in regressions (see discussion after + // PR-145581 in llvm-project). 
+ class MovSymInst<RegTyInfo t> : + BasicNVPTXInst<(outs t.RC:$dst), (ins Operand<t.Ty>:$src), + "mov.b" # t.Size>; } def MOV_B1_r : MOVr<B1, "pred">; @@ -1553,6 +1544,9 @@ def MOV_BF16_i : MOVi<BF16RT, "b16">; def MOV_F32_i : MOVi<F32RT, "b32">; def MOV_F64_i : MOVi<F64RT, "b64">; +def MOV_B32_sym : MovSymInst<I32RT>; +def MOV_B64_sym : MovSymInst<I64RT>; + def to_tglobaladdr : SDNodeXForm<globaladdr, [{ return CurDAG->getTargetGlobalAddress(N->getGlobal(), SDLoc(N), @@ -1569,17 +1563,17 @@ def to_tframeindex : SDNodeXForm<frameindex, [{ return CurDAG->getTargetFrameIndex(N->getIndex(), N->getValueType(0)); }]>; -def : Pat<(i32 globaladdr:$dst), (MOV_B32_i (to_tglobaladdr $dst))>; -def : Pat<(i64 globaladdr:$dst), (MOV_B64_i (to_tglobaladdr $dst))>; +def : Pat<(i32 globaladdr:$dst), (MOV_B32_sym (to_tglobaladdr $dst))>; +def : Pat<(i64 globaladdr:$dst), (MOV_B64_sym (to_tglobaladdr $dst))>; -def : Pat<(i32 externalsym:$dst), (MOV_B32_i (to_texternsym $dst))>; -def : Pat<(i64 externalsym:$dst), (MOV_B64_i (to_texternsym $dst))>; +def : Pat<(i32 externalsym:$dst), (MOV_B32_sym (to_texternsym $dst))>; +def : Pat<(i64 externalsym:$dst), (MOV_B64_sym (to_texternsym $dst))>; //---- Copy Frame Index ---- def LEA_ADDRi : NVPTXInst<(outs B32:$dst), (ins ADDR:$addr), - "add.u32 \t$dst, ${addr:add};", []>; + "add.u32 \t$dst, ${addr:add};">; def LEA_ADDRi64 : NVPTXInst<(outs B64:$dst), (ins ADDR:$addr), - "add.u64 \t$dst, ${addr:add};", []>; + "add.u64 \t$dst, ${addr:add};">; def : Pat<(i32 frameindex:$fi), (LEA_ADDRi (to_tframeindex $fi), 0)>; def : Pat<(i64 frameindex:$fi), (LEA_ADDRi64 (to_tframeindex $fi), 0)>; @@ -1644,12 +1638,12 @@ foreach is_convergent = [0, 1] in { NVPTXInst<(outs), (ins ADDR_base:$addr, CallOperand:$rets, CallOperand:$params, i32imm:$proto), - "call${rets:RetList} $addr, (${params:ParamList}), prototype_$proto;", []>; + "call${rets:RetList} $addr, (${params:ParamList}), prototype_$proto;">; def CALL_UNI # convergent_suffix : NVPTXInst<(outs), (ins ADDR_base:$addr, CallOperand:$rets, CallOperand:$params), - "call.uni${rets:RetList} $addr, (${params:ParamList});", []>; + "call.uni${rets:RetList} $addr, (${params:ParamList});">; } defvar call_inst = !cast<NVPTXInst>("CALL" # convergent_suffix); @@ -1665,10 +1659,10 @@ foreach is_convergent = [0, 1] in { def DECLARE_PARAM_array : NVPTXInst<(outs), (ins i32imm:$a, i32imm:$align, i32imm:$size), - ".param .align $align .b8 \t$a[$size];", []>; + ".param .align $align .b8 \t$a[$size];">; def DECLARE_PARAM_scalar : NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size), - ".param .b$size \t$a;", []>; + ".param .b$size \t$a;">; def : Pat<(declare_array_param externalsym:$a, imm:$align, imm:$size), (DECLARE_PARAM_array (to_texternsym $a), imm:$align, imm:$size)>; @@ -1741,7 +1735,7 @@ class LD<NVPTXRegClass regclass> (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp, AtomicCode:$Sign, i32imm:$fromWidth, ADDR:$addr), "ld${sem:sem}${scope:scope}${addsp:addsp}.${Sign:sign}$fromWidth " - "\t$dst, [$addr];", []>; + "\t$dst, [$addr];">; let mayLoad=1, hasSideEffects=0 in { def LD_i16 : LD<B16>; @@ -1756,7 +1750,7 @@ class ST<DAGOperand O> AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp, i32imm:$toWidth, ADDR:$addr), "st${sem:sem}${scope:scope}${addsp:addsp}.b$toWidth" - " \t[$addr], $src;", []>; + " \t[$addr], $src;">; let mayStore=1, hasSideEffects=0 in { def ST_i16 : ST<RI16>; @@ -1773,13 +1767,13 @@ multiclass LD_VEC<NVPTXRegClass regclass, bit support_v8 = false> { (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp, 
AtomicCode:$Sign, i32imm:$fromWidth, ADDR:$addr), "ld${sem:sem}${scope:scope}${addsp:addsp}.v2.${Sign:sign}$fromWidth " - "\t{{$dst1, $dst2}}, [$addr];", []>; + "\t{{$dst1, $dst2}}, [$addr];">; def _v4 : NVPTXInst< (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp, AtomicCode:$Sign, i32imm:$fromWidth, ADDR:$addr), "ld${sem:sem}${scope:scope}${addsp:addsp}.v4.${Sign:sign}$fromWidth " - "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>; + "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];">; if support_v8 then def _v8 : NVPTXInst< (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4, @@ -1788,7 +1782,7 @@ multiclass LD_VEC<NVPTXRegClass regclass, bit support_v8 = false> { i32imm:$fromWidth, ADDR:$addr), "ld${sem:sem}${scope:scope}${addsp:addsp}.v8.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2, $dst3, $dst4, $dst5, $dst6, $dst7, $dst8}}, " - "[$addr];", []>; + "[$addr];">; } let mayLoad=1, hasSideEffects=0 in { defm LDV_i16 : LD_VEC<B16>; @@ -1803,14 +1797,14 @@ multiclass ST_VEC<DAGOperand O, bit support_v8 = false> { AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp, i32imm:$fromWidth, ADDR:$addr), "st${sem:sem}${scope:scope}${addsp:addsp}.v2.b$fromWidth " - "\t[$addr], {{$src1, $src2}};", []>; + "\t[$addr], {{$src1, $src2}};">; def _v4 : NVPTXInst< (outs), (ins O:$src1, O:$src2, O:$src3, O:$src4, AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp, i32imm:$fromWidth, ADDR:$addr), "st${sem:sem}${scope:scope}${addsp:addsp}.v4.b$fromWidth " - "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>; + "\t[$addr], {{$src1, $src2, $src3, $src4}};">; if support_v8 then def _v8 : NVPTXInst< (outs), @@ -1820,7 +1814,7 @@ multiclass ST_VEC<DAGOperand O, bit support_v8 = false> { ADDR:$addr), "st${sem:sem}${scope:scope}${addsp:addsp}.v8.b$fromWidth " "\t[$addr], " - "{{$src1, $src2, $src3, $src4, $src5, $src6, $src7, $src8}};", []>; + "{{$src1, $src2, $src3, $src4, $src5, $src6, $src7, $src8}};">; } let mayStore=1, hasSideEffects=0 in { @@ -2015,60 +2009,52 @@ let hasSideEffects = false in { def V4I16toI64 : NVPTXInst<(outs B64:$d), (ins B16:$s1, B16:$s2, B16:$s3, B16:$s4), - "mov.b64 \t$d, {{$s1, $s2, $s3, $s4}};", []>; + "mov.b64 \t$d, {{$s1, $s2, $s3, $s4}};">; def V2I16toI32 : NVPTXInst<(outs B32:$d), (ins B16:$s1, B16:$s2), - "mov.b32 \t$d, {{$s1, $s2}};", []>; + "mov.b32 \t$d, {{$s1, $s2}};">; def V2I32toI64 : NVPTXInst<(outs B64:$d), (ins B32:$s1, B32:$s2), - "mov.b64 \t$d, {{$s1, $s2}};", []>; + "mov.b64 \t$d, {{$s1, $s2}};">; def V2I64toI128 : NVPTXInst<(outs B128:$d), (ins B64:$s1, B64:$s2), - "mov.b128 \t$d, {{$s1, $s2}};", []>; + "mov.b128 \t$d, {{$s1, $s2}};">; // unpack a larger int register to a set of smaller int registers def I64toV4I16 : NVPTXInst<(outs B16:$d1, B16:$d2, B16:$d3, B16:$d4), (ins B64:$s), - "mov.b64 \t{{$d1, $d2, $d3, $d4}}, $s;", []>; + "mov.b64 \t{{$d1, $d2, $d3, $d4}}, $s;">; def I32toV2I16 : NVPTXInst<(outs B16:$d1, B16:$d2), (ins B32:$s), - "mov.b32 \t{{$d1, $d2}}, $s;", []>; + "mov.b32 \t{{$d1, $d2}}, $s;">; def I64toV2I32 : NVPTXInst<(outs B32:$d1, B32:$d2), (ins B64:$s), - "mov.b64 \t{{$d1, $d2}}, $s;", []>; + "mov.b64 \t{{$d1, $d2}}, $s;">; def I128toV2I64: NVPTXInst<(outs B64:$d1, B64:$d2), (ins B128:$s), - "mov.b128 \t{{$d1, $d2}}, $s;", []>; + "mov.b128 \t{{$d1, $d2}}, $s;">; - def I32toI16H : NVPTXInst<(outs B16:$high), - (ins B32:$s), - "{{ .reg .b16 tmp; mov.b32 {tmp, $high}, $s; }}", - []>; - def I32toI16L : NVPTXInst<(outs B16:$low), - (ins B32:$s), - "{{ .reg .b16 
tmp; mov.b32 {$low, tmp}, $s; }}", - []>; - def I64toI32H : NVPTXInst<(outs B32:$high), - (ins B64:$s), - "{{ .reg .b32 tmp; mov.b64 {tmp, $high}, $s; }}", - []>; - def I64toI32L : NVPTXInst<(outs B32:$low), - (ins B64:$s), - "{{ .reg .b32 tmp; mov.b64 {$low, tmp}, $s; }}", - []>; + def I32toI16H : NVPTXInst<(outs B16:$high), (ins B32:$s), + "{{ .reg .b16 tmp; mov.b32 {tmp, $high}, $s; }}">; + def I32toI16L : NVPTXInst<(outs B16:$low), (ins B32:$s), + "{{ .reg .b16 tmp; mov.b32 {$low, tmp}, $s; }}">; + def I64toI32H : NVPTXInst<(outs B32:$high), (ins B64:$s), + "{{ .reg .b32 tmp; mov.b64 {tmp, $high}, $s; }}">; + def I64toI32L : NVPTXInst<(outs B32:$low), (ins B64:$s), + "{{ .reg .b32 tmp; mov.b64 {$low, tmp}, $s; }}">; // PTX 7.1 lets you avoid a temp register and just use _ as a "sink" for the // unused high/low part. let Predicates = [hasPTX<71>] in { def I32toI16H_Sink : NVPTXInst<(outs B16:$high), (ins B32:$s), - "mov.b32 \t{{_, $high}}, $s;", []>; + "mov.b32 \t{{_, $high}}, $s;">; def I32toI16L_Sink : NVPTXInst<(outs B16:$low), (ins B32:$s), - "mov.b32 \t{{$low, _}}, $s;", []>; + "mov.b32 \t{{$low, _}}, $s;">; def I64toI32H_Sink : NVPTXInst<(outs B32:$high), (ins B64:$s), - "mov.b64 \t{{_, $high}}, $s;", []>; + "mov.b64 \t{{_, $high}}, $s;">; def I64toI32L_Sink : NVPTXInst<(outs B32:$low), (ins B64:$s), - "mov.b64 \t{{$low, _}}, $s;", []>; + "mov.b64 \t{{$low, _}}, $s;">; } } @@ -2426,10 +2412,6 @@ foreach scope = ["sys", "gpu", "cluster", "cta"] in { def atomic_thread_fence_release_#scope: NVPTXFenceInst<scope, "release", hasPTX<87>>; } -def fpimm_any_zero : FPImmLeaf<fAny, [{ - return Imm.isZero(); -}]>; - // Perform substitution if fma only has one use, and also if instruction has // nnan instruction flag or if the TM has NoNaNsFPMath def NVPTX_fma_oneuse_and_nnan : PatFrag<(ops node:$a, node:$b, node:$c), @@ -2451,11 +2433,11 @@ class FMARELUInst<RegTyInfo t, bit allow_ftz, PatFrag zero_pat> [(set t.Ty:$dst, (NVPTX_fmaxnum_nsz (NVPTX_fma_oneuse_and_nnan t.Ty:$a, t.Ty:$b, t.Ty:$c), zero_pat))]>; let Predicates = [useFP16Math, hasPTX<70>, hasSM<80>] in { - def FMARELU_F16 : FMARELUInst<F16RT, true, fpimm_any_zero>; + def FMARELU_F16 : FMARELUInst<F16RT, true, fpimm_0>; def FMARELU_F16X2 : FMARELUInst<F16X2RT, true, zeroinitializer<v2f16>>; } let Predicates = [hasBF16Math, hasPTX<70>, hasSM<80>] in { - def FMARELU_BF16 : FMARELUInst<BF16RT, false, fpimm_any_zero>; + def FMARELU_BF16 : FMARELUInst<BF16RT, false, fpimm_0>; def FMARELU_BF16X2 : FMARELUInst<BF16X2RT, false, zeroinitializer<v2bf16>>; } diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index d4a0ca7..721afae 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -6,44 +6,24 @@ // //===----------------------------------------------------------------------===// -def immFloat0 : PatLeaf<(fpimm), [{ - float f = (float)N->getValueAPF().convertToFloat(); - return (f==0.0f); -}]>; - -def immFloat1 : PatLeaf<(fpimm), [{ - float f = (float)N->getValueAPF().convertToFloat(); - return (f==1.0f); -}]>; - -def immDouble0 : PatLeaf<(fpimm), [{ - double d = (double)N->getValueAPF().convertToDouble(); - return (d==0.0); -}]>; - -def immDouble1 : PatLeaf<(fpimm), [{ - double d = (double)N->getValueAPF().convertToDouble(); - return (d==1.0); -}]>; - def AS_match { code generic = [{ - return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_GENERIC); + return cast<MemSDNode>(N)->getAddressSpace() == llvm::ADDRESS_SPACE_GENERIC; }]; code shared = [{ 
- return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_SHARED); + return cast<MemSDNode>(N)->getAddressSpace() == llvm::ADDRESS_SPACE_SHARED; }]; code shared_cluster = [{ - return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_SHARED_CLUSTER); + return cast<MemSDNode>(N)->getAddressSpace() == llvm::ADDRESS_SPACE_SHARED_CLUSTER; }]; code global = [{ - return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_GLOBAL); + return cast<MemSDNode>(N)->getAddressSpace() == llvm::ADDRESS_SPACE_GLOBAL; }]; code const = [{ - return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_CONST); + return cast<MemSDNode>(N)->getAddressSpace() == llvm::ADDRESS_SPACE_CONST; }]; code param = [{ - return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_PARAM); + return cast<MemSDNode>(N)->getAddressSpace() == llvm::ADDRESS_SPACE_PARAM; }]; } @@ -659,22 +639,22 @@ multiclass CP_ASYNC_BULK_TENSOR_G2S_INTR<int dim, bit is_shared32, string mode> def "" : NVPTXInst<(outs), !con((ins rc:$dst, rc:$mbar, B64:$tmap), dims_dag, im2col_dag, (ins CTAGroupFlags:$cg)), - !strconcat(G2S_STRINGS<dim, mode, 0, 0>.inst_name, asm_str, ";"), []>, + !strconcat(G2S_STRINGS<dim, mode, 0, 0>.inst_name, asm_str, ";")>, Requires<[hasPTX<80>, hasSM<90>]>; def _MC : NVPTXInst<(outs), !con((ins rc:$dst, rc:$mbar, B64:$tmap), dims_dag, im2col_dag, (ins B16:$mc, CTAGroupFlags:$cg)), - !strconcat(G2S_STRINGS<dim, mode, 1, 0>.inst_name, asm_str, ", $mc;"), []>, + !strconcat(G2S_STRINGS<dim, mode, 1, 0>.inst_name, asm_str, ", $mc;")>, Requires<[hasPTX<80>, hasSM<90>]>; def _CH : NVPTXInst<(outs), !con((ins rc:$dst, rc:$mbar, B64:$tmap), dims_dag, im2col_dag, (ins B64:$ch, CTAGroupFlags:$cg)), - !strconcat(G2S_STRINGS<dim, mode, 0, 1>.inst_name, asm_str, ", $ch;"), []>, + !strconcat(G2S_STRINGS<dim, mode, 0, 1>.inst_name, asm_str, ", $ch;")>, Requires<[hasPTX<80>, hasSM<90>]>; def _MC_CH : NVPTXInst<(outs), !con((ins rc:$dst, rc:$mbar, B64:$tmap), dims_dag, im2col_dag, (ins B16:$mc, B64:$ch, CTAGroupFlags:$cg)), - !strconcat(G2S_STRINGS<dim, mode, 1, 1>.inst_name, asm_str, ", $mc, $ch;"), []>, + !strconcat(G2S_STRINGS<dim, mode, 1, 1>.inst_name, asm_str, ", $mc, $ch;")>, Requires<[hasPTX<80>, hasSM<90>]>; } @@ -876,11 +856,11 @@ multiclass CP_ASYNC_BULK_TENSOR_REDUCE_INTR<int dim, bit shared32, string mode> def "" : NVPTXInst<(outs), !con((ins rc:$src, B64:$tmap), dims_dag, (ins TMAReductionFlags:$red_op)), - !strconcat(prefix, "${red_op}", suffix, asm_str, ";"), []>, + !strconcat(prefix, "${red_op}", suffix, asm_str, ";")>, Requires<[hasPTX<80>, hasSM<90>]>; def _CH : NVPTXInst<(outs), !con((ins rc:$src, B64:$tmap), dims_dag, (ins B64:$ch, TMAReductionFlags:$red_op)), - !strconcat(prefix, "${red_op}", suffix, ".L2::cache_hint", asm_str, ", $ch;"), []>, + !strconcat(prefix, "${red_op}", suffix, ".L2::cache_hint", asm_str, ", $ch;")>, Requires<[hasPTX<80>, hasSM<90>]>; } @@ -1112,30 +1092,30 @@ let Predicates = [hasPTX<70>, hasSM<80>] in { // max(0.0, min(x, 1.0)) is 1.0 while sat(x) is 0. // Same story for fmax, fmin. 
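The cvt.sat patterns right after this comment only match the fmin(1.0, fmax(0.0, x)) nesting; as the comment notes, the other nesting disagrees with saturation on NaN inputs. A scalar sketch using C's fmin/fmax, which share the "return the non-NaN operand" behaviour of the PTX min/max used here; sat() below models cvt.*.sat flushing NaN to +0.0, which is an assumption worth double-checking against the PTX ISA:

    #include <cmath>
    #include <cstdio>

    // Models PTX cvt.*.sat: clamp to [0.0, 1.0] with NaN flushed to +0.0.
    static float sat(float X) {
      if (std::isnan(X))
        return 0.0f;
      return X < 0.0f ? 0.0f : (X > 1.0f ? 1.0f : X);
    }

    int main() {
      const float NaN = std::nanf("");

      // Nesting the patterns match: fmin(1.0, fmax(0.0, x)).
      // fmax(0.0, NaN) = 0.0, fmin(1.0, 0.0) = 0.0 -- agrees with sat(NaN).
      float Matched = std::fmin(1.0f, std::fmax(0.0f, NaN));

      // The other nesting: fmax(0.0, fmin(x, 1.0)).
      // fmin(NaN, 1.0) = 1.0, fmax(0.0, 1.0) = 1.0 -- disagrees with sat(NaN).
      float Other = std::fmax(0.0f, std::fmin(NaN, 1.0f));

      std::printf("sat(NaN)=%g  matched=%g  other=%g\n", sat(NaN), Matched, Other);
      return 0;
    }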
-def : Pat<(int_nvvm_fmin_f immFloat1, - (int_nvvm_fmax_f immFloat0, f32:$a)), +def : Pat<(int_nvvm_fmin_f fpimm_1, + (int_nvvm_fmax_f fpimm_0, f32:$a)), (CVT_f32_f32 $a, CvtSAT)>; -def : Pat<(int_nvvm_fmin_f immFloat1, - (int_nvvm_fmax_f f32:$a, immFloat0)), +def : Pat<(int_nvvm_fmin_f fpimm_1, + (int_nvvm_fmax_f f32:$a, fpimm_0)), (CVT_f32_f32 $a, CvtSAT)>; def : Pat<(int_nvvm_fmin_f - (int_nvvm_fmax_f immFloat0, f32:$a), immFloat1), + (int_nvvm_fmax_f fpimm_0, f32:$a), fpimm_1), (CVT_f32_f32 $a, CvtSAT)>; def : Pat<(int_nvvm_fmin_f - (int_nvvm_fmax_f f32:$a, immFloat0), immFloat1), + (int_nvvm_fmax_f f32:$a, fpimm_0), fpimm_1), (CVT_f32_f32 $a, CvtSAT)>; -def : Pat<(int_nvvm_fmin_d immDouble1, - (int_nvvm_fmax_d immDouble0, f64:$a)), +def : Pat<(int_nvvm_fmin_d fpimm_1, + (int_nvvm_fmax_d fpimm_0, f64:$a)), (CVT_f64_f64 $a, CvtSAT)>; -def : Pat<(int_nvvm_fmin_d immDouble1, - (int_nvvm_fmax_d f64:$a, immDouble0)), +def : Pat<(int_nvvm_fmin_d fpimm_1, + (int_nvvm_fmax_d f64:$a, fpimm_0)), (CVT_f64_f64 $a, CvtSAT)>; def : Pat<(int_nvvm_fmin_d - (int_nvvm_fmax_d immDouble0, f64:$a), immDouble1), + (int_nvvm_fmax_d fpimm_0, f64:$a), fpimm_1), (CVT_f64_f64 $a, CvtSAT)>; def : Pat<(int_nvvm_fmin_d - (int_nvvm_fmax_d f64:$a, immDouble0), immDouble1), + (int_nvvm_fmax_d f64:$a, fpimm_0), fpimm_1), (CVT_f64_f64 $a, CvtSAT)>; @@ -1329,12 +1309,12 @@ defm INT_NVVM_FMAN : MIN_MAX<"max">; // Multiplication // -def INT_NVVM_MULHI_S : F_MATH_2<"mul.hi.s16", B16, B16, B16, int_nvvm_mulhi_s>; -def INT_NVVM_MULHI_US : F_MATH_2<"mul.hi.u16", B16, B16, B16, int_nvvm_mulhi_us>; -def INT_NVVM_MULHI_I : F_MATH_2<"mul.hi.s32", B32, B32, B32, int_nvvm_mulhi_i>; -def INT_NVVM_MULHI_UI : F_MATH_2<"mul.hi.u32", B32, B32, B32, int_nvvm_mulhi_ui>; -def INT_NVVM_MULHI_LL : F_MATH_2<"mul.hi.s64", B64, B64, B64, int_nvvm_mulhi_ll>; -def INT_NVVM_MULHI_ULL : F_MATH_2<"mul.hi.u64", B64, B64, B64, int_nvvm_mulhi_ull>; +def : Pat<(int_nvvm_mulhi_s i16:$a, i16:$b), (MUL_HI_S16rr $a, $b)>; +def : Pat<(int_nvvm_mulhi_us i16:$a, i16:$b), (MUL_HI_U16rr $a, $b)>; +def : Pat<(int_nvvm_mulhi_i i32:$a, i32:$b), (MUL_HI_S32rr $a, $b)>; +def : Pat<(int_nvvm_mulhi_ui i32:$a, i32:$b), (MUL_HI_U32rr $a, $b)>; +def : Pat<(int_nvvm_mulhi_ll i64:$a, i64:$b), (MUL_HI_S64rr $a, $b)>; +def : Pat<(int_nvvm_mulhi_ull i64:$a, i64:$b), (MUL_HI_U64rr $a, $b)>; def INT_NVVM_MUL_RN_FTZ_F : F_MATH_2<"mul.rn.ftz.f32", B32, B32, B32, int_nvvm_mul_rn_ftz_f>; def INT_NVVM_MUL_RN_F : F_MATH_2<"mul.rn.f32", B32, B32, B32, int_nvvm_mul_rn_f>; @@ -1357,8 +1337,8 @@ def INT_NVVM_MUL24_UI : F_MATH_2<"mul24.lo.u32", B32, B32, B32, int_nvvm_mul24_u // Div // -def INT_NVVM_DIV_APPROX_FTZ_F : F_MATH_2<"div.approx.ftz.f32", B32, B32, B32, int_nvvm_div_approx_ftz_f>; -def INT_NVVM_DIV_APPROX_F : F_MATH_2<"div.approx.f32", B32, B32, B32, int_nvvm_div_approx_f>; +def : Pat<(int_nvvm_div_approx_ftz_f f32:$a, f32:$b), (DIV_APPROX_F32_rr $a, $b, FTZ)>; +def : Pat<(int_nvvm_div_approx_f f32:$a, f32:$b), (DIV_APPROX_F32_rr $a, $b, NoFTZ)>; def INT_NVVM_DIV_RN_FTZ_F : F_MATH_2<"div.rn.ftz.f32", B32, B32, B32, int_nvvm_div_rn_ftz_f>; def INT_NVVM_DIV_RN_F : F_MATH_2<"div.rn.f32", B32, B32, B32, int_nvvm_div_rn_f>; @@ -1663,13 +1643,13 @@ def : Pat<(int_nvvm_rsqrt_approx_d f64:$a), (RSQRT_APPROX_f64 $a, NoFTZ)>; // 1.0f / sqrt_approx -> rsqrt_approx let Predicates = [doRsqrtOpt] in { - def : Pat<(fdiv f32imm_1, (int_nvvm_sqrt_approx_f f32:$a)), + def : Pat<(fdiv fpimm_1, (int_nvvm_sqrt_approx_f f32:$a)), (RSQRT_APPROX_f32 $a, NoFTZ)>; - def : Pat<(fdiv f32imm_1, 
(int_nvvm_sqrt_approx_ftz_f f32:$a)), + def : Pat<(fdiv fpimm_1, (int_nvvm_sqrt_approx_ftz_f f32:$a)), (RSQRT_APPROX_f32 $a, FTZ)>; // same for int_nvvm_sqrt_f when non-precision sqrt is requested - def : Pat<(fdiv f32imm_1, (fsqrt_approx f32:$a)), + def : Pat<(fdiv fpimm_1, (fsqrt_approx f32:$a)), (RSQRT_APPROX_f32 $a)>; } // @@ -2231,7 +2211,7 @@ defm INT_PTX_SATOM_XOR : ATOM2_bitwise_impl<"xor">; class LDU_G<NVPTXRegClass regclass> : NVPTXInst<(outs regclass:$result), (ins i32imm:$fromWidth, ADDR:$src), - "ldu.global.b$fromWidth \t$result, [$src];", []>; + "ldu.global.b$fromWidth \t$result, [$src];">; def LDU_GLOBAL_i16 : LDU_G<B16>; def LDU_GLOBAL_i32 : LDU_G<B32>; @@ -2243,13 +2223,13 @@ def LDU_GLOBAL_i64 : LDU_G<B64>; class VLDU_G_ELE_V2<NVPTXRegClass regclass> : NVPTXInst<(outs regclass:$dst1, regclass:$dst2), (ins i32imm:$fromWidth, ADDR:$src), - "ldu.global.v2.b$fromWidth \t{{$dst1, $dst2}}, [$src];", []>; + "ldu.global.v2.b$fromWidth \t{{$dst1, $dst2}}, [$src];">; class VLDU_G_ELE_V4<NVPTXRegClass regclass> : NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), (ins i32imm:$fromWidth, ADDR:$src), - "ldu.global.v4.b$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>; + "ldu.global.v4.b$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];">; def LDU_GLOBAL_v2i16 : VLDU_G_ELE_V2<B16>; @@ -2270,9 +2250,8 @@ def LDU_GLOBAL_v4i32 : VLDU_G_ELE_V4<B32>; class LDG_G<NVPTXRegClass regclass> : NVPTXInst<(outs regclass:$result), (ins AtomicCode:$Sign, i32imm:$fromWidth, ADDR:$src), - "ld.global.nc.${Sign:sign}$fromWidth \t$result, [$src];", []>; + "ld.global.nc.${Sign:sign}$fromWidth \t$result, [$src];">; -def LD_GLOBAL_NC_i8 : LDG_G<B16>; def LD_GLOBAL_NC_i16 : LDG_G<B16>; def LD_GLOBAL_NC_i32 : LDG_G<B32>; def LD_GLOBAL_NC_i64 : LDG_G<B64>; @@ -2283,19 +2262,19 @@ def LD_GLOBAL_NC_i64 : LDG_G<B64>; class VLDG_G_ELE_V2<NVPTXRegClass regclass> : NVPTXInst<(outs regclass:$dst1, regclass:$dst2), (ins AtomicCode:$Sign, i32imm:$fromWidth, ADDR:$src), - "ld.global.nc.v2.${Sign:sign}$fromWidth \t{{$dst1, $dst2}}, [$src];", []>; + "ld.global.nc.v2.${Sign:sign}$fromWidth \t{{$dst1, $dst2}}, [$src];">; class VLDG_G_ELE_V4<NVPTXRegClass regclass> : NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), (ins AtomicCode:$Sign, i32imm:$fromWidth, ADDR:$src), - "ld.global.nc.v4.${Sign:sign}$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>; + "ld.global.nc.v4.${Sign:sign}$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];">; class VLDG_G_ELE_V8<NVPTXRegClass regclass> : NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4, regclass:$dst5, regclass:$dst6, regclass:$dst7, regclass:$dst8), (ins AtomicCode:$Sign, i32imm:$fromWidth, ADDR:$src), - "ld.global.nc.v8.${Sign:sign}$fromWidth \t{{$dst1, $dst2, $dst3, $dst4, $dst5, $dst6, $dst7, $dst8}}, [$src];", []>; + "ld.global.nc.v8.${Sign:sign}$fromWidth \t{{$dst1, $dst2, $dst3, $dst4, $dst5, $dst6, $dst7, $dst8}}, [$src];">; // FIXME: 8-bit LDG should be fixed once LDG/LDU nodes are made into proper loads. 
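The SULD/SUST hunks that follow collapse the long hand-written clamp/trap/zero defm lists into foreach loops, deriving the record-name suffix via !toupper(op) and the PTX mnemonic suffix from the lowercase op. A small C++ sketch of the same name-generation scheme (purely illustrative; it prints names, whereas TableGen defines records):

    #include <cctype>
    #include <cstdio>
    #include <string>

    // Mirrors the TableGen pattern used in the hunks below: the lowercase op
    // builds the PTX suffix and the uppercased op builds the record-name suffix.
    static std::string toUpper(std::string S) {
      for (char &C : S)
        C = static_cast<char>(std::toupper(static_cast<unsigned char>(C)));
      return S;
    }

    int main() {
      const char *Ops[] = {"clamp", "trap", "zero"};
      const char *Widths[] = {"8", "16", "32", "64"};
      for (const char *Op : Ops)
        for (const char *W : Widths)
          std::printf("defm SULD_1D_I%s_%s : SULD_1D<\"suld.b.1d.b%s.%s\", ...>;\n",
                      W, toUpper(Op).c_str(), W, Op);
      return 0;
    }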
def LD_GLOBAL_NC_v2i16 : VLDG_G_ELE_V2<B16>; @@ -3540,20 +3519,13 @@ multiclass SULD_1D<string inst, NVPTXRegClass outtype> { def _I : SULD_1D_base<inst, outtype, (ins i64imm:$s)>; } -defm SULD_1D_I8_CLAMP : SULD_1D<"suld.b.1d.b8.clamp", B16>; -defm SULD_1D_I16_CLAMP : SULD_1D<"suld.b.1d.b16.clamp", B16>; -defm SULD_1D_I32_CLAMP : SULD_1D<"suld.b.1d.b32.clamp", B32>; -defm SULD_1D_I64_CLAMP : SULD_1D<"suld.b.1d.b64.clamp", B64>; - -defm SULD_1D_I8_TRAP : SULD_1D<"suld.b.1d.b8.trap", B16>; -defm SULD_1D_I16_TRAP : SULD_1D<"suld.b.1d.b16.trap", B16>; -defm SULD_1D_I32_TRAP : SULD_1D<"suld.b.1d.b32.trap", B32>; -defm SULD_1D_I64_TRAP : SULD_1D<"suld.b.1d.b64.trap", B64>; - -defm SULD_1D_I8_ZERO : SULD_1D<"suld.b.1d.b8.zero", B16>; -defm SULD_1D_I16_ZERO : SULD_1D<"suld.b.1d.b16.zero", B16>; -defm SULD_1D_I32_ZERO : SULD_1D<"suld.b.1d.b32.zero", B32>; -defm SULD_1D_I64_ZERO : SULD_1D<"suld.b.1d.b64.zero", B64>; +foreach op = ["clamp", "trap", "zero"] in { + defvar op_upper = !toupper(op); + defm SULD_1D_I8_ # op_upper : SULD_1D<"suld.b.1d.b8." # op, B16>; + defm SULD_1D_I16_ # op_upper : SULD_1D<"suld.b.1d.b16." # op, B16>; + defm SULD_1D_I32_ # op_upper : SULD_1D<"suld.b.1d.b32." # op, B32>; + defm SULD_1D_I64_ # op_upper : SULD_1D<"suld.b.1d.b64." # op, B64>; +} class SULD_1D_ARRAY_base<string inst, NVPTXRegClass outtype, dag surf, list<dag> pattern = []> @@ -3570,20 +3542,13 @@ multiclass SULD_1D_ARRAY<string inst, NVPTXRegClass outtype> { def _I : SULD_1D_ARRAY_base<inst, outtype, (ins i64imm:$s)>; } -defm SULD_1D_ARRAY_I8_CLAMP : SULD_1D_ARRAY<"suld.b.a1d.b8.clamp", B16>; -defm SULD_1D_ARRAY_I16_CLAMP : SULD_1D_ARRAY<"suld.b.a1d.b16.clamp", B16>; -defm SULD_1D_ARRAY_I32_CLAMP : SULD_1D_ARRAY<"suld.b.a1d.b32.clamp", B32>; -defm SULD_1D_ARRAY_I64_CLAMP : SULD_1D_ARRAY<"suld.b.a1d.b64.clamp", B64>; - -defm SULD_1D_ARRAY_I8_TRAP : SULD_1D_ARRAY<"suld.b.a1d.b8.trap", B16>; -defm SULD_1D_ARRAY_I16_TRAP : SULD_1D_ARRAY<"suld.b.a1d.b16.trap", B16>; -defm SULD_1D_ARRAY_I32_TRAP : SULD_1D_ARRAY<"suld.b.a1d.b32.trap", B32>; -defm SULD_1D_ARRAY_I64_TRAP : SULD_1D_ARRAY<"suld.b.a1d.b64.trap", B64>; - -defm SULD_1D_ARRAY_I8_ZERO : SULD_1D_ARRAY<"suld.b.a1d.b8.zero", B16>; -defm SULD_1D_ARRAY_I16_ZERO : SULD_1D_ARRAY<"suld.b.a1d.b16.zero", B16>; -defm SULD_1D_ARRAY_I32_ZERO : SULD_1D_ARRAY<"suld.b.a1d.b32.zero", B32>; -defm SULD_1D_ARRAY_I64_ZERO : SULD_1D_ARRAY<"suld.b.a1d.b64.zero", B64>; +foreach op = ["clamp", "trap", "zero"] in { + defvar op_upper = !toupper(op); + defm SULD_1D_ARRAY_I8_ # op_upper : SULD_1D_ARRAY<"suld.b.a1d.b8." # op, B16>; + defm SULD_1D_ARRAY_I16_ # op_upper : SULD_1D_ARRAY<"suld.b.a1d.b16." # op, B16>; + defm SULD_1D_ARRAY_I32_ # op_upper : SULD_1D_ARRAY<"suld.b.a1d.b32." # op, B32>; + defm SULD_1D_ARRAY_I64_ # op_upper : SULD_1D_ARRAY<"suld.b.a1d.b64." 
# op, B64>; +} class SULD_2D_base<string inst, NVPTXRegClass outtype, dag surf, list<dag> pattern = []> @@ -3599,20 +3564,13 @@ multiclass SULD_2D<string inst, NVPTXRegClass outtype> { def _I : SULD_2D_base<inst, outtype, (ins i64imm:$s)>; } -defm SULD_2D_I8_CLAMP : SULD_2D<"suld.b.2d.b8.clamp", B16>; -defm SULD_2D_I16_CLAMP : SULD_2D<"suld.b.2d.b16.clamp", B16>; -defm SULD_2D_I32_CLAMP : SULD_2D<"suld.b.2d.b32.clamp", B32>; -defm SULD_2D_I64_CLAMP : SULD_2D<"suld.b.2d.b64.clamp", B64>; - -defm SULD_2D_I8_TRAP : SULD_2D<"suld.b.2d.b8.trap", B16>; -defm SULD_2D_I16_TRAP : SULD_2D<"suld.b.2d.b16.trap", B16>; -defm SULD_2D_I32_TRAP : SULD_2D<"suld.b.2d.b32.trap", B32>; -defm SULD_2D_I64_TRAP : SULD_2D<"suld.b.2d.b64.trap", B64>; - -defm SULD_2D_I8_ZERO : SULD_2D<"suld.b.2d.b8.zero", B16>; -defm SULD_2D_I16_ZERO : SULD_2D<"suld.b.2d.b16.zero", B16>; -defm SULD_2D_I32_ZERO : SULD_2D<"suld.b.2d.b32.zero", B32>; -defm SULD_2D_I64_ZERO : SULD_2D<"suld.b.2d.b64.zero", B64>; +foreach op = ["clamp", "trap", "zero"] in { + defvar op_upper = !toupper(op); + defm SULD_2D_I8_ # op_upper : SULD_2D<"suld.b.2d.b8." # op, B16>; + defm SULD_2D_I16_ # op_upper : SULD_2D<"suld.b.2d.b16." # op, B16>; + defm SULD_2D_I32_ # op_upper : SULD_2D<"suld.b.2d.b32." # op, B32>; + defm SULD_2D_I64_ # op_upper : SULD_2D<"suld.b.2d.b64." # op, B64>; +} class SULD_2D_ARRAY_base<string inst, NVPTXRegClass outtype, dag surf, list<dag> pattern = []> @@ -3629,20 +3587,13 @@ multiclass SULD_2D_ARRAY<string inst, NVPTXRegClass outtype> { def _I : SULD_2D_ARRAY_base<inst, outtype, (ins i64imm:$s)>; } -defm SULD_2D_ARRAY_I8_CLAMP : SULD_2D_ARRAY<"suld.b.a2d.b8.clamp", B16>; -defm SULD_2D_ARRAY_I16_CLAMP : SULD_2D_ARRAY<"suld.b.a2d.b16.clamp", B16>; -defm SULD_2D_ARRAY_I32_CLAMP : SULD_2D_ARRAY<"suld.b.a2d.b32.clamp", B32>; -defm SULD_2D_ARRAY_I64_CLAMP : SULD_2D_ARRAY<"suld.b.a2d.b64.clamp", B64>; - -defm SULD_2D_ARRAY_I8_TRAP : SULD_2D_ARRAY<"suld.b.a2d.b8.trap", B16>; -defm SULD_2D_ARRAY_I16_TRAP : SULD_2D_ARRAY<"suld.b.a2d.b16.trap", B16>; -defm SULD_2D_ARRAY_I32_TRAP : SULD_2D_ARRAY<"suld.b.a2d.b32.trap", B32>; -defm SULD_2D_ARRAY_I64_TRAP : SULD_2D_ARRAY<"suld.b.a2d.b64.trap", B64>; - -defm SULD_2D_ARRAY_I8_ZERO : SULD_2D_ARRAY<"suld.b.a2d.b8.zero", B16>; -defm SULD_2D_ARRAY_I16_ZERO : SULD_2D_ARRAY<"suld.b.a2d.b16.zero", B16>; -defm SULD_2D_ARRAY_I32_ZERO : SULD_2D_ARRAY<"suld.b.a2d.b32.zero", B32>; -defm SULD_2D_ARRAY_I64_ZERO : SULD_2D_ARRAY<"suld.b.a2d.b64.zero", B64>; +foreach op = ["clamp", "trap", "zero"] in { + defvar op_upper = !toupper(op); + defm SULD_2D_ARRAY_I8_ # op_upper : SULD_2D_ARRAY<"suld.b.a2d.b8." # op, B16>; + defm SULD_2D_ARRAY_I16_ # op_upper : SULD_2D_ARRAY<"suld.b.a2d.b16." # op, B16>; + defm SULD_2D_ARRAY_I32_ # op_upper : SULD_2D_ARRAY<"suld.b.a2d.b32." # op, B32>; + defm SULD_2D_ARRAY_I64_ # op_upper : SULD_2D_ARRAY<"suld.b.a2d.b64." 
# op, B64>; +} class SULD_3D_base<string inst, NVPTXRegClass outtype, dag surf, list<dag> pattern = []> @@ -3659,20 +3610,13 @@ multiclass SULD_3D<string inst, NVPTXRegClass outtype> { def _I : SULD_3D_base<inst, outtype, (ins i64imm:$s)>; } -defm SULD_3D_I8_CLAMP : SULD_3D<"suld.b.3d.b8.clamp", B16>; -defm SULD_3D_I16_CLAMP : SULD_3D<"suld.b.3d.b16.clamp", B16>; -defm SULD_3D_I32_CLAMP : SULD_3D<"suld.b.3d.b32.clamp", B32>; -defm SULD_3D_I64_CLAMP : SULD_3D<"suld.b.3d.b64.clamp", B64>; - -defm SULD_3D_I8_TRAP : SULD_3D<"suld.b.3d.b8.trap", B16>; -defm SULD_3D_I16_TRAP : SULD_3D<"suld.b.3d.b16.trap", B16>; -defm SULD_3D_I32_TRAP : SULD_3D<"suld.b.3d.b32.trap", B32>; -defm SULD_3D_I64_TRAP : SULD_3D<"suld.b.3d.b64.trap", B64>; - -defm SULD_3D_I8_ZERO : SULD_3D<"suld.b.3d.b8.zero", B16>; -defm SULD_3D_I16_ZERO : SULD_3D<"suld.b.3d.b16.zero", B16>; -defm SULD_3D_I32_ZERO : SULD_3D<"suld.b.3d.b32.zero", B32>; -defm SULD_3D_I64_ZERO : SULD_3D<"suld.b.3d.b64.zero", B64>; +foreach op = ["clamp", "trap", "zero"] in { + defvar op_upper = !toupper(op); + defm SULD_3D_I8_ # op_upper : SULD_3D<"suld.b.3d.b8." # op, B16>; + defm SULD_3D_I16_ # op_upper : SULD_3D<"suld.b.3d.b16." # op, B16>; + defm SULD_3D_I32_ # op_upper : SULD_3D<"suld.b.3d.b32." # op, B32>; + defm SULD_3D_I64_ # op_upper : SULD_3D<"suld.b.3d.b64." # op, B64>; +} } let IsSuld = 2 in { @@ -3692,20 +3636,13 @@ multiclass SULD_1D_V2<string inst, NVPTXRegClass outtype> { def _I : SULD_1D_V2_base<inst, outtype, (ins i64imm:$s)>; } -defm SULD_1D_V2I8_CLAMP : SULD_1D_V2<"suld.b.1d.v2.b8.clamp", B16>; -defm SULD_1D_V2I16_CLAMP : SULD_1D_V2<"suld.b.1d.v2.b16.clamp", B16>; -defm SULD_1D_V2I32_CLAMP : SULD_1D_V2<"suld.b.1d.v2.b32.clamp", B32>; -defm SULD_1D_V2I64_CLAMP : SULD_1D_V2<"suld.b.1d.v2.b64.clamp", B64>; - -defm SULD_1D_V2I8_TRAP : SULD_1D_V2<"suld.b.1d.v2.b8.trap", B16>; -defm SULD_1D_V2I16_TRAP : SULD_1D_V2<"suld.b.1d.v2.b16.trap", B16>; -defm SULD_1D_V2I32_TRAP : SULD_1D_V2<"suld.b.1d.v2.b32.trap", B32>; -defm SULD_1D_V2I64_TRAP : SULD_1D_V2<"suld.b.1d.v2.b64.trap", B64>; - -defm SULD_1D_V2I8_ZERO : SULD_1D_V2<"suld.b.1d.v2.b8.zero", B16>; -defm SULD_1D_V2I16_ZERO : SULD_1D_V2<"suld.b.1d.v2.b16.zero", B16>; -defm SULD_1D_V2I32_ZERO : SULD_1D_V2<"suld.b.1d.v2.b32.zero", B32>; -defm SULD_1D_V2I64_ZERO : SULD_1D_V2<"suld.b.1d.v2.b64.zero", B64>; +foreach op = ["clamp", "trap", "zero"] in { + defvar op_upper = !toupper(op); + defm SULD_1D_V2I8_ # op_upper : SULD_1D_V2<"suld.b.1d.v2.b8." # op, B16>; + defm SULD_1D_V2I16_ # op_upper : SULD_1D_V2<"suld.b.1d.v2.b16." # op, B16>; + defm SULD_1D_V2I32_ # op_upper : SULD_1D_V2<"suld.b.1d.v2.b32." # op, B32>; + defm SULD_1D_V2I64_ # op_upper : SULD_1D_V2<"suld.b.1d.v2.b64." 
# op, B64>; +} class SULD_1D_ARRAY_V2_base<string inst, NVPTXRegClass outtype, dag surf, list<dag> pattern = []> @@ -3722,20 +3659,13 @@ multiclass SULD_1D_ARRAY_V2<string inst, NVPTXRegClass outtype> { def _I : SULD_1D_ARRAY_V2_base<inst, outtype, (ins i64imm:$s)>; } -defm SULD_1D_ARRAY_V2I8_CLAMP : SULD_1D_ARRAY_V2<"suld.b.a1d.v2.b8.clamp", B16>; -defm SULD_1D_ARRAY_V2I16_CLAMP : SULD_1D_ARRAY_V2<"suld.b.a1d.v2.b16.clamp", B16>; -defm SULD_1D_ARRAY_V2I32_CLAMP : SULD_1D_ARRAY_V2<"suld.b.a1d.v2.b32.clamp", B32>; -defm SULD_1D_ARRAY_V2I64_CLAMP : SULD_1D_ARRAY_V2<"suld.b.a1d.v2.b64.clamp", B64>; - -defm SULD_1D_ARRAY_V2I8_TRAP : SULD_1D_ARRAY_V2<"suld.b.a1d.v2.b8.trap", B16>; -defm SULD_1D_ARRAY_V2I16_TRAP : SULD_1D_ARRAY_V2<"suld.b.a1d.v2.b16.trap", B16>; -defm SULD_1D_ARRAY_V2I32_TRAP : SULD_1D_ARRAY_V2<"suld.b.a1d.v2.b32.trap", B32>; -defm SULD_1D_ARRAY_V2I64_TRAP : SULD_1D_ARRAY_V2<"suld.b.a1d.v2.b64.trap", B64>; - -defm SULD_1D_ARRAY_V2I8_ZERO : SULD_1D_ARRAY_V2<"suld.b.a1d.v2.b8.zero", B16>; -defm SULD_1D_ARRAY_V2I16_ZERO : SULD_1D_ARRAY_V2<"suld.b.a1d.v2.b16.zero", B16>; -defm SULD_1D_ARRAY_V2I32_ZERO : SULD_1D_ARRAY_V2<"suld.b.a1d.v2.b32.zero", B32>; -defm SULD_1D_ARRAY_V2I64_ZERO : SULD_1D_ARRAY_V2<"suld.b.a1d.v2.b64.zero", B64>; +foreach op = ["clamp", "trap", "zero"] in { + defvar op_upper = !toupper(op); + defm SULD_1D_ARRAY_V2I8_ # op_upper : SULD_1D_ARRAY_V2<"suld.b.a1d.v2.b8." # op, B16>; + defm SULD_1D_ARRAY_V2I16_ # op_upper : SULD_1D_ARRAY_V2<"suld.b.a1d.v2.b16." # op, B16>; + defm SULD_1D_ARRAY_V2I32_ # op_upper : SULD_1D_ARRAY_V2<"suld.b.a1d.v2.b32." # op, B32>; + defm SULD_1D_ARRAY_V2I64_ # op_upper : SULD_1D_ARRAY_V2<"suld.b.a1d.v2.b64." # op, B64>; +} class SULD_2D_V2_base<string inst, NVPTXRegClass outtype, dag surf, list<dag> pattern = []> @@ -3752,20 +3682,13 @@ multiclass SULD_2D_V2<string inst, NVPTXRegClass outtype> { def _I : SULD_2D_V2_base<inst, outtype, (ins i64imm:$s)>; } -defm SULD_2D_V2I8_CLAMP : SULD_2D_V2<"suld.b.2d.v2.b8.clamp", B16>; -defm SULD_2D_V2I16_CLAMP : SULD_2D_V2<"suld.b.2d.v2.b16.clamp", B16>; -defm SULD_2D_V2I32_CLAMP : SULD_2D_V2<"suld.b.2d.v2.b32.clamp", B32>; -defm SULD_2D_V2I64_CLAMP : SULD_2D_V2<"suld.b.2d.v2.b64.clamp", B64>; - -defm SULD_2D_V2I8_TRAP : SULD_2D_V2<"suld.b.2d.v2.b8.trap", B16>; -defm SULD_2D_V2I16_TRAP : SULD_2D_V2<"suld.b.2d.v2.b16.trap", B16>; -defm SULD_2D_V2I32_TRAP : SULD_2D_V2<"suld.b.2d.v2.b32.trap", B32>; -defm SULD_2D_V2I64_TRAP : SULD_2D_V2<"suld.b.2d.v2.b64.trap", B64>; - -defm SULD_2D_V2I8_ZERO : SULD_2D_V2<"suld.b.2d.v2.b8.zero", B16>; -defm SULD_2D_V2I16_ZERO : SULD_2D_V2<"suld.b.2d.v2.b16.zero", B16>; -defm SULD_2D_V2I32_ZERO : SULD_2D_V2<"suld.b.2d.v2.b32.zero", B32>; -defm SULD_2D_V2I64_ZERO : SULD_2D_V2<"suld.b.2d.v2.b64.zero", B64>; +foreach op = ["clamp", "trap", "zero"] in { + defvar op_upper = !toupper(op); + defm SULD_2D_V2I8_ # op_upper : SULD_2D_V2<"suld.b.2d.v2.b8." # op, B16>; + defm SULD_2D_V2I16_ # op_upper : SULD_2D_V2<"suld.b.2d.v2.b16." # op, B16>; + defm SULD_2D_V2I32_ # op_upper : SULD_2D_V2<"suld.b.2d.v2.b32." # op, B32>; + defm SULD_2D_V2I64_ # op_upper : SULD_2D_V2<"suld.b.2d.v2.b64." 
# op, B64>; +} class SULD_2D_ARRAY_V2_base<string inst, NVPTXRegClass outtype, dag surf, list<dag> pattern = []> @@ -3782,20 +3705,13 @@ multiclass SULD_2D_ARRAY_V2<string inst, NVPTXRegClass outtype> { def _I : SULD_2D_ARRAY_V2_base<inst, outtype, (ins i64imm:$s)>; } -defm SULD_2D_ARRAY_V2I8_CLAMP : SULD_2D_ARRAY_V2<"suld.b.a2d.v2.b8.clamp", B16>; -defm SULD_2D_ARRAY_V2I16_CLAMP : SULD_2D_ARRAY_V2<"suld.b.a2d.v2.b16.clamp", B16>; -defm SULD_2D_ARRAY_V2I32_CLAMP : SULD_2D_ARRAY_V2<"suld.b.a2d.v2.b32.clamp", B32>; -defm SULD_2D_ARRAY_V2I64_CLAMP : SULD_2D_ARRAY_V2<"suld.b.a2d.v2.b64.clamp", B64>; - -defm SULD_2D_ARRAY_V2I8_TRAP : SULD_2D_ARRAY_V2<"suld.b.a2d.v2.b8.trap", B16>; -defm SULD_2D_ARRAY_V2I16_TRAP : SULD_2D_ARRAY_V2<"suld.b.a2d.v2.b16.trap", B16>; -defm SULD_2D_ARRAY_V2I32_TRAP : SULD_2D_ARRAY_V2<"suld.b.a2d.v2.b32.trap", B32>; -defm SULD_2D_ARRAY_V2I64_TRAP : SULD_2D_ARRAY_V2<"suld.b.a2d.v2.b64.trap", B64>; - -defm SULD_2D_ARRAY_V2I8_ZERO : SULD_2D_ARRAY_V2<"suld.b.a2d.v2.b8.zero", B16>; -defm SULD_2D_ARRAY_V2I16_ZERO : SULD_2D_ARRAY_V2<"suld.b.a2d.v2.b16.zero", B16>; -defm SULD_2D_ARRAY_V2I32_ZERO : SULD_2D_ARRAY_V2<"suld.b.a2d.v2.b32.zero", B32>; -defm SULD_2D_ARRAY_V2I64_ZERO : SULD_2D_ARRAY_V2<"suld.b.a2d.v2.b64.zero", B64>; +foreach op = ["clamp", "trap", "zero"] in { + defvar op_upper = !toupper(op); + defm SULD_2D_ARRAY_V2I8_ # op_upper : SULD_2D_ARRAY_V2<"suld.b.a2d.v2.b8." # op, B16>; + defm SULD_2D_ARRAY_V2I16_ # op_upper : SULD_2D_ARRAY_V2<"suld.b.a2d.v2.b16." # op, B16>; + defm SULD_2D_ARRAY_V2I32_ # op_upper : SULD_2D_ARRAY_V2<"suld.b.a2d.v2.b32." # op, B32>; + defm SULD_2D_ARRAY_V2I64_ # op_upper : SULD_2D_ARRAY_V2<"suld.b.a2d.v2.b64." # op, B64>; +} class SULD_3D_V2_base<string inst, NVPTXRegClass outtype, dag surf, list<dag> pattern = []> @@ -3812,20 +3728,13 @@ multiclass SULD_3D_V2<string inst, NVPTXRegClass outtype> { def _I : SULD_3D_V2_base<inst, outtype, (ins i64imm:$s)>; } -defm SULD_3D_V2I8_CLAMP : SULD_3D_V2<"suld.b.3d.v2.b8.clamp", B16>; -defm SULD_3D_V2I16_CLAMP : SULD_3D_V2<"suld.b.3d.v2.b16.clamp", B16>; -defm SULD_3D_V2I32_CLAMP : SULD_3D_V2<"suld.b.3d.v2.b32.clamp", B32>; -defm SULD_3D_V2I64_CLAMP : SULD_3D_V2<"suld.b.3d.v2.b64.clamp", B64>; - -defm SULD_3D_V2I8_TRAP : SULD_3D_V2<"suld.b.3d.v2.b8.trap", B16>; -defm SULD_3D_V2I16_TRAP : SULD_3D_V2<"suld.b.3d.v2.b16.trap", B16>; -defm SULD_3D_V2I32_TRAP : SULD_3D_V2<"suld.b.3d.v2.b32.trap", B32>; -defm SULD_3D_V2I64_TRAP : SULD_3D_V2<"suld.b.3d.v2.b64.trap", B64>; - -defm SULD_3D_V2I8_ZERO : SULD_3D_V2<"suld.b.3d.v2.b8.zero", B16>; -defm SULD_3D_V2I16_ZERO : SULD_3D_V2<"suld.b.3d.v2.b16.zero", B16>; -defm SULD_3D_V2I32_ZERO : SULD_3D_V2<"suld.b.3d.v2.b32.zero", B32>; -defm SULD_3D_V2I64_ZERO : SULD_3D_V2<"suld.b.3d.v2.b64.zero", B64>; +foreach op = ["clamp", "trap", "zero"] in { + defvar op_upper = !toupper(op); + defm SULD_3D_V2I8_ # op_upper : SULD_3D_V2<"suld.b.3d.v2.b8." # op, B16>; + defm SULD_3D_V2I16_ # op_upper : SULD_3D_V2<"suld.b.3d.v2.b16." # op, B16>; + defm SULD_3D_V2I32_ # op_upper : SULD_3D_V2<"suld.b.3d.v2.b32." # op, B32>; + defm SULD_3D_V2I64_ # op_upper : SULD_3D_V2<"suld.b.3d.v2.b64." 
# op, B64>; +} } @@ -3846,17 +3755,12 @@ multiclass SULD_1D_V4<string inst, NVPTXRegClass outtype> { def _I : SULD_1D_V4_base<inst, outtype, (ins i64imm:$s)>; } -defm SULD_1D_V4I8_CLAMP : SULD_1D_V4<"suld.b.1d.v4.b8.clamp", B16>; -defm SULD_1D_V4I16_CLAMP : SULD_1D_V4<"suld.b.1d.v4.b16.clamp", B16>; -defm SULD_1D_V4I32_CLAMP : SULD_1D_V4<"suld.b.1d.v4.b32.clamp", B32>; - -defm SULD_1D_V4I8_TRAP : SULD_1D_V4<"suld.b.1d.v4.b8.trap", B16>; -defm SULD_1D_V4I16_TRAP : SULD_1D_V4<"suld.b.1d.v4.b16.trap", B16>; -defm SULD_1D_V4I32_TRAP : SULD_1D_V4<"suld.b.1d.v4.b32.trap", B32>; - -defm SULD_1D_V4I8_ZERO : SULD_1D_V4<"suld.b.1d.v4.b8.zero", B16>; -defm SULD_1D_V4I16_ZERO : SULD_1D_V4<"suld.b.1d.v4.b16.zero", B16>; -defm SULD_1D_V4I32_ZERO : SULD_1D_V4<"suld.b.1d.v4.b32.zero", B32>; +foreach op = ["clamp", "trap", "zero"] in { + defvar op_upper = !toupper(op); + defm SULD_1D_V4I8_ # op_upper : SULD_1D_V4<"suld.b.1d.v4.b8." # op, B16>; + defm SULD_1D_V4I16_ # op_upper : SULD_1D_V4<"suld.b.1d.v4.b16." # op, B16>; + defm SULD_1D_V4I32_ # op_upper : SULD_1D_V4<"suld.b.1d.v4.b32." # op, B32>; +} class SULD_1D_ARRAY_V4_base<string inst, NVPTXRegClass outtype, dag surf, list<dag> pattern = []> @@ -3874,17 +3778,12 @@ multiclass SULD_1D_ARRAY_V4<string inst, NVPTXRegClass outtype> { def _I : SULD_1D_ARRAY_V4_base<inst, outtype, (ins i64imm:$s)>; } -defm SULD_1D_ARRAY_V4I8_CLAMP : SULD_1D_ARRAY_V4<"suld.b.a1d.v4.b8.clamp", B16>; -defm SULD_1D_ARRAY_V4I16_CLAMP : SULD_1D_ARRAY_V4<"suld.b.a1d.v4.b16.clamp", B16>; -defm SULD_1D_ARRAY_V4I32_CLAMP : SULD_1D_ARRAY_V4<"suld.b.a1d.v4.b32.clamp", B32>; - -defm SULD_1D_ARRAY_V4I8_TRAP : SULD_1D_ARRAY_V4<"suld.b.a1d.v4.b8.trap", B16>; -defm SULD_1D_ARRAY_V4I16_TRAP : SULD_1D_ARRAY_V4<"suld.b.a1d.v4.b16.trap", B16>; -defm SULD_1D_ARRAY_V4I32_TRAP : SULD_1D_ARRAY_V4<"suld.b.a1d.v4.b32.trap", B32>; - -defm SULD_1D_ARRAY_V4I8_ZERO : SULD_1D_ARRAY_V4<"suld.b.a1d.v4.b8.zero", B16>; -defm SULD_1D_ARRAY_V4I16_ZERO : SULD_1D_ARRAY_V4<"suld.b.a1d.v4.b16.zero", B16>; -defm SULD_1D_ARRAY_V4I32_ZERO : SULD_1D_ARRAY_V4<"suld.b.a1d.v4.b32.zero", B32>; +foreach op = ["clamp", "trap", "zero"] in { + defvar op_upper = !toupper(op); + defm SULD_1D_ARRAY_V4I8_ # op_upper : SULD_1D_ARRAY_V4<"suld.b.a1d.v4.b8." # op, B16>; + defm SULD_1D_ARRAY_V4I16_ # op_upper : SULD_1D_ARRAY_V4<"suld.b.a1d.v4.b16." # op, B16>; + defm SULD_1D_ARRAY_V4I32_ # op_upper : SULD_1D_ARRAY_V4<"suld.b.a1d.v4.b32." # op, B32>; +} class SULD_2D_V4_base<string inst, NVPTXRegClass outtype, dag surf, list<dag> pattern = []> @@ -3901,17 +3800,12 @@ multiclass SULD_2D_V4<string inst, NVPTXRegClass outtype> { def _I : SULD_2D_V4_base<inst, outtype, (ins i64imm:$s)>; } -defm SULD_2D_V4I8_CLAMP : SULD_2D_V4<"suld.b.2d.v4.b8.clamp", B16>; -defm SULD_2D_V4I16_CLAMP : SULD_2D_V4<"suld.b.2d.v4.b16.clamp", B16>; -defm SULD_2D_V4I32_CLAMP : SULD_2D_V4<"suld.b.2d.v4.b32.clamp", B32>; - -defm SULD_2D_V4I8_TRAP : SULD_2D_V4<"suld.b.2d.v4.b8.trap", B16>; -defm SULD_2D_V4I16_TRAP : SULD_2D_V4<"suld.b.2d.v4.b16.trap", B16>; -defm SULD_2D_V4I32_TRAP : SULD_2D_V4<"suld.b.2d.v4.b32.trap", B32>; - -defm SULD_2D_V4I8_ZERO : SULD_2D_V4<"suld.b.2d.v4.b8.zero", B16>; -defm SULD_2D_V4I16_ZERO : SULD_2D_V4<"suld.b.2d.v4.b16.zero", B16>; -defm SULD_2D_V4I32_ZERO : SULD_2D_V4<"suld.b.2d.v4.b32.zero", B32>; +foreach op = ["clamp", "trap", "zero"] in { + defvar op_upper = !toupper(op); + defm SULD_2D_V4I8_ # op_upper : SULD_2D_V4<"suld.b.2d.v4.b8." # op, B16>; + defm SULD_2D_V4I16_ # op_upper : SULD_2D_V4<"suld.b.2d.v4.b16." 
# op, B16>; + defm SULD_2D_V4I32_ # op_upper : SULD_2D_V4<"suld.b.2d.v4.b32." # op, B32>; +} class SULD_2D_ARRAY_V4_base<string inst, NVPTXRegClass outtype, dag surf, list<dag> pattern = []> @@ -3929,17 +3823,12 @@ multiclass SULD_2D_ARRAY_V4<string inst, NVPTXRegClass outtype> { def _I : SULD_2D_ARRAY_V4_base<inst, outtype, (ins i64imm:$s)>; } -defm SULD_2D_ARRAY_V4I8_CLAMP : SULD_2D_ARRAY_V4<"suld.b.a2d.v4.b8.clamp", B16>; -defm SULD_2D_ARRAY_V4I16_CLAMP : SULD_2D_ARRAY_V4<"suld.b.a2d.v4.b16.clamp", B16>; -defm SULD_2D_ARRAY_V4I32_CLAMP : SULD_2D_ARRAY_V4<"suld.b.a2d.v4.b32.clamp", B32>; - -defm SULD_2D_ARRAY_V4I8_TRAP : SULD_2D_ARRAY_V4<"suld.b.a2d.v4.b8.trap", B16>; -defm SULD_2D_ARRAY_V4I16_TRAP : SULD_2D_ARRAY_V4<"suld.b.a2d.v4.b16.trap", B16>; -defm SULD_2D_ARRAY_V4I32_TRAP : SULD_2D_ARRAY_V4<"suld.b.a2d.v4.b32.trap", B32>; - -defm SULD_2D_ARRAY_V4I8_ZERO : SULD_2D_ARRAY_V4<"suld.b.a2d.v4.b8.zero", B16>; -defm SULD_2D_ARRAY_V4I16_ZERO : SULD_2D_ARRAY_V4<"suld.b.a2d.v4.b16.zero", B16>; -defm SULD_2D_ARRAY_V4I32_ZERO : SULD_2D_ARRAY_V4<"suld.b.a2d.v4.b32.zero", B32>; +foreach op = ["clamp", "trap", "zero"] in { + defvar op_upper = !toupper(op); + defm SULD_2D_ARRAY_V4I8_ # op_upper : SULD_2D_ARRAY_V4<"suld.b.a2d.v4.b8." # op, B16>; + defm SULD_2D_ARRAY_V4I16_ # op_upper : SULD_2D_ARRAY_V4<"suld.b.a2d.v4.b16." # op, B16>; + defm SULD_2D_ARRAY_V4I32_ # op_upper : SULD_2D_ARRAY_V4<"suld.b.a2d.v4.b32." # op, B32>; +} class SULD_3D_V4_base<string inst, NVPTXRegClass outtype, dag surf, list<dag> pattern = []> @@ -3956,17 +3845,12 @@ multiclass SULD_3D_V4<string inst, NVPTXRegClass outtype> { def _I : SULD_3D_V4_base<inst, outtype, (ins i64imm:$s)>; } -defm SULD_3D_V4I8_CLAMP : SULD_3D_V4<"suld.b.3d.v4.b8.clamp", B16>; -defm SULD_3D_V4I16_CLAMP : SULD_3D_V4<"suld.b.3d.v4.b16.clamp", B16>; -defm SULD_3D_V4I32_CLAMP : SULD_3D_V4<"suld.b.3d.v4.b32.clamp", B32>; - -defm SULD_3D_V4I8_TRAP : SULD_3D_V4<"suld.b.3d.v4.b8.trap", B16>; -defm SULD_3D_V4I16_TRAP : SULD_3D_V4<"suld.b.3d.v4.b16.trap", B16>; -defm SULD_3D_V4I32_TRAP : SULD_3D_V4<"suld.b.3d.v4.b32.trap", B32>; - -defm SULD_3D_V4I8_ZERO : SULD_3D_V4<"suld.b.3d.v4.b8.zero", B16>; -defm SULD_3D_V4I16_ZERO : SULD_3D_V4<"suld.b.3d.v4.b16.zero", B16>; -defm SULD_3D_V4I32_ZERO : SULD_3D_V4<"suld.b.3d.v4.b32.zero", B32>; +foreach op = ["clamp", "trap", "zero"] in { + defvar op_upper = !toupper(op); + defm SULD_3D_V4I8_ # op_upper : SULD_3D_V4<"suld.b.3d.v4.b8." # op, B16>; + defm SULD_3D_V4I16_ # op_upper : SULD_3D_V4<"suld.b.3d.v4.b16." # op, B16>; + defm SULD_3D_V4I32_ # op_upper : SULD_3D_V4<"suld.b.3d.v4.b32." 
# op, B32>; +} } @@ -4037,20 +3921,13 @@ multiclass SUST_1D<string inst, NVPTXRegClass intype> { def _I : SUST_1D_base<inst, intype, (ins i64imm:$s), []>; } -defm SUST_B_1D_I8_CLAMP : SUST_1D<"sust.b.1d.b8.clamp", B16>; -defm SUST_B_1D_I16_CLAMP : SUST_1D<"sust.b.1d.b16.clamp", B16>; -defm SUST_B_1D_I32_CLAMP : SUST_1D<"sust.b.1d.b32.clamp", B32>; -defm SUST_B_1D_I64_CLAMP : SUST_1D<"sust.b.1d.b64.clamp", B64>; - -defm SUST_B_1D_I8_TRAP : SUST_1D<"sust.b.1d.b8.trap", B16>; -defm SUST_B_1D_I16_TRAP : SUST_1D<"sust.b.1d.b16.trap", B16>; -defm SUST_B_1D_I32_TRAP : SUST_1D<"sust.b.1d.b32.trap", B32>; -defm SUST_B_1D_I64_TRAP : SUST_1D<"sust.b.1d.b64.trap", B64>; - -defm SUST_B_1D_I8_ZERO : SUST_1D<"sust.b.1d.b8.zero", B16>; -defm SUST_B_1D_I16_ZERO : SUST_1D<"sust.b.1d.b16.zero", B16>; -defm SUST_B_1D_I32_ZERO : SUST_1D<"sust.b.1d.b32.zero", B32>; -defm SUST_B_1D_I64_ZERO : SUST_1D<"sust.b.1d.b64.zero", B64>; +foreach op = ["clamp", "trap", "zero"] in { + defvar op_upper = !toupper(op); + defm SUST_B_1D_I8_ # op_upper : SUST_1D<"sust.b.1d.b8." # op, B16>; + defm SUST_B_1D_I16_ # op_upper : SUST_1D<"sust.b.1d.b16." # op, B16>; + defm SUST_B_1D_I32_ # op_upper : SUST_1D<"sust.b.1d.b32." # op, B32>; + defm SUST_B_1D_I64_ # op_upper : SUST_1D<"sust.b.1d.b64." # op, B64>; +} defm SUST_P_1D_I8_TRAP : SUST_1D<"sust.p.1d.b8.trap", B16>; defm SUST_P_1D_I16_TRAP : SUST_1D<"sust.p.1d.b16.trap", B16>; @@ -4068,23 +3945,13 @@ multiclass SUST_1D_V2<string inst, NVPTXRegClass intype> { def _I : SUST_1D_V2_base<inst, intype, (ins i64imm:$s), []>; } -// int_nvvm_sust_b_1d_v2i8_clamp - -defm SUST_B_1D_V2I8_CLAMP : SUST_1D_V2<"sust.b.1d.v2.b8.clamp", B16>; -defm SUST_B_1D_V2I16_CLAMP : SUST_1D_V2<"sust.b.1d.v2.b16.clamp", B16>; -defm SUST_B_1D_V2I32_CLAMP : SUST_1D_V2<"sust.b.1d.v2.b32.clamp", B32>; -defm SUST_B_1D_V2I64_CLAMP : SUST_1D_V2<"sust.b.1d.v2.b64.clamp", B64>; - -defm SUST_B_1D_V2I8_TRAP : SUST_1D_V2<"sust.b.1d.v2.b8.trap", B16>; -defm SUST_B_1D_V2I16_TRAP : SUST_1D_V2<"sust.b.1d.v2.b16.trap", B16>; -defm SUST_B_1D_V2I32_TRAP : SUST_1D_V2<"sust.b.1d.v2.b32.trap", B32>; -defm SUST_B_1D_V2I64_TRAP : SUST_1D_V2<"sust.b.1d.v2.b64.trap", B64>; - -defm SUST_B_1D_V2I8_ZERO : SUST_1D_V2<"sust.b.1d.v2.b8.zero", B16>; -defm SUST_B_1D_V2I16_ZERO : SUST_1D_V2<"sust.b.1d.v2.b16.zero", B16>; -defm SUST_B_1D_V2I32_ZERO : SUST_1D_V2<"sust.b.1d.v2.b32.zero", B32>; -defm SUST_B_1D_V2I64_ZERO : SUST_1D_V2<"sust.b.1d.v2.b64.zero", B64>; - +foreach op = ["clamp", "trap", "zero"] in { + defvar op_upper = !toupper(op); + defm SUST_B_1D_V2I8_ # op_upper : SUST_1D_V2<"sust.b.1d.v2.b8." # op, B16>; + defm SUST_B_1D_V2I16_ # op_upper : SUST_1D_V2<"sust.b.1d.v2.b16." # op, B16>; + defm SUST_B_1D_V2I32_ # op_upper : SUST_1D_V2<"sust.b.1d.v2.b32." # op, B32>; + defm SUST_B_1D_V2I64_ # op_upper : SUST_1D_V2<"sust.b.1d.v2.b64." 
# op, B64>; +} defm SUST_P_1D_V2I8_TRAP : SUST_1D_V2<"sust.p.1d.v2.b8.trap", B16>; defm SUST_P_1D_V2I16_TRAP : SUST_1D_V2<"sust.p.1d.v2.b16.trap", B16>; defm SUST_P_1D_V2I32_TRAP : SUST_1D_V2<"sust.p.1d.v2.b32.trap", B32>; @@ -4103,17 +3970,12 @@ multiclass SUST_1D_V4<string inst, NVPTXRegClass intype> { def _I : SUST_1D_V4_base<inst, intype, (ins i64imm:$s), []>; } -defm SUST_B_1D_V4I8_CLAMP : SUST_1D_V4<"sust.b.1d.v4.b8.clamp", B16>; -defm SUST_B_1D_V4I16_CLAMP : SUST_1D_V4<"sust.b.1d.v4.b16.clamp", B16>; -defm SUST_B_1D_V4I32_CLAMP : SUST_1D_V4<"sust.b.1d.v4.b32.clamp", B32>; - -defm SUST_B_1D_V4I8_TRAP : SUST_1D_V4<"sust.b.1d.v4.b8.trap", B16>; -defm SUST_B_1D_V4I16_TRAP : SUST_1D_V4<"sust.b.1d.v4.b16.trap", B16>; -defm SUST_B_1D_V4I32_TRAP : SUST_1D_V4<"sust.b.1d.v4.b32.trap", B32>; - -defm SUST_B_1D_V4I8_ZERO : SUST_1D_V4<"sust.b.1d.v4.b8.zero", B16>; -defm SUST_B_1D_V4I16_ZERO : SUST_1D_V4<"sust.b.1d.v4.b16.zero", B16>; -defm SUST_B_1D_V4I32_ZERO : SUST_1D_V4<"sust.b.1d.v4.b32.zero", B32>; +foreach op = ["clamp", "trap", "zero"] in { + defvar op_upper = !toupper(op); + defm SUST_B_1D_V4I8_ # op_upper : SUST_1D_V4<"sust.b.1d.v4.b8." # op, B16>; + defm SUST_B_1D_V4I16_ # op_upper : SUST_1D_V4<"sust.b.1d.v4.b16." # op, B16>; + defm SUST_B_1D_V4I32_ # op_upper : SUST_1D_V4<"sust.b.1d.v4.b32." # op, B32>; +} defm SUST_P_1D_V4I8_TRAP : SUST_1D_V4<"sust.p.1d.v4.b8.trap", B16>; defm SUST_P_1D_V4I16_TRAP : SUST_1D_V4<"sust.p.1d.v4.b16.trap", B16>; @@ -4131,20 +3993,13 @@ multiclass SUST_1D_ARRAY<string inst, NVPTXRegClass intype> { def _I : SUST_1D_ARRAY_base<inst, intype, (ins i64imm:$s), []>; } -defm SUST_B_1D_ARRAY_I8_CLAMP : SUST_1D_ARRAY<"sust.b.a1d.b8.clamp", B16>; -defm SUST_B_1D_ARRAY_I16_CLAMP : SUST_1D_ARRAY<"sust.b.a1d.b16.clamp", B16>; -defm SUST_B_1D_ARRAY_I32_CLAMP : SUST_1D_ARRAY<"sust.b.a1d.b32.clamp", B32>; -defm SUST_B_1D_ARRAY_I64_CLAMP : SUST_1D_ARRAY<"sust.b.a1d.b64.clamp", B64>; - -defm SUST_B_1D_ARRAY_I8_TRAP : SUST_1D_ARRAY<"sust.b.a1d.b8.trap", B16>; -defm SUST_B_1D_ARRAY_I16_TRAP : SUST_1D_ARRAY<"sust.b.a1d.b16.trap", B16>; -defm SUST_B_1D_ARRAY_I32_TRAP : SUST_1D_ARRAY<"sust.b.a1d.b32.trap", B32>; -defm SUST_B_1D_ARRAY_I64_TRAP : SUST_1D_ARRAY<"sust.b.a1d.b64.trap", B64>; - -defm SUST_B_1D_ARRAY_I8_ZERO : SUST_1D_ARRAY<"sust.b.a1d.b8.zero", B16>; -defm SUST_B_1D_ARRAY_I16_ZERO : SUST_1D_ARRAY<"sust.b.a1d.b16.zero", B16>; -defm SUST_B_1D_ARRAY_I32_ZERO : SUST_1D_ARRAY<"sust.b.a1d.b32.zero", B32>; -defm SUST_B_1D_ARRAY_I64_ZERO : SUST_1D_ARRAY<"sust.b.a1d.b64.zero", B64>; +foreach op = ["clamp", "trap", "zero"] in { + defvar op_upper = !toupper(op); + defm SUST_B_1D_ARRAY_I8_ # op_upper : SUST_1D_ARRAY<"sust.b.a1d.b8." # op, B16>; + defm SUST_B_1D_ARRAY_I16_ # op_upper : SUST_1D_ARRAY<"sust.b.a1d.b16." # op, B16>; + defm SUST_B_1D_ARRAY_I32_ # op_upper : SUST_1D_ARRAY<"sust.b.a1d.b32." # op, B32>; + defm SUST_B_1D_ARRAY_I64_ # op_upper : SUST_1D_ARRAY<"sust.b.a1d.b64." 
# op, B64>; +} defm SUST_P_1D_ARRAY_I8_TRAP : SUST_1D_ARRAY<"sust.p.a1d.b8.trap", B16>; defm SUST_P_1D_ARRAY_I16_TRAP : SUST_1D_ARRAY<"sust.p.a1d.b16.trap", B16>; @@ -4164,20 +4019,13 @@ multiclass SUST_1D_ARRAY_V2<string inst, NVPTXRegClass intype> { def _I : SUST_1D_ARRAY_V2_base<inst, intype, (ins i64imm:$s), []>; } -defm SUST_B_1D_ARRAY_V2I8_CLAMP : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b8.clamp", B16>; -defm SUST_B_1D_ARRAY_V2I16_CLAMP : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b16.clamp", B16>; -defm SUST_B_1D_ARRAY_V2I32_CLAMP : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b32.clamp", B32>; -defm SUST_B_1D_ARRAY_V2I64_CLAMP : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b64.clamp", B64>; - -defm SUST_B_1D_ARRAY_V2I8_TRAP : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b8.trap", B16>; -defm SUST_B_1D_ARRAY_V2I16_TRAP : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b16.trap", B16>; -defm SUST_B_1D_ARRAY_V2I32_TRAP : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b32.trap", B32>; -defm SUST_B_1D_ARRAY_V2I64_TRAP : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b64.trap", B64>; - -defm SUST_B_1D_ARRAY_V2I8_ZERO : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b8.zero", B16>; -defm SUST_B_1D_ARRAY_V2I16_ZERO : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b16.zero", B16>; -defm SUST_B_1D_ARRAY_V2I32_ZERO : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b32.zero", B32>; -defm SUST_B_1D_ARRAY_V2I64_ZERO : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b64.zero", B64>; +foreach op = ["clamp", "trap", "zero"] in { + defvar op_upper = !toupper(op); + defm SUST_B_1D_ARRAY_V2I8_ # op_upper : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b8." # op, B16>; + defm SUST_B_1D_ARRAY_V2I16_ # op_upper : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b16." # op, B16>; + defm SUST_B_1D_ARRAY_V2I32_ # op_upper : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b32." # op, B32>; + defm SUST_B_1D_ARRAY_V2I64_ # op_upper : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b64." # op, B64>; +} defm SUST_P_1D_ARRAY_V2I8_TRAP : SUST_1D_ARRAY_V2<"sust.p.a1d.v2.b8.trap", B16>; defm SUST_P_1D_ARRAY_V2I16_TRAP : SUST_1D_ARRAY_V2<"sust.p.a1d.v2.b16.trap", B16>; @@ -4197,33 +4045,16 @@ multiclass SUST_1D_ARRAY_V4<string inst, NVPTXRegClass intype> { def _I : SUST_1D_ARRAY_V4_base<inst, intype, (ins i64imm:$s), []>; } -defm SUST_B_1D_ARRAY_V4I8_CLAMP - : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b8.clamp", B16>; -defm SUST_B_1D_ARRAY_V4I16_CLAMP - : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b16.clamp", B16>; -defm SUST_B_1D_ARRAY_V4I32_CLAMP - : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b32.clamp", B32>; - -defm SUST_B_1D_ARRAY_V4I8_TRAP - : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b8.trap", B16>; -defm SUST_B_1D_ARRAY_V4I16_TRAP - : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b16.trap", B16>; -defm SUST_B_1D_ARRAY_V4I32_TRAP - : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b32.trap", B32>; - -defm SUST_B_1D_ARRAY_V4I8_ZERO - : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b8.zero", B16>; -defm SUST_B_1D_ARRAY_V4I16_ZERO - : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b16.zero", B16>; -defm SUST_B_1D_ARRAY_V4I32_ZERO - : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b32.zero", B32>; - -defm SUST_P_1D_ARRAY_V4I8_TRAP - : SUST_1D_ARRAY_V4<"sust.p.a1d.v4.b8.trap", B16>; -defm SUST_P_1D_ARRAY_V4I16_TRAP - : SUST_1D_ARRAY_V4<"sust.p.a1d.v4.b16.trap", B16>; -defm SUST_P_1D_ARRAY_V4I32_TRAP - : SUST_1D_ARRAY_V4<"sust.p.a1d.v4.b32.trap", B32>; +foreach op = ["clamp", "trap", "zero"] in { + defvar op_upper = !toupper(op); + defm SUST_B_1D_ARRAY_V4I8_ # op_upper : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b8." # op, B16>; + defm SUST_B_1D_ARRAY_V4I16_ # op_upper : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b16." # op, B16>; + defm SUST_B_1D_ARRAY_V4I32_ # op_upper : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b32." 
# op, B32>; +} + +defm SUST_P_1D_ARRAY_V4I8_TRAP : SUST_1D_ARRAY_V4<"sust.p.a1d.v4.b8.trap", B16>; +defm SUST_P_1D_ARRAY_V4I16_TRAP : SUST_1D_ARRAY_V4<"sust.p.a1d.v4.b16.trap", B16>; +defm SUST_P_1D_ARRAY_V4I32_TRAP : SUST_1D_ARRAY_V4<"sust.p.a1d.v4.b32.trap", B32>; class SUST_2D_base<string inst, NVPTXRegClass intype, dag surf, list<dag> pat> : NVPTXInst<(outs), @@ -4237,20 +4068,13 @@ multiclass SUST_2D<string inst, NVPTXRegClass intype> { def _I : SUST_2D_base<inst, intype, (ins i64imm:$s), []>; } -defm SUST_B_2D_I8_CLAMP : SUST_2D<"sust.b.2d.b8.clamp", B16>; -defm SUST_B_2D_I16_CLAMP : SUST_2D<"sust.b.2d.b16.clamp", B16>; -defm SUST_B_2D_I32_CLAMP : SUST_2D<"sust.b.2d.b32.clamp", B32>; -defm SUST_B_2D_I64_CLAMP : SUST_2D<"sust.b.2d.b64.clamp", B64>; - -defm SUST_B_2D_I8_TRAP : SUST_2D<"sust.b.2d.b8.trap", B16>; -defm SUST_B_2D_I16_TRAP : SUST_2D<"sust.b.2d.b16.trap", B16>; -defm SUST_B_2D_I32_TRAP : SUST_2D<"sust.b.2d.b32.trap", B32>; -defm SUST_B_2D_I64_TRAP : SUST_2D<"sust.b.2d.b64.trap", B64>; - -defm SUST_B_2D_I8_ZERO : SUST_2D<"sust.b.2d.b8.zero", B16>; -defm SUST_B_2D_I16_ZERO : SUST_2D<"sust.b.2d.b16.zero", B16>; -defm SUST_B_2D_I32_ZERO : SUST_2D<"sust.b.2d.b32.zero", B32>; -defm SUST_B_2D_I64_ZERO : SUST_2D<"sust.b.2d.b64.zero", B64>; +foreach op = ["clamp", "trap", "zero"] in { + defvar op_upper = !toupper(op); + defm SUST_B_2D_I8_ # op_upper : SUST_2D<"sust.b.2d.b8." # op, B16>; + defm SUST_B_2D_I16_ # op_upper : SUST_2D<"sust.b.2d.b16." # op, B16>; + defm SUST_B_2D_I32_ # op_upper : SUST_2D<"sust.b.2d.b32." # op, B32>; + defm SUST_B_2D_I64_ # op_upper : SUST_2D<"sust.b.2d.b64." # op, B64>; +} defm SUST_P_2D_I8_TRAP : SUST_2D<"sust.p.2d.b8.trap", B16>; defm SUST_P_2D_I16_TRAP : SUST_2D<"sust.p.2d.b16.trap", B16>; @@ -4270,20 +4094,13 @@ multiclass SUST_2D_V2<string inst, NVPTXRegClass intype> { def _I : SUST_2D_V2_base<inst, intype, (ins i64imm:$s), []>; } -defm SUST_B_2D_V2I8_CLAMP : SUST_2D_V2<"sust.b.2d.v2.b8.clamp", B16>; -defm SUST_B_2D_V2I16_CLAMP : SUST_2D_V2<"sust.b.2d.v2.b16.clamp", B16>; -defm SUST_B_2D_V2I32_CLAMP : SUST_2D_V2<"sust.b.2d.v2.b32.clamp", B32>; -defm SUST_B_2D_V2I64_CLAMP : SUST_2D_V2<"sust.b.2d.v2.b64.clamp", B64>; - -defm SUST_B_2D_V2I8_TRAP : SUST_2D_V2<"sust.b.2d.v2.b8.trap", B16>; -defm SUST_B_2D_V2I16_TRAP : SUST_2D_V2<"sust.b.2d.v2.b16.trap", B16>; -defm SUST_B_2D_V2I32_TRAP : SUST_2D_V2<"sust.b.2d.v2.b32.trap", B32>; -defm SUST_B_2D_V2I64_TRAP : SUST_2D_V2<"sust.b.2d.v2.b64.trap", B64>; - -defm SUST_B_2D_V2I8_ZERO : SUST_2D_V2<"sust.b.2d.v2.b8.zero", B16>; -defm SUST_B_2D_V2I16_ZERO : SUST_2D_V2<"sust.b.2d.v2.b16.zero", B16>; -defm SUST_B_2D_V2I32_ZERO : SUST_2D_V2<"sust.b.2d.v2.b32.zero", B32>; -defm SUST_B_2D_V2I64_ZERO : SUST_2D_V2<"sust.b.2d.v2.b64.zero", B64>; +foreach op = ["clamp", "trap", "zero"] in { + defvar op_upper = !toupper(op); + defm SUST_B_2D_V2I8_ # op_upper : SUST_2D_V2<"sust.b.2d.v2.b8." # op, B16>; + defm SUST_B_2D_V2I16_ # op_upper : SUST_2D_V2<"sust.b.2d.v2.b16." # op, B16>; + defm SUST_B_2D_V2I32_ # op_upper : SUST_2D_V2<"sust.b.2d.v2.b32." # op, B32>; + defm SUST_B_2D_V2I64_ # op_upper : SUST_2D_V2<"sust.b.2d.v2.b64." 
# op, B64>; +} defm SUST_P_2D_V2I8_TRAP : SUST_2D_V2<"sust.p.2d.v2.b8.trap", B16>; defm SUST_P_2D_V2I16_TRAP : SUST_2D_V2<"sust.p.2d.v2.b16.trap", B16>; @@ -4303,17 +4120,12 @@ multiclass SUST_2D_V4<string inst, NVPTXRegClass intype> { def _I : SUST_2D_V4_base<inst, intype, (ins i64imm:$s), []>; } -defm SUST_B_2D_V4I8_CLAMP : SUST_2D_V4<"sust.b.2d.v4.b8.clamp", B16>; -defm SUST_B_2D_V4I16_CLAMP : SUST_2D_V4<"sust.b.2d.v4.b16.clamp", B16>; -defm SUST_B_2D_V4I32_CLAMP : SUST_2D_V4<"sust.b.2d.v4.b32.clamp", B32>; - -defm SUST_B_2D_V4I8_TRAP : SUST_2D_V4<"sust.b.2d.v4.b8.trap", B16>; -defm SUST_B_2D_V4I16_TRAP : SUST_2D_V4<"sust.b.2d.v4.b16.trap", B16>; -defm SUST_B_2D_V4I32_TRAP : SUST_2D_V4<"sust.b.2d.v4.b32.trap", B32>; - -defm SUST_B_2D_V4I8_ZERO : SUST_2D_V4<"sust.b.2d.v4.b8.zero", B16>; -defm SUST_B_2D_V4I16_ZERO : SUST_2D_V4<"sust.b.2d.v4.b16.zero", B16>; -defm SUST_B_2D_V4I32_ZERO : SUST_2D_V4<"sust.b.2d.v4.b32.zero", B32>; +foreach op = ["clamp", "trap", "zero"] in { + defvar op_upper = !toupper(op); + defm SUST_B_2D_V4I8_ # op_upper : SUST_2D_V4<"sust.b.2d.v4.b8." # op, B16>; + defm SUST_B_2D_V4I16_ # op_upper : SUST_2D_V4<"sust.b.2d.v4.b16." # op, B16>; + defm SUST_B_2D_V4I32_ # op_upper : SUST_2D_V4<"sust.b.2d.v4.b32." # op, B32>; +} defm SUST_P_2D_V4I8_TRAP : SUST_2D_V4<"sust.p.2d.v4.b8.trap", B16>; defm SUST_P_2D_V4I16_TRAP : SUST_2D_V4<"sust.p.2d.v4.b16.trap", B16>; @@ -4333,20 +4145,13 @@ multiclass SUST_2D_ARRAY<string inst, NVPTXRegClass intype> { def _I : SUST_2D_ARRAY_base<inst, intype, (ins i64imm:$s), []>; } -defm SUST_B_2D_ARRAY_I8_CLAMP : SUST_2D_ARRAY<"sust.b.a2d.b8.clamp", B16>; -defm SUST_B_2D_ARRAY_I16_CLAMP : SUST_2D_ARRAY<"sust.b.a2d.b16.clamp", B16>; -defm SUST_B_2D_ARRAY_I32_CLAMP : SUST_2D_ARRAY<"sust.b.a2d.b32.clamp", B32>; -defm SUST_B_2D_ARRAY_I64_CLAMP : SUST_2D_ARRAY<"sust.b.a2d.b64.clamp", B64>; - -defm SUST_B_2D_ARRAY_I8_TRAP : SUST_2D_ARRAY<"sust.b.a2d.b8.trap", B16>; -defm SUST_B_2D_ARRAY_I16_TRAP : SUST_2D_ARRAY<"sust.b.a2d.b16.trap", B16>; -defm SUST_B_2D_ARRAY_I32_TRAP : SUST_2D_ARRAY<"sust.b.a2d.b32.trap", B32>; -defm SUST_B_2D_ARRAY_I64_TRAP : SUST_2D_ARRAY<"sust.b.a2d.b64.trap", B64>; - -defm SUST_B_2D_ARRAY_I8_ZERO : SUST_2D_ARRAY<"sust.b.a2d.b8.zero", B16>; -defm SUST_B_2D_ARRAY_I16_ZERO : SUST_2D_ARRAY<"sust.b.a2d.b16.zero", B16>; -defm SUST_B_2D_ARRAY_I32_ZERO : SUST_2D_ARRAY<"sust.b.a2d.b32.zero", B32>; -defm SUST_B_2D_ARRAY_I64_ZERO : SUST_2D_ARRAY<"sust.b.a2d.b64.zero", B64>; +foreach op = ["clamp", "trap", "zero"] in { + defvar op_upper = !toupper(op); + defm SUST_B_2D_ARRAY_I8_ # op_upper : SUST_2D_ARRAY<"sust.b.a2d.b8." # op, B16>; + defm SUST_B_2D_ARRAY_I16_ # op_upper : SUST_2D_ARRAY<"sust.b.a2d.b16." # op, B16>; + defm SUST_B_2D_ARRAY_I32_ # op_upper : SUST_2D_ARRAY<"sust.b.a2d.b32." # op, B32>; + defm SUST_B_2D_ARRAY_I64_ # op_upper : SUST_2D_ARRAY<"sust.b.a2d.b64." 
# op, B64>; +} defm SUST_P_2D_ARRAY_I8_TRAP : SUST_2D_ARRAY<"sust.p.a2d.b8.trap", B16>; defm SUST_P_2D_ARRAY_I16_TRAP : SUST_2D_ARRAY<"sust.p.a2d.b16.trap", B16>; @@ -4366,20 +4171,13 @@ multiclass SUST_2D_ARRAY_V2<string inst, NVPTXRegClass intype> { def _I : SUST_2D_ARRAY_V2_base<inst, intype, (ins i64imm:$s), []>; } -defm SUST_B_2D_ARRAY_V2I8_CLAMP : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b8.clamp", B16>; -defm SUST_B_2D_ARRAY_V2I16_CLAMP : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b16.clamp", B16>; -defm SUST_B_2D_ARRAY_V2I32_CLAMP : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b32.clamp", B32>; -defm SUST_B_2D_ARRAY_V2I64_CLAMP : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b64.clamp", B64>; - -defm SUST_B_2D_ARRAY_V2I8_TRAP : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b8.trap", B16>; -defm SUST_B_2D_ARRAY_V2I16_TRAP : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b16.trap", B16>; -defm SUST_B_2D_ARRAY_V2I32_TRAP : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b32.trap", B32>; -defm SUST_B_2D_ARRAY_V2I64_TRAP : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b64.trap", B64>; - -defm SUST_B_2D_ARRAY_V2I8_ZERO : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b8.zero", B16>; -defm SUST_B_2D_ARRAY_V2I16_ZERO : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b16.zero", B16>; -defm SUST_B_2D_ARRAY_V2I32_ZERO : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b32.zero", B32>; -defm SUST_B_2D_ARRAY_V2I64_ZERO : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b64.zero", B64>; +foreach op = ["clamp", "trap", "zero"] in { + defvar op_upper = !toupper(op); + defm SUST_B_2D_ARRAY_V2I8_ # op_upper : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b8." # op, B16>; + defm SUST_B_2D_ARRAY_V2I16_ # op_upper : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b16." # op, B16>; + defm SUST_B_2D_ARRAY_V2I32_ # op_upper : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b32." # op, B32>; + defm SUST_B_2D_ARRAY_V2I64_ # op_upper : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b64." # op, B64>; +} defm SUST_P_2D_ARRAY_V2I8_TRAP : SUST_2D_ARRAY_V2<"sust.p.a2d.v2.b8.trap", B16>; defm SUST_P_2D_ARRAY_V2I16_TRAP : SUST_2D_ARRAY_V2<"sust.p.a2d.v2.b16.trap", B16>; @@ -4399,17 +4197,12 @@ multiclass SUST_2D_ARRAY_V4<string inst, NVPTXRegClass intype> { def _I : SUST_2D_ARRAY_V4_base<inst, intype, (ins i64imm:$s), []>; } -defm SUST_B_2D_ARRAY_V4I8_CLAMP : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b8.clamp", B16>; -defm SUST_B_2D_ARRAY_V4I16_CLAMP : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b16.clamp", B16>; -defm SUST_B_2D_ARRAY_V4I32_CLAMP : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b32.clamp", B32>; - -defm SUST_B_2D_ARRAY_V4I8_TRAP : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b8.trap", B16>; -defm SUST_B_2D_ARRAY_V4I16_TRAP : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b16.trap", B16>; -defm SUST_B_2D_ARRAY_V4I32_TRAP : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b32.trap", B32>; - -defm SUST_B_2D_ARRAY_V4I8_ZERO : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b8.zero", B16>; -defm SUST_B_2D_ARRAY_V4I16_ZERO : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b16.zero", B16>; -defm SUST_B_2D_ARRAY_V4I32_ZERO : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b32.zero", B32>; +foreach op = ["clamp", "trap", "zero"] in { + defvar op_upper = !toupper(op); + defm SUST_B_2D_ARRAY_V4I8_ # op_upper : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b8." # op, B16>; + defm SUST_B_2D_ARRAY_V4I16_ # op_upper : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b16." # op, B16>; + defm SUST_B_2D_ARRAY_V4I32_ # op_upper : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b32." 
# op, B32>; +} defm SUST_P_2D_ARRAY_V4I8_TRAP : SUST_2D_ARRAY_V4<"sust.p.a2d.v4.b8.trap", B16>; defm SUST_P_2D_ARRAY_V4I16_TRAP : SUST_2D_ARRAY_V4<"sust.p.a2d.v4.b16.trap", B16>; @@ -4429,21 +4222,13 @@ multiclass SUST_3D<string inst, NVPTXRegClass intype> { def _I : SUST_3D_base<inst, intype, (ins i64imm:$s), []>; } -defm SUST_B_3D_I8_CLAMP : SUST_3D<"sust.b.3d.b8.clamp", B16>; -defm SUST_B_3D_I16_CLAMP : SUST_3D<"sust.b.3d.b16.clamp", B16>; -defm SUST_B_3D_I32_CLAMP : SUST_3D<"sust.b.3d.b32.clamp", B32>; -defm SUST_B_3D_I64_CLAMP : SUST_3D<"sust.b.3d.b64.clamp", B64>; - -defm SUST_B_3D_I8_TRAP : SUST_3D<"sust.b.3d.b8.trap", B16>; -defm SUST_B_3D_I16_TRAP : SUST_3D<"sust.b.3d.b16.trap", B16>; -defm SUST_B_3D_I32_TRAP : SUST_3D<"sust.b.3d.b32.trap", B32>; -defm SUST_B_3D_I64_TRAP : SUST_3D<"sust.b.3d.b64.trap", B64>; - -defm SUST_B_3D_I8_ZERO : SUST_3D<"sust.b.3d.b8.zero", B16>; -defm SUST_B_3D_I16_ZERO : SUST_3D<"sust.b.3d.b16.zero", B16>; -defm SUST_B_3D_I32_ZERO : SUST_3D<"sust.b.3d.b32.zero", B32>; -defm SUST_B_3D_I64_ZERO : SUST_3D<"sust.b.3d.b64.zero", B64>; - +foreach op = ["clamp", "trap", "zero"] in { + defvar op_upper = !toupper(op); + defm SUST_B_3D_I8_ # op_upper : SUST_3D<"sust.b.3d.b8." # op, B16>; + defm SUST_B_3D_I16_ # op_upper : SUST_3D<"sust.b.3d.b16." # op, B16>; + defm SUST_B_3D_I32_ # op_upper : SUST_3D<"sust.b.3d.b32." # op, B32>; + defm SUST_B_3D_I64_ # op_upper : SUST_3D<"sust.b.3d.b64." # op, B64>; +} defm SUST_P_3D_I8_TRAP : SUST_3D<"sust.p.3d.b8.trap", B16>; defm SUST_P_3D_I16_TRAP : SUST_3D<"sust.p.3d.b16.trap", B16>; defm SUST_P_3D_I32_TRAP : SUST_3D<"sust.p.3d.b32.trap", B32>; @@ -4462,21 +4247,13 @@ multiclass SUST_3D_V2<string inst, NVPTXRegClass intype> { def _I : SUST_3D_V2_base<inst, intype, (ins i64imm:$s), []>; } -defm SUST_B_3D_V2I8_CLAMP : SUST_3D_V2<"sust.b.3d.v2.b8.clamp", B16>; -defm SUST_B_3D_V2I16_CLAMP : SUST_3D_V2<"sust.b.3d.v2.b16.clamp", B16>; -defm SUST_B_3D_V2I32_CLAMP : SUST_3D_V2<"sust.b.3d.v2.b32.clamp", B32>; -defm SUST_B_3D_V2I64_CLAMP : SUST_3D_V2<"sust.b.3d.v2.b64.clamp", B64>; - -defm SUST_B_3D_V2I8_TRAP : SUST_3D_V2<"sust.b.3d.v2.b8.trap", B16>; -defm SUST_B_3D_V2I16_TRAP : SUST_3D_V2<"sust.b.3d.v2.b16.trap", B16>; -defm SUST_B_3D_V2I32_TRAP : SUST_3D_V2<"sust.b.3d.v2.b32.trap", B32>; -defm SUST_B_3D_V2I64_TRAP : SUST_3D_V2<"sust.b.3d.v2.b64.trap", B64>; - -defm SUST_B_3D_V2I8_ZERO : SUST_3D_V2<"sust.b.3d.v2.b8.zero", B16>; -defm SUST_B_3D_V2I16_ZERO : SUST_3D_V2<"sust.b.3d.v2.b16.zero", B16>; -defm SUST_B_3D_V2I32_ZERO : SUST_3D_V2<"sust.b.3d.v2.b32.zero", B32>; -defm SUST_B_3D_V2I64_ZERO : SUST_3D_V2<"sust.b.3d.v2.b64.zero", B64>; - +foreach op = ["clamp", "trap", "zero"] in { + defvar op_upper = !toupper(op); + defm SUST_B_3D_V2I8_ # op_upper : SUST_3D_V2<"sust.b.3d.v2.b8." # op, B16>; + defm SUST_B_3D_V2I16_ # op_upper : SUST_3D_V2<"sust.b.3d.v2.b16." # op, B16>; + defm SUST_B_3D_V2I32_ # op_upper : SUST_3D_V2<"sust.b.3d.v2.b32." # op, B32>; + defm SUST_B_3D_V2I64_ # op_upper : SUST_3D_V2<"sust.b.3d.v2.b64." 
# op, B64>; +} defm SUST_P_3D_V2I8_TRAP : SUST_3D_V2<"sust.p.3d.v2.b8.trap", B16>; defm SUST_P_3D_V2I16_TRAP : SUST_3D_V2<"sust.p.3d.v2.b16.trap", B16>; defm SUST_P_3D_V2I32_TRAP : SUST_3D_V2<"sust.p.3d.v2.b32.trap", B32>; @@ -4495,17 +4272,12 @@ multiclass SUST_3D_V4<string inst, NVPTXRegClass intype> { def _I : SUST_3D_V4_base<inst, intype, (ins i64imm:$s), []>; } -defm SUST_B_3D_V4I8_CLAMP : SUST_3D_V4<"sust.b.3d.v4.b8.clamp", B16>; -defm SUST_B_3D_V4I16_CLAMP : SUST_3D_V4<"sust.b.3d.v4.b16.clamp", B16>; -defm SUST_B_3D_V4I32_CLAMP : SUST_3D_V4<"sust.b.3d.v4.b32.clamp", B32>; - -defm SUST_B_3D_V4I8_TRAP : SUST_3D_V4<"sust.b.3d.v4.b8.trap", B16>; -defm SUST_B_3D_V4I16_TRAP : SUST_3D_V4<"sust.b.3d.v4.b16.trap", B16>; -defm SUST_B_3D_V4I32_TRAP : SUST_3D_V4<"sust.b.3d.v4.b32.trap", B32>; - -defm SUST_B_3D_V4I8_ZERO : SUST_3D_V4<"sust.b.3d.v4.b8.zero", B16>; -defm SUST_B_3D_V4I16_ZERO : SUST_3D_V4<"sust.b.3d.v4.b16.zero", B16>; -defm SUST_B_3D_V4I32_ZERO : SUST_3D_V4<"sust.b.3d.v4.b32.zero", B32>; +foreach op = ["clamp", "trap", "zero"] in { + defvar op_upper = !toupper(op); + defm SUST_B_3D_V4I8_ # op_upper : SUST_3D_V4<"sust.b.3d.v4.b8." # op, B16>; + defm SUST_B_3D_V4I16_ # op_upper : SUST_3D_V4<"sust.b.3d.v4.b16." # op, B16>; + defm SUST_B_3D_V4I32_ # op_upper : SUST_3D_V4<"sust.b.3d.v4.b32." # op, B32>; +} defm SUST_P_3D_V4I8_TRAP : SUST_3D_V4<"sust.p.3d.v4.b8.trap", B16>; defm SUST_P_3D_V4I16_TRAP : SUST_3D_V4<"sust.p.3d.v4.b16.trap", B16>; @@ -5122,27 +4894,23 @@ defm INT_SET_MAXNREG_DEC : SET_MAXNREG<"dec", int_nvvm_setmaxnreg_dec_sync_align // // WGMMA fence instructions // -let isConvergent = true in { -def INT_NVVM_WGMMA_FENCE_SYNC_ALIGNED : BasicNVPTXInst<(outs), (ins), "wgmma.fence.sync.aligned", - [(int_nvvm_wgmma_fence_sync_aligned)]>, Requires<[hasSM90a, hasPTX<80>]>; +let isConvergent = true, Predicates = [hasSM90a, hasPTX<80>] in { + def WGMMA_FENCE_SYNC_ALIGNED : NullaryInst<"wgmma.fence.sync.aligned", int_nvvm_wgmma_fence_sync_aligned>; -def INT_NVVM_WGMMA_COMMIT_GROUP_SYNC_ALIGNED : BasicNVPTXInst<(outs), (ins), "wgmma.commit_group.sync.aligned", - [(int_nvvm_wgmma_commit_group_sync_aligned)]>, Requires<[hasSM90a, hasPTX<80>]>; + def WGMMA_COMMIT_GROUP_SYNC_ALIGNED : NullaryInst<"wgmma.commit_group.sync.aligned", int_nvvm_wgmma_commit_group_sync_aligned>; -def INT_NVVM_WGMMA_WAIT_GROUP_SYNC_ALIGNED : BasicNVPTXInst<(outs), (ins i64imm:$n), "wgmma.wait_group.sync.aligned", - [(int_nvvm_wgmma_wait_group_sync_aligned timm:$n)]>, Requires<[hasSM90a, hasPTX<80>]>; -} // isConvergent = true + def WGMMA_WAIT_GROUP_SYNC_ALIGNED : BasicNVPTXInst<(outs), (ins i64imm:$n), "wgmma.wait_group.sync.aligned", + [(int_nvvm_wgmma_wait_group_sync_aligned timm:$n)]>; +} let Predicates = [hasSM<90>, hasPTX<78>] in { def GRIDDEPCONTROL_LAUNCH_DEPENDENTS : - BasicNVPTXInst<(outs), (ins), "griddepcontrol.launch_dependents", - [(int_nvvm_griddepcontrol_launch_dependents)]>; + NullaryInst<"griddepcontrol.launch_dependents", int_nvvm_griddepcontrol_launch_dependents>; def GRIDDEPCONTROL_WAIT : - BasicNVPTXInst<(outs), (ins), "griddepcontrol.wait", - [(int_nvvm_griddepcontrol_wait)]>; + NullaryInst<"griddepcontrol.wait", int_nvvm_griddepcontrol_wait>; } -def INT_EXIT : BasicNVPTXInst<(outs), (ins), "exit", [(int_nvvm_exit)]>; +def EXIT : NullaryInst<"exit", int_nvvm_exit>; // Tcgen05 intrinsics let isConvergent = true, Predicates = [hasTcgen05Instructions] in { @@ -5170,9 +4938,7 @@ defm TCGEN05_DEALLOC_CG1: TCGEN05_DEALLOC_INTR<"1", int_nvvm_tcgen05_dealloc_cg1 defm TCGEN05_DEALLOC_CG2: 
TCGEN05_DEALLOC_INTR<"2", int_nvvm_tcgen05_dealloc_cg2>; multiclass TCGEN05_RELINQ_PERMIT_INTR<string num, Intrinsic Intr> { - def "" : BasicNVPTXInst<(outs), (ins), - "tcgen05.relinquish_alloc_permit.cta_group::" # num # ".sync.aligned", - [(Intr)]>; + def "" : NullaryInst<"tcgen05.relinquish_alloc_permit.cta_group::" # num # ".sync.aligned", Intr>; } defm TCGEN05_RELINQ_CG1: TCGEN05_RELINQ_PERMIT_INTR<"1", int_nvvm_tcgen05_relinq_alloc_permit_cg1>; defm TCGEN05_RELINQ_CG2: TCGEN05_RELINQ_PERMIT_INTR<"2", int_nvvm_tcgen05_relinq_alloc_permit_cg2>; diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp index f4362fe..e2bbe57 100644 --- a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp @@ -412,6 +412,22 @@ static void adjustByValArgAlignment(Argument *Arg, Value *ArgInParamAS, } } +// Create a call to the nvvm_internal_addrspace_wrap intrinsic and set the +// alignment of the return value based on the alignment of the argument. +static CallInst *createNVVMInternalAddrspaceWrap(IRBuilder<> &IRB, + Argument &Arg) { + CallInst *ArgInParam = + IRB.CreateIntrinsic(Intrinsic::nvvm_internal_addrspace_wrap, + {IRB.getPtrTy(ADDRESS_SPACE_PARAM), Arg.getType()}, + &Arg, {}, Arg.getName() + ".param"); + + if (MaybeAlign ParamAlign = Arg.getParamAlign()) + ArgInParam->addRetAttr( + Attribute::getWithAlignment(ArgInParam->getContext(), *ParamAlign)); + + return ArgInParam; +} + namespace { struct ArgUseChecker : PtrUseVisitor<ArgUseChecker> { using Base = PtrUseVisitor<ArgUseChecker>; @@ -515,10 +531,7 @@ void copyByValParam(Function &F, Argument &Arg) { Arg.getParamAlign().value_or(DL.getPrefTypeAlign(StructType))); Arg.replaceAllUsesWith(AllocA); - Value *ArgInParam = - IRB.CreateIntrinsic(Intrinsic::nvvm_internal_addrspace_wrap, - {IRB.getPtrTy(ADDRESS_SPACE_PARAM), Arg.getType()}, - &Arg, {}, Arg.getName()); + CallInst *ArgInParam = createNVVMInternalAddrspaceWrap(IRB, Arg); // Be sure to propagate alignment to this load; LLVM doesn't know that NVPTX // addrspacecast preserves alignment. Since params are constant, this load @@ -549,9 +562,7 @@ static void handleByValParam(const NVPTXTargetMachine &TM, Argument *Arg) { SmallVector<Use *, 16> UsesToUpdate(llvm::make_pointer_range(Arg->uses())); IRBuilder<> IRB(&*FirstInst); - Value *ArgInParamAS = IRB.CreateIntrinsic( - Intrinsic::nvvm_internal_addrspace_wrap, - {IRB.getPtrTy(ADDRESS_SPACE_PARAM), Arg->getType()}, {Arg}); + CallInst *ArgInParamAS = createNVVMInternalAddrspaceWrap(IRB, *Arg); for (Use *U : UsesToUpdate) convertToParamAS(U, ArgInParamAS, HasCvtaParam, IsGridConstant); @@ -581,10 +592,7 @@ static void handleByValParam(const NVPTXTargetMachine &TM, Argument *Arg) { // argument already in the param address space, we need to use the noop // intrinsic, this had the added benefit of preventing other optimizations // from folding away this pair of addrspacecasts. - auto *ParamSpaceArg = - IRB.CreateIntrinsic(Intrinsic::nvvm_internal_addrspace_wrap, - {IRB.getPtrTy(ADDRESS_SPACE_PARAM), Arg->getType()}, - Arg, {}, Arg->getName() + ".param"); + auto *ParamSpaceArg = createNVVMInternalAddrspaceWrap(IRB, *Arg); // Cast param address to generic address space. 
Value *GenericArg = IRB.CreateAddrSpaceCast( diff --git a/llvm/lib/Target/PowerPC/PPCCTRLoopsVerify.cpp b/llvm/lib/Target/PowerPC/PPCCTRLoopsVerify.cpp index 46aa27e..c8e576f 100644 --- a/llvm/lib/Target/PowerPC/PPCCTRLoopsVerify.cpp +++ b/llvm/lib/Target/PowerPC/PPCCTRLoopsVerify.cpp @@ -93,7 +93,7 @@ static bool clobbersCTR(const MachineInstr &MI) { static bool verifyCTRBranch(MachineBasicBlock *MBB, MachineBasicBlock::iterator I) { MachineBasicBlock::iterator BI = I; - SmallSet<MachineBasicBlock *, 16> Visited; + SmallPtrSet<MachineBasicBlock *, 16> Visited; SmallVector<MachineBasicBlock *, 8> Preds; bool CheckPreds; diff --git a/llvm/lib/Target/PowerPC/PPCCallingConv.h b/llvm/lib/Target/PowerPC/PPCCallingConv.h index ab61472..9c47142 100644 --- a/llvm/lib/Target/PowerPC/PPCCallingConv.h +++ b/llvm/lib/Target/PowerPC/PPCCallingConv.h @@ -21,28 +21,29 @@ namespace llvm { bool RetCC_PPC(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &State); + Type *OrigTy, CCState &State); bool RetCC_PPC64_ELF_FIS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &State); + Type *OrigTy, CCState &State); bool RetCC_PPC_Cold(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &State); + Type *OrigTy, CCState &State); bool CC_PPC32_SVR4(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &State); + Type *OrigTy, CCState &State); bool CC_PPC64_ELF(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &State); + Type *OrigTy, CCState &State); bool CC_PPC64_ELF_FIS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &State); + Type *OrigTy, CCState &State); bool CC_PPC32_SVR4_ByVal(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &State); + Type *OrigTy, CCState &State); bool CC_PPC32_SVR4_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State); + ISD::ArgFlagsTy ArgFlags, Type *OrigTy, + CCState &State); } // End llvm namespace diff --git a/llvm/lib/Target/PowerPC/PPCFastISel.cpp b/llvm/lib/Target/PowerPC/PPCFastISel.cpp index e92e00f..0b68ba1 100644 --- a/llvm/lib/Target/PowerPC/PPCFastISel.cpp +++ b/llvm/lib/Target/PowerPC/PPCFastISel.cpp @@ -1374,7 +1374,10 @@ bool PPCFastISel::processCallArgs(SmallVectorImpl<Value *> &Args, unsigned LinkageSize = Subtarget->getFrameLowering()->getLinkageSize(); CCInfo.AllocateStack(LinkageSize, Align(8)); - CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, CC_PPC64_ELF_FIS); + SmallVector<Type *, 16> ArgTys; + for (Value *Arg : Args) + ArgTys.push_back(Arg->getType()); + CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, ArgTys, CC_PPC64_ELF_FIS); // Bail out if we can't handle any of the arguments. 
for (const CCValAssign &VA : ArgLocs) { @@ -1487,7 +1490,7 @@ bool PPCFastISel::finishCall(MVT RetVT, CallLoweringInfo &CLI, unsigned &NumByte if (RetVT != MVT::isVoid) { SmallVector<CCValAssign, 16> RVLocs; CCState CCInfo(CC, false, *FuncInfo.MF, RVLocs, *Context); - CCInfo.AnalyzeCallResult(RetVT, RetCC_PPC64_ELF_FIS); + CCInfo.AnalyzeCallResult(RetVT, CLI.RetTy, RetCC_PPC64_ELF_FIS); CCValAssign &VA = RVLocs[0]; assert(RVLocs.size() == 1 && "No support for multi-reg return values!"); assert(VA.isRegLoc() && "Can only return in registers!"); @@ -1573,7 +1576,7 @@ bool PPCFastISel::fastLowerCall(CallLoweringInfo &CLI) { RetVT != MVT::f64) { SmallVector<CCValAssign, 16> RVLocs; CCState CCInfo(CC, IsVarArg, *FuncInfo.MF, RVLocs, *Context); - CCInfo.AnalyzeCallResult(RetVT, RetCC_PPC64_ELF_FIS); + CCInfo.AnalyzeCallResult(RetVT, RetTy, RetCC_PPC64_ELF_FIS); if (RVLocs.size() > 1) return false; } diff --git a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp index c0860fc..2ad3ed2 100644 --- a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -2078,8 +2078,10 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF, // tail call might not be in the new RestoreBlock, so real branch instruction // won't be generated by emitEpilogue(), because shrink-wrap has chosen new // RestoreBlock. So we handle this case here. - if (MFI.getSavePoint() && MFI.hasTailCall()) { - MachineBasicBlock *RestoreBlock = MFI.getRestorePoint(); + if (!MFI.getSavePoints().empty() && MFI.hasTailCall()) { + assert(MFI.getRestorePoints().size() < 2 && + "MFI can't contain multiple restore points!"); + MachineBasicBlock *RestoreBlock = MFI.getRestorePoints().front(); for (MachineBasicBlock &MBB : MF) { if (MBB.isReturnBlock() && (&MBB) != RestoreBlock) createTailCallBranchInstr(MBB); diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 2698bd6..652edd4 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -1787,11 +1787,8 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { case PPCISD::ADDI_DTPREL_L: return "PPCISD::ADDI_DTPREL_L"; case PPCISD::PADDI_DTPREL: return "PPCISD::PADDI_DTPREL"; - case PPCISD::VADD_SPLAT: return "PPCISD::VADD_SPLAT"; - case PPCISD::SC: return "PPCISD::SC"; - case PPCISD::CLRBHRB: return "PPCISD::CLRBHRB"; - case PPCISD::MFBHRBE: return "PPCISD::MFBHRBE"; - case PPCISD::RFEBB: return "PPCISD::RFEBB"; + case PPCISD::VADD_SPLAT: + return "PPCISD::VADD_SPLAT"; case PPCISD::XXSWAPD: return "PPCISD::XXSWAPD"; case PPCISD::SWAP_NO_CHAIN: return "PPCISD::SWAP_NO_CHAIN"; case PPCISD::BUILD_FP128: return "PPCISD::BUILD_FP128"; @@ -4051,18 +4048,13 @@ SDValue PPCTargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext()); TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; - - Entry.Ty = IntPtrTy; - Entry.Node = Trmp; Args.push_back(Entry); - + Args.emplace_back(Trmp, IntPtrTy); // TrampSize == (isPPC64 ? 48 : 40); - Entry.Node = - DAG.getConstant(isPPC64 ? 48 : 40, dl, Subtarget.getScalarIntVT()); - Args.push_back(Entry); - - Entry.Node = FPtr; Args.push_back(Entry); - Entry.Node = Nest; Args.push_back(Entry); + Args.emplace_back( + DAG.getConstant(isPPC64 ? 
48 : 40, dl, Subtarget.getScalarIntVT()), + IntPtrTy); + Args.emplace_back(FPtr, IntPtrTy); + Args.emplace_back(Nest, IntPtrTy); // Lower to a call to __trampoline_setup(Trmp, TrampSize, FPtr, ctx_reg) TargetLowering::CallLoweringInfo CLI(DAG); @@ -6091,10 +6083,10 @@ SDValue PPCTargetLowering::LowerCall_32SVR4( if (!ArgFlags.isVarArg()) { Result = CC_PPC32_SVR4(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, - CCInfo); + Outs[i].OrigTy, CCInfo); } else { Result = CC_PPC32_SVR4_VarArg(i, ArgVT, ArgVT, CCValAssign::Full, - ArgFlags, CCInfo); + ArgFlags, Outs[i].OrigTy, CCInfo); } if (Result) { @@ -6905,7 +6897,7 @@ static bool isGPRShadowAligned(MCPhysReg Reg, Align RequiredAlign) { static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &State) { + Type *OrigTy, CCState &State) { const PPCSubtarget &Subtarget = static_cast<const PPCSubtarget &>( State.getMachineFunction().getSubtarget()); const bool IsPPC64 = Subtarget.isPPC64(); @@ -14822,9 +14814,9 @@ static bool findConsecutiveLoad(LoadSDNode *LD, SelectionDAG &DAG) { SDValue Chain = LD->getChain(); EVT VT = LD->getMemoryVT(); - SmallSet<SDNode *, 16> LoadRoots; + SmallPtrSet<SDNode *, 16> LoadRoots; SmallVector<SDNode *, 8> Queue(1, Chain.getNode()); - SmallSet<SDNode *, 16> Visited; + SmallPtrSet<SDNode *, 16> Visited; // First, search up the chain, branching to follow all token-factor operands. // If we find a consecutive load, then we're done, otherwise, record all @@ -19553,12 +19545,10 @@ SDValue PPCTargetLowering::lowerToLibCall(const char *LibCallName, SDValue Op, DAG.getExternalSymbol(LibCallName, TLI.getPointerTy(DAG.getDataLayout())); bool SignExtend = TLI.shouldSignExtendTypeInLibCall(RetTy, false); TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; for (const SDValue &N : Op->op_values()) { EVT ArgVT = N.getValueType(); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); - Entry.Node = N; - Entry.Ty = ArgTy; + TargetLowering::ArgListEntry Entry(N, ArgTy); Entry.IsSExt = TLI.shouldSignExtendTypeInLibCall(ArgTy, SignExtend); Entry.IsZExt = !Entry.IsSExt; Args.push_back(Entry); diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h index 9755f0e..5e0d6bf 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -430,20 +430,6 @@ namespace llvm { /// optimizations due to constant folding. VADD_SPLAT, - /// CHAIN = SC CHAIN, Imm128 - System call. The 7-bit unsigned - /// operand identifies the operating system entry point. - SC, - - /// CHAIN = CLRBHRB CHAIN - Clear branch history rolling buffer. - CLRBHRB, - - /// GPRC, CHAIN = MFBHRBE CHAIN, Entry, Dummy - Move from branch - /// history rolling buffer entry. - MFBHRBE, - - /// CHAIN = RFEBB CHAIN, State - Return from event-based branch. - RFEBB, - /// VSRC, CHAIN = XXSWAPD CHAIN, VSRC - Occurs only for little /// endian. Maps to an xxswapd instruction that corrects an lxvd2x /// or stxvd2x instruction. 
The chain is necessary because the diff --git a/llvm/lib/Target/PowerPC/PPCInstrAltivec.td b/llvm/lib/Target/PowerPC/PPCInstrAltivec.td index 24287a9..79fe12e 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrAltivec.td +++ b/llvm/lib/Target/PowerPC/PPCInstrAltivec.td @@ -1630,9 +1630,11 @@ def BCDCTSQ_rec : VX_VT5_EO5_VB5_XO9_o <0, 385, "bcdctsq.", []>; // Decimal Copy-Sign/Set-Sign let Defs = [CR6] in -def BCDCPSGN_rec : VX1_VT5_VA5_VB5<833, "bcdcpsgn.", []>; +def BCDCPSGN_rec : VX1_VT5_VA5_VB5<833, "bcdcpsgn.", + [(set v16i8:$VD, (int_ppc_bcdcopysign v16i8:$VA, v16i8:$VB))]>; -def BCDSETSGN_rec : VX_VT5_EO5_VB5_PS1_XO9_o<31, 385, "bcdsetsgn.", []>; +def BCDSETSGN_rec : VX_VT5_EO5_VB5_PS1_XO9_o<31, 385, "bcdsetsgn.", + [(set v16i8:$VD, (int_ppc_bcdsetsign v16i8:$VB, i32:$PS))]>; // Decimal Shift/Unsigned-Shift/Shift-and-Round def BCDS_rec : VX_VT5_VA5_VB5_PS1_XO9_o<193, "bcds." , []>; diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td index 99ef89a..c2f91ce 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td @@ -365,16 +365,6 @@ def PPCeh_sjlj_longjmp : SDNode<"PPCISD::EH_SJLJ_LONGJMP", SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>, [SDNPHasChain, SDNPSideEffect]>; -def SDT_PPCsc : SDTypeProfile<0, 1, [SDTCisInt<0>]>; -def PPCsc : SDNode<"PPCISD::SC", SDT_PPCsc, - [SDNPHasChain, SDNPSideEffect]>; - -def PPCclrbhrb : SDNode<"PPCISD::CLRBHRB", SDTNone, - [SDNPHasChain, SDNPSideEffect]>; -def PPCmfbhrbe : SDNode<"PPCISD::MFBHRBE", SDTIntBinOp, [SDNPHasChain]>; -def PPCrfebb : SDNode<"PPCISD::RFEBB", SDT_PPCsc, - [SDNPHasChain, SDNPSideEffect]>; - def PPCvcmp : SDNode<"PPCISD::VCMP" , SDT_PPCvcmp, []>; def PPCvcmp_rec : SDNode<"PPCISD::VCMP_rec", SDT_PPCvcmp, [SDNPOutGlue]>; @@ -1673,7 +1663,7 @@ let isBranch = 1, isTerminator = 1, Size = 0 in { // System call. let PPC970_Unit = 7 in { def SC : SCForm<17, 1, 0, (outs), (ins i32imm:$LEV), - "sc $LEV", IIC_BrB, [(PPCsc (i32 imm:$LEV))]>; + "sc $LEV", IIC_BrB, []>; } // We mark SCV as having no scheduling model since it is only meant to be used @@ -1685,21 +1675,14 @@ let Predicates = [IsISA3_0], hasNoSchedulingInfo = 1 in { } // Branch history rolling buffer. -def CLRBHRB : XForm_0<31, 430, (outs), (ins), "clrbhrb", IIC_BrB, - [(PPCclrbhrb)]>, +def CLRBHRB : XForm_0<31, 430, (outs), (ins), "clrbhrb", IIC_BrB, []>, PPC970_DGroup_Single; -// The $dmy argument used for MFBHRBE is not needed; however, including -// it avoids automatic generation of PPCFastISel::fastEmit_i(), which -// interferes with necessary special handling (see PPCFastISel.cpp). 
-def MFBHRBE : XFXForm_3p<31, 302, (outs gprc:$RT), - (ins u10imm:$imm, u10imm:$dmy), - "mfbhrbe $RT, $imm", IIC_BrB, - [(set i32:$RT, - (PPCmfbhrbe imm:$imm, imm:$dmy))]>, + +def MFBHRBE : XFXForm_3p<31, 302, (outs gprc:$RT), (ins u10imm:$imm), + "mfbhrbe $RT, $imm", IIC_BrB, []>, PPC970_DGroup_First; -def RFEBB : XLForm_S<19, 146, (outs), (ins u1imm:$S), "rfebb $S", - IIC_BrB, [(PPCrfebb (i32 imm:$S))]>, +def RFEBB : XLForm_S<19, 146, (outs), (ins u1imm:$S), "rfebb $S", IIC_BrB, []>, PPC970_DGroup_Single; def : InstAlias<"rfebb", (RFEBB 1)>; diff --git a/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp b/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp index 709d7e7..adf9436 100644 --- a/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp +++ b/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp @@ -264,9 +264,8 @@ namespace { bool prepareBasesForCommoningChains(Bucket &BucketChain); /// Rewrite load/store according to the common chains. - bool - rewriteLoadStoresForCommoningChains(Loop *L, Bucket &Bucket, - SmallSet<BasicBlock *, 16> &BBChanged); + bool rewriteLoadStoresForCommoningChains( + Loop *L, Bucket &Bucket, SmallPtrSet<BasicBlock *, 16> &BBChanged); /// Collect condition matched(\p isValidCandidate() returns true) /// candidates in Loop \p L. @@ -309,7 +308,7 @@ namespace { /// Rewrite load/store instructions in \p BucketChain according to /// preparation. bool rewriteLoadStores(Loop *L, Bucket &BucketChain, - SmallSet<BasicBlock *, 16> &BBChanged, + SmallPtrSet<BasicBlock *, 16> &BBChanged, PrepForm Form); /// Rewrite for the base load/store of a chain. @@ -523,7 +522,7 @@ bool PPCLoopInstrFormPrep::chainCommoning(Loop *L, if (Buckets.empty()) return MadeChange; - SmallSet<BasicBlock *, 16> BBChanged; + SmallPtrSet<BasicBlock *, 16> BBChanged; for (auto &Bucket : Buckets) { if (prepareBasesForCommoningChains(Bucket)) @@ -537,7 +536,7 @@ bool PPCLoopInstrFormPrep::chainCommoning(Loop *L, } bool PPCLoopInstrFormPrep::rewriteLoadStoresForCommoningChains( - Loop *L, Bucket &Bucket, SmallSet<BasicBlock *, 16> &BBChanged) { + Loop *L, Bucket &Bucket, SmallPtrSet<BasicBlock *, 16> &BBChanged) { bool MadeChange = false; assert(Bucket.Elements.size() == @@ -1006,7 +1005,7 @@ bool PPCLoopInstrFormPrep::prepareBaseForUpdateFormChain(Bucket &BucketChain) { } bool PPCLoopInstrFormPrep::rewriteLoadStores( - Loop *L, Bucket &BucketChain, SmallSet<BasicBlock *, 16> &BBChanged, + Loop *L, Bucket &BucketChain, SmallPtrSet<BasicBlock *, 16> &BBChanged, PrepForm Form) { bool MadeChange = false; @@ -1089,7 +1088,7 @@ bool PPCLoopInstrFormPrep::updateFormPrep(Loop *L, bool MadeChange = false; if (Buckets.empty()) return MadeChange; - SmallSet<BasicBlock *, 16> BBChanged; + SmallPtrSet<BasicBlock *, 16> BBChanged; for (auto &Bucket : Buckets) // The base address of each bucket is transformed into a phi and the others // are rewritten based on new base. 
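The SmallSet-to-SmallPtrSet conversions in this file (and in CallPrinter, CaptureTracking, PPCCTRLoopsVerify, and PPCISelLowering earlier in the patch) all target sets whose element type is a pointer. For pointer element types, SmallSet has historically been a thin wrapper that inherits from SmallPtrSet, so naming SmallPtrSet directly states the intended container without changing behavior; the sketch below is illustrative only and is not part of the patch (markChanged and its parameters are hypothetical names).

// Minimal sketch, assuming a standalone snippet: prefer SmallPtrSet for
// pointer keys, as the hunks above and below do for BBChanged.
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/IR/BasicBlock.h"

using namespace llvm;

static bool markChanged(BasicBlock *BB,
                        SmallPtrSet<BasicBlock *, 16> &BBChanged) {
  // insert() returns {iterator, WasInserted}; the bool is true when BB was
  // not already present, which is how a pass can track newly changed blocks.
  return BBChanged.insert(BB).second;
}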
@@ -1110,7 +1109,7 @@ bool PPCLoopInstrFormPrep::dispFormPrep(Loop *L, if (Buckets.empty()) return MadeChange; - SmallSet<BasicBlock *, 16> BBChanged; + SmallPtrSet<BasicBlock *, 16> BBChanged; for (auto &Bucket : Buckets) { if (Bucket.Elements.size() < DispFormPrepMinThreshold) continue; diff --git a/llvm/lib/Target/PowerPC/PPCReduceCRLogicals.cpp b/llvm/lib/Target/PowerPC/PPCReduceCRLogicals.cpp index 0ffd35d..74bce43 100644 --- a/llvm/lib/Target/PowerPC/PPCReduceCRLogicals.cpp +++ b/llvm/lib/Target/PowerPC/PPCReduceCRLogicals.cpp @@ -248,6 +248,10 @@ static bool splitMBB(BlockSplitInfo &BSI) { } addIncomingValuesToPHIs(NewBRTarget, ThisMBB, NewMBB, MRI); + // Set the call frame size on ThisMBB to the new basic blocks. + // See https://reviews.llvm.org/D156113. + NewMBB->setCallFrameSize(TII->getCallFrameSizeAt(ThisMBB->back())); + LLVM_DEBUG(dbgs() << "After splitting, ThisMBB:\n"; ThisMBB->dump()); LLVM_DEBUG(dbgs() << "NewMBB:\n"; NewMBB->dump()); LLVM_DEBUG(dbgs() << "New branch-to block:\n"; NewBRTarget->dump()); diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp index 76dca47..f123040 100644 --- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -1102,13 +1102,20 @@ void PPCRegisterInfo::lowerCRBitSpilling(MachineBasicBlock::iterator II, SpillsKnownBit = true; break; default: + // When spilling a CR bit, the super register may not be explicitly defined + // (i.e. it can be defined by a CR-logical that only defines the subreg) so + // we state that the CR field is undef. Also, in order to preserve the kill + // flag on the CR bit, we add it as an implicit use. + // On Power10, we can use SETNBC to spill all CR bits. SETNBC will set all // bits (specifically, it produces a -1 if the CR bit is set). Ultimately, // the bit that is of importance to us is bit 32 (bit 0 of a 32-bit // register), and SETNBC will set this. if (Subtarget.isISA3_1()) { BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::SETNBC8 : PPC::SETNBC), Reg) - .addReg(SrcReg, RegState::Undef); + .addReg(SrcReg, RegState::Undef) + .addReg(SrcReg, RegState::Implicit | + getKillRegState(MI.getOperand(0).isKill())); break; } @@ -1122,16 +1129,14 @@ void PPCRegisterInfo::lowerCRBitSpilling(MachineBasicBlock::iterator II, SrcReg == PPC::CR4LT || SrcReg == PPC::CR5LT || SrcReg == PPC::CR6LT || SrcReg == PPC::CR7LT) { BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::SETB8 : PPC::SETB), Reg) - .addReg(getCRFromCRBit(SrcReg), RegState::Undef); + .addReg(getCRFromCRBit(SrcReg), RegState::Undef) + .addReg(SrcReg, RegState::Implicit | + getKillRegState(MI.getOperand(0).isKill())); break; } } // We need to move the CR field that contains the CR bit we are spilling. - // The super register may not be explicitly defined (i.e. it can be defined - // by a CR-logical that only defines the subreg) so we state that the CR - // field is undef. Also, in order to preserve the kill flag on the CR bit, - // we add it as an implicit use. BuildMI(MBB, II, dl, TII.get(LP64 ? 
PPC::MFOCRF8 : PPC::MFOCRF), Reg) .addReg(getCRFromCRBit(SrcReg), RegState::Undef) .addReg(SrcReg, diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index d71c42c..d37ae2f 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -903,6 +903,7 @@ public: VK == RISCV::S_QC_ABS20; } + bool isSImm8Unsigned() const { return isSImm<8>() || isUImm<8>(); } bool isSImm10Unsigned() const { return isSImm<10>() || isUImm<10>(); } bool isUImm20LUI() const { @@ -1199,6 +1200,14 @@ public: addExpr(Inst, getImm(), isRV64Imm()); } + void addSImm8UnsignedOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + int64_t Imm; + [[maybe_unused]] bool IsConstant = evaluateConstantImm(getImm(), Imm); + assert(IsConstant); + Inst.addOperand(MCOperand::createImm(SignExtend64<8>(Imm))); + } + void addSImm10UnsignedOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); int64_t Imm; @@ -1547,6 +1556,9 @@ bool RISCVAsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, return generateImmOutOfRangeError( Operands, ErrorInfo, 0, (1 << 9) - 8, "immediate must be a multiple of 8 bytes in the range"); + case Match_InvalidSImm8Unsigned: + return generateImmOutOfRangeError(Operands, ErrorInfo, -(1 << 7), + (1 << 8) - 1); case Match_InvalidSImm10: return generateImmOutOfRangeError(Operands, ErrorInfo, -(1 << 9), (1 << 9) - 1); diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp index e0ac591..78be55b 100644 --- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp +++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp @@ -672,6 +672,8 @@ static constexpr FeatureBitset XAndesGroup = { RISCV::FeatureVendorXAndesVSIntLoad, RISCV::FeatureVendorXAndesVPackFPH, RISCV::FeatureVendorXAndesVDot}; +static constexpr FeatureBitset XSMTGroup = {RISCV::FeatureVendorXSMTVDot}; + static constexpr DecoderListEntry DecoderList32[]{ // Vendor Extensions {DecoderTableXCV32, XCVFeatureGroup, "CORE-V extensions"}, @@ -692,6 +694,7 @@ static constexpr DecoderListEntry DecoderList32[]{ {RISCV::FeatureVendorXMIPSCBOP}, "MIPS mips.pref"}, {DecoderTableXAndes32, XAndesGroup, "Andes extensions"}, + {DecoderTableXSMT32, XSMTGroup, "SpacemiT extensions"}, // Standard Extensions {DecoderTable32, {}, "standard 32-bit instructions"}, {DecoderTableRV32Only32, {}, "RV32-only standard 32-bit instructions"}, diff --git a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp index f83c2b6..51ea3fc 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp @@ -736,7 +736,6 @@ bool RISCVInstructionSelector::select(MachineInstr &MI) { } case TargetOpcode::G_FCONSTANT: { // TODO: Use constant pool for complex constants. - // TODO: Optimize +0.0 to use fcvt.d.w for s64 on rv32. 
Register DstReg = MI.getOperand(0).getReg(); const APFloat &FPimm = MI.getOperand(1).getFPImm()->getValueAPF(); APInt Imm = FPimm.bitcastToAPInt(); @@ -753,8 +752,22 @@ bool RISCVInstructionSelector::select(MachineInstr &MI) { if (!FMV.constrainAllUses(TII, TRI, RBI)) return false; } else { + // s64 on rv32 assert(Size == 64 && !Subtarget->is64Bit() && "Unexpected size or subtarget"); + + if (Imm.isNonNegative() && Imm.isZero()) { + // Optimize +0.0 to use fcvt.d.w + MachineInstrBuilder FCVT = + MIB.buildInstr(RISCV::FCVT_D_W, {DstReg}, {Register(RISCV::X0)}) + .addImm(RISCVFPRndMode::RNE); + if (!FCVT.constrainAllUses(TII, TRI, RBI)) + return false; + + MI.eraseFromParent(); + return true; + } + // Split into two pieces and build through the stack. Register GPRRegHigh = MRI->createVirtualRegister(&RISCV::GPRRegClass); Register GPRRegLow = MRI->createVirtualRegister(&RISCV::GPRRegClass); diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp index 8d956ce..96f22c2 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp @@ -819,6 +819,23 @@ void RISCVAsmBackend::maybeAddVendorReloc(const MCFragment &F, Asm->getWriter().recordRelocation(F, VendorFixup, VendorTarget, VendorValue); } +static bool relaxableFixupNeedsRelocation(const MCFixupKind Kind) { + // Some Fixups are marked as LinkerRelaxable by + // `RISCVMCCodeEmitter::getImmOpValue` only because they may be + // (assembly-)relaxed into a linker-relaxable instruction. This function + // should return `false` for those fixups so they do not get a `R_RISCV_RELAX` + // relocation emitted in addition to the relocation. + switch (Kind) { + default: + break; + case RISCV::fixup_riscv_rvc_jump: + case RISCV::fixup_riscv_rvc_branch: + case RISCV::fixup_riscv_jal: + return false; + } + return true; +} + bool RISCVAsmBackend::addReloc(const MCFragment &F, const MCFixup &Fixup, const MCValue &Target, uint64_t &FixedValue, bool IsResolved) { @@ -861,25 +878,32 @@ bool RISCVAsmBackend::addReloc(const MCFragment &F, const MCFixup &Fixup, return false; } - // If linker relaxation is enabled and supported by the current relocation, - // generate a relocation and then append a RELAX. - if (Fixup.isLinkerRelaxable()) + // If linker relaxation is enabled and supported by the current fixup, then we + // always want to generate a relocation. + bool NeedsRelax = Fixup.isLinkerRelaxable() && + relaxableFixupNeedsRelocation(Fixup.getKind()); + if (NeedsRelax) IsResolved = false; + if (IsResolved && Fixup.isPCRel()) IsResolved = isPCRelFixupResolved(Target.getAddSym(), F); if (!IsResolved) { - // Some Fixups require a vendor relocation, record it (directly) before we + // Some Fixups require a VENDOR relocation, record it (directly) before we // add the relocation. maybeAddVendorReloc(F, Fixup); Asm->getWriter().recordRelocation(F, Fixup, Target, FixedValue); - } - if (Fixup.isLinkerRelaxable()) { - auto FA = MCFixup::create(Fixup.getOffset(), nullptr, ELF::R_RISCV_RELAX); - Asm->getWriter().recordRelocation(F, FA, MCValue::get(nullptr), - FixedValueA); + if (NeedsRelax) { + // Some Fixups get a RELAX relocation, record it (directly) after we add + // the relocation. 
+ MCFixup RelaxFixup = + MCFixup::create(Fixup.getOffset(), nullptr, ELF::R_RISCV_RELAX); + MCValue RelaxTarget = MCValue::get(nullptr); + uint64_t RelaxValue; + Asm->getWriter().recordRelocation(F, RelaxFixup, RelaxTarget, RelaxValue); + } } return false; diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h index bddea43..083ac05 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h @@ -139,6 +139,9 @@ enum { // 3 -> SEW * 4 DestEEWShift = ElementsDependOnMaskShift + 1, DestEEWMask = 3ULL << DestEEWShift, + + ReadsPastVLShift = DestEEWShift + 2, + ReadsPastVLMask = 1ULL << ReadsPastVLShift, }; // Helper functions to read TSFlags. @@ -195,6 +198,12 @@ static inline bool elementsDependOnMask(uint64_t TSFlags) { return TSFlags & ElementsDependOnMaskMask; } +/// \returns true if the instruction may read elements past VL, e.g. +/// vslidedown/vrgather +static inline bool readsPastVL(uint64_t TSFlags) { + return TSFlags & ReadsPastVLMask; +} + static inline unsigned getVLOpNum(const MCInstrDesc &Desc) { const uint64_t TSFlags = Desc.TSFlags; // This method is only called if we expect to have a VL operand, and all @@ -337,6 +346,7 @@ enum OperandType : unsigned { OPERAND_SIMM5_PLUS1, OPERAND_SIMM6, OPERAND_SIMM6_NONZERO, + OPERAND_SIMM8, OPERAND_SIMM10, OPERAND_SIMM10_LSB0000_NONZERO, OPERAND_SIMM11, diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp index 8c9ab8e..b0c27ce 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp @@ -75,7 +75,7 @@ void RISCVInstPrinter::printInst(const MCInst *MI, uint64_t Address, if (PrintAliases && !NoAliases) Res = RISCVRVC::uncompress(UncompressedMI, *MI, STI); if (Res) - NewMI = const_cast<MCInst *>(&UncompressedMI); + NewMI = &UncompressedMI; if (!PrintAliases || NoAliases || !printAliasInstr(NewMI, Address, STI, O)) printInstruction(NewMI, Address, STI, O); printAnnotation(O, Annot); diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp index cbeabdd..717fba6 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp @@ -576,8 +576,21 @@ uint64_t RISCVMCCodeEmitter::getImmOpValue(const MCInst &MI, unsigned OpNo, "getImmOpValue expects only expressions or immediates"); const MCExpr *Expr = MO.getExpr(); MCExpr::ExprKind Kind = Expr->getKind(); - unsigned FixupKind = RISCV::fixup_riscv_invalid; + + // `RelaxCandidate` must be set to `true` in two cases: + // - The fixup's relocation gets a R_RISCV_RELAX relocation + // - The underlying instruction may be relaxed to an instruction that gets a + // `R_RISCV_RELAX` relocation. + // + // The actual emission of `R_RISCV_RELAX` will be handled in + // `RISCVAsmBackend::applyFixup`. 
bool RelaxCandidate = false; + auto AsmRelaxToLinkerRelaxableWithFeature = [&](unsigned Feature) -> void { + if (!STI.hasFeature(RISCV::FeatureExactAssembly) && STI.hasFeature(Feature)) + RelaxCandidate = true; + }; + + unsigned FixupKind = RISCV::fixup_riscv_invalid; if (Kind == MCExpr::Specifier) { const auto *RVExpr = cast<MCSpecifierExpr>(Expr); FixupKind = RVExpr->getSpecifier(); @@ -644,18 +657,26 @@ uint64_t RISCVMCCodeEmitter::getImmOpValue(const MCInst &MI, unsigned OpNo, // FIXME: Sub kind binary exprs have chance of underflow. if (MIFrm == RISCVII::InstFormatJ) { FixupKind = RISCV::fixup_riscv_jal; + AsmRelaxToLinkerRelaxableWithFeature(RISCV::FeatureVendorXqcilb); } else if (MIFrm == RISCVII::InstFormatB) { FixupKind = RISCV::fixup_riscv_branch; + // This might be assembler relaxed to `b<cc>; jal` but we cannot relax + // the `jal` again in the assembler. } else if (MIFrm == RISCVII::InstFormatCJ) { FixupKind = RISCV::fixup_riscv_rvc_jump; + AsmRelaxToLinkerRelaxableWithFeature(RISCV::FeatureVendorXqcilb); } else if (MIFrm == RISCVII::InstFormatCB) { FixupKind = RISCV::fixup_riscv_rvc_branch; + // This might be assembler relaxed to `b<cc>; jal` but we cannot relax + // the `jal` again in the assembler. } else if (MIFrm == RISCVII::InstFormatCI) { FixupKind = RISCV::fixup_riscv_rvc_imm; } else if (MIFrm == RISCVII::InstFormatI) { FixupKind = RISCV::fixup_riscv_12_i; } else if (MIFrm == RISCVII::InstFormatQC_EB) { FixupKind = RISCV::fixup_riscv_qc_e_branch; + // This might be assembler relaxed to `qc.e.b<cc>; jal` but we cannot + // relax the `jal` again in the assembler. } else if (MIFrm == RISCVII::InstFormatQC_EAI) { FixupKind = RISCV::fixup_riscv_qc_e_32; RelaxCandidate = true; @@ -670,9 +691,9 @@ uint64_t RISCVMCCodeEmitter::getImmOpValue(const MCInst &MI, unsigned OpNo, assert(FixupKind != RISCV::fixup_riscv_invalid && "Unhandled expression!"); addFixup(Fixups, 0, Expr, FixupKind); - // If linker relaxation is enabled and supported by this relocation, set - // a bit so that if fixup is unresolved, a R_RISCV_RELAX relocation will be - // appended. + // If linker relaxation is enabled and supported by this relocation, set a bit + // so that the assembler knows the size of the instruction is not fixed/known, + // and the relocation will need a R_RISCV_RELAX relocation. 
if (EnableRelax && RelaxCandidate) Fixups.back().setLinkerRelaxable(); ++MCNumFixups; diff --git a/llvm/lib/Target/RISCV/RISCVCallingConv.cpp b/llvm/lib/Target/RISCV/RISCVCallingConv.cpp index 70127e3..78f4779 100644 --- a/llvm/lib/Target/RISCV/RISCVCallingConv.cpp +++ b/llvm/lib/Target/RISCV/RISCVCallingConv.cpp @@ -741,7 +741,7 @@ bool llvm::CC_RISCV_FastCC(unsigned ValNo, MVT ValVT, MVT LocVT, bool llvm::CC_RISCV_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &State) { + Type *OrigTy, CCState &State) { if (ArgFlags.isNest()) { report_fatal_error( "Attribute 'nest' is not supported in GHC calling convention"); diff --git a/llvm/lib/Target/RISCV/RISCVCallingConv.h b/llvm/lib/Target/RISCV/RISCVCallingConv.h index 2030ce1..0847dd6 100644 --- a/llvm/lib/Target/RISCV/RISCVCallingConv.h +++ b/llvm/lib/Target/RISCV/RISCVCallingConv.h @@ -33,7 +33,7 @@ bool CC_RISCV_FastCC(unsigned ValNo, MVT ValVT, MVT LocVT, bool CC_RISCV_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &State); + Type *OrigTy, CCState &State); namespace RISCV { diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index a7329d2..d4ac3c6 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -1055,13 +1055,13 @@ def FeatureStdExtSupm "Indicates User-mode Pointer Masking">; def FeatureStdExtSmctr - : RISCVExperimentalExtension<1, 0, - "Control Transfer Records Machine Level", - [FeatureStdExtSscsrind]>; + : RISCVExtension<1, 0, + "Control Transfer Records Machine Level", + [FeatureStdExtSscsrind]>; def FeatureStdExtSsctr - : RISCVExperimentalExtension<1, 0, - "Control Transfer Records Supervisor Level", - [FeatureStdExtSscsrind]>; + : RISCVExtension<1, 0, + "Control Transfer Records Supervisor Level", + [FeatureStdExtSscsrind]>; def HasStdExtSmctrOrSsctr : Predicate<"Subtarget->hasStdExtSmctrOrSsctr()">, AssemblerPredicate<(any_of FeatureStdExtSmctr, FeatureStdExtSsctr), "'Smctr' (Control Transfer Records Machine Level) or " @@ -1642,6 +1642,14 @@ def HasVendorXAndesVDot AssemblerPredicate<(all_of FeatureVendorXAndesVDot), "'XAndesVDot' (Andes Vector Dot Product Extension)">; +def FeatureVendorXSMTVDot + : RISCVExtension<1, 0, "SpacemiT Vector Dot Product Extension", + [FeatureStdExtZve32f]>; +def HasVendorXSMTVDot + : Predicate<"Subtarget->hasVendorXSMTVDot()">, + AssemblerPredicate<(all_of FeatureVendorXSMTVDot), + "'XSMTVDot' (SpacemiT Vector Dot Product Extension)">; + //===----------------------------------------------------------------------===// // LLVM specific features and extensions //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index 5998653..f9f35f6 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -18,6 +18,7 @@ #include "RISCVInstrInfo.h" #include "RISCVSelectionDAGInfo.h" #include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/SDPatternMatch.h" #include "llvm/IR/IntrinsicsRISCV.h" #include "llvm/Support/Alignment.h" #include "llvm/Support/Debug.h" @@ -681,40 +682,86 @@ bool RISCVDAGToDAGISel::trySignedBitfieldInsertInMask(SDNode *Node) { if (!Subtarget->hasVendorXqcibm()) return false; - auto *N1C = dyn_cast<ConstantSDNode>(Node->getOperand(1)); - if (!N1C) + using namespace SDPatternMatch; + + SDValue X; + 
APInt MaskImm; + if (!sd_match(Node, m_Or(m_OneUse(m_Value(X)), m_ConstInt(MaskImm)))) return false; - int32_t C1 = N1C->getSExtValue(); - if (!isShiftedMask_32(C1) || isInt<12>(C1)) + unsigned ShAmt, Width; + if (!MaskImm.isShiftedMask(ShAmt, Width) || MaskImm.isSignedIntN(12)) return false; - // INSBI will clobber the input register in N0. Bail out if we need a copy to - // preserve this value. - SDValue N0 = Node->getOperand(0); - if (!N0.hasOneUse()) + // If Zbs is enabled and it is a single bit set we can use BSETI which + // can be compressed to C_BSETI when Xqcibm in enabled. + if (Width == 1 && Subtarget->hasStdExtZbs()) return false; // If C1 is a shifted mask (but can't be formed as an ORI), // use a bitfield insert of -1. // Transform (or x, C1) // -> (qc.insbi x, -1, width, shift) - const unsigned Leading = llvm::countl_zero((uint32_t)C1); - const unsigned Trailing = llvm::countr_zero((uint32_t)C1); - const unsigned Width = 32 - Leading - Trailing; + SDLoc DL(Node); + MVT VT = Node->getSimpleValueType(0); - // If Zbs is enabled and it is a single bit set we can use BSETI which - // can be compressed to C_BSETI when Xqcibm in enabled. - if (Width == 1 && Subtarget->hasStdExtZbs()) + SDValue Ops[] = {X, CurDAG->getSignedTargetConstant(-1, DL, VT), + CurDAG->getTargetConstant(Width, DL, VT), + CurDAG->getTargetConstant(ShAmt, DL, VT)}; + SDNode *BitIns = CurDAG->getMachineNode(RISCV::QC_INSBI, DL, VT, Ops); + ReplaceNode(Node, BitIns); + return true; +} + +// Generate a QC_INSB/QC_INSBI from 'or (and X, MaskImm), OrImm' iff the value +// being inserted only sets known zero bits. +bool RISCVDAGToDAGISel::tryBitfieldInsertOpFromOrAndImm(SDNode *Node) { + // Supported only in Xqcibm for now. + if (!Subtarget->hasVendorXqcibm()) + return false; + + using namespace SDPatternMatch; + + SDValue And; + APInt MaskImm, OrImm; + if (!sd_match(Node, m_Or(m_OneUse(m_And(m_Value(And), m_ConstInt(MaskImm))), + m_ConstInt(OrImm)))) + return false; + + // Compute the Known Zero for the AND as this allows us to catch more general + // cases than just looking for AND with imm. + KnownBits Known = CurDAG->computeKnownBits(Node->getOperand(0)); + + // The bits being inserted must only set those bits that are known to be zero. + if (!OrImm.isSubsetOf(Known.Zero)) { + // FIXME: It's okay if the OrImm sets NotKnownZero bits to 1, but we don't + // currently handle this case. + return false; + } + + unsigned ShAmt, Width; + // The KnownZero mask must be a shifted mask (e.g., 1110..011, 11100..00). + if (!Known.Zero.isShiftedMask(ShAmt, Width)) return false; + // QC_INSB(I) dst, src, #width, #shamt. 
SDLoc DL(Node); MVT VT = Node->getSimpleValueType(0); + SDValue ImmNode; + auto Opc = RISCV::QC_INSB; - SDValue Ops[] = {N0, CurDAG->getSignedTargetConstant(-1, DL, VT), - CurDAG->getTargetConstant(Width, DL, VT), - CurDAG->getTargetConstant(Trailing, DL, VT)}; - SDNode *BitIns = CurDAG->getMachineNode(RISCV::QC_INSBI, DL, VT, Ops); + int32_t LIImm = OrImm.getSExtValue() >> ShAmt; + + if (isInt<5>(LIImm)) { + Opc = RISCV::QC_INSBI; + ImmNode = CurDAG->getSignedTargetConstant(LIImm, DL, MVT::i32); + } else { + ImmNode = selectImm(CurDAG, DL, MVT::i32, LIImm, *Subtarget); + } + + SDValue Ops[] = {And, ImmNode, CurDAG->getTargetConstant(Width, DL, VT), + CurDAG->getTargetConstant(ShAmt, DL, VT)}; + SDNode *BitIns = CurDAG->getMachineNode(Opc, DL, VT, Ops); ReplaceNode(Node, BitIns); return true; } @@ -772,6 +819,49 @@ bool RISCVDAGToDAGISel::trySignedBitfieldInsertInSign(SDNode *Node) { return false; } +// (xor X, (and (xor X, C1), C2)) +// -> (qc.insbi X, (C1 >> ShAmt), Width, ShAmt) +// where C2 is a shifted mask with width=Width and shift=ShAmt +bool RISCVDAGToDAGISel::tryBitfieldInsertOpFromXor(SDNode *Node) { + + if (!Subtarget->hasVendorXqcibm()) + return false; + + using namespace SDPatternMatch; + + SDValue X; + APInt CImm, CMask; + if (!sd_match( + Node, + m_Xor(m_Value(X), + m_OneUse(m_And(m_OneUse(m_Xor(m_Deferred(X), m_ConstInt(CImm))), + m_ConstInt(CMask)))))) + return false; + + unsigned Width, ShAmt; + if (!CMask.isShiftedMask(ShAmt, Width)) + return false; + + int64_t Imm = CImm.getSExtValue(); + Imm >>= ShAmt; + + SDLoc DL(Node); + SDValue ImmNode; + auto Opc = RISCV::QC_INSB; + + if (isInt<5>(Imm)) { + Opc = RISCV::QC_INSBI; + ImmNode = CurDAG->getSignedTargetConstant(Imm, DL, MVT::i32); + } else { + ImmNode = selectImm(CurDAG, DL, MVT::i32, Imm, *Subtarget); + } + SDValue Ops[] = {X, ImmNode, CurDAG->getTargetConstant(Width, DL, MVT::i32), + CurDAG->getTargetConstant(ShAmt, DL, MVT::i32)}; + ReplaceNode(Node, CurDAG->getMachineNode(Opc, DL, MVT::i32, Ops)); + + return true; +} + bool RISCVDAGToDAGISel::tryUnsignedBitfieldExtract(SDNode *Node, const SDLoc &DL, MVT VT, SDValue X, unsigned Msb, @@ -1340,6 +1430,9 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { if (trySignedBitfieldInsertInMask(Node)) return; + if (tryBitfieldInsertOpFromOrAndImm(Node)) + return; + if (tryShrinkShlLogicImm(Node)) return; @@ -1349,6 +1442,9 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { if (tryShrinkShlLogicImm(Node)) return; + if (tryBitfieldInsertOpFromXor(Node)) + return; + break; case ISD::AND: { auto *N1C = dyn_cast<ConstantSDNode>(Node->getOperand(1)); @@ -1644,7 +1740,9 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { // available. 
// Transform (and x, C1) // -> (<bfextract> x, msb, lsb) - if (isMask_64(C1) && !isInt<12>(N1C->getSExtValue())) { + if (isMask_64(C1) && !isInt<12>(N1C->getSExtValue()) && + !(C1 == 0xffff && Subtarget->hasStdExtZbb()) && + !(C1 == 0xffffffff && Subtarget->hasStdExtZba())) { const unsigned Msb = llvm::bit_width(C1) - 1; if (tryUnsignedBitfieldExtract(Node, DL, VT, N0, Msb, 0)) return; diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h index ee3a86e..c329a4c 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h @@ -75,6 +75,8 @@ public: bool trySignedBitfieldExtract(SDNode *Node); bool trySignedBitfieldInsertInSign(SDNode *Node); bool trySignedBitfieldInsertInMask(SDNode *Node); + bool tryBitfieldInsertOpFromXor(SDNode *Node); + bool tryBitfieldInsertOpFromOrAndImm(SDNode *Node); bool tryUnsignedBitfieldExtract(SDNode *Node, const SDLoc &DL, MVT VT, SDValue X, unsigned Msb, unsigned Lsb); bool tryUnsignedBitfieldInsertInZero(SDNode *Node, const SDLoc &DL, MVT VT, diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index e4aa8b8..4a1db80 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -1844,6 +1844,17 @@ bool RISCVTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, return SetRVVLoadStoreInfo(/*PtrOp*/ I.arg_size() - 3, /*IsStore*/ true, /*IsUnitStrided*/ false, /*UsePtrVal*/ true); + case Intrinsic::riscv_sseg2_store_mask: + case Intrinsic::riscv_sseg3_store_mask: + case Intrinsic::riscv_sseg4_store_mask: + case Intrinsic::riscv_sseg5_store_mask: + case Intrinsic::riscv_sseg6_store_mask: + case Intrinsic::riscv_sseg7_store_mask: + case Intrinsic::riscv_sseg8_store_mask: + // Operands are (vec, ..., vec, ptr, offset, mask, vl) + return SetRVVLoadStoreInfo(/*PtrOp*/ I.arg_size() - 4, + /*IsStore*/ true, + /*IsUnitStrided*/ false, /*UsePtrVal*/ true); case Intrinsic::riscv_vlm: return SetRVVLoadStoreInfo(/*PtrOp*/ 0, /*IsStore*/ false, @@ -2512,11 +2523,11 @@ static void translateSetCCForBranch(const SDLoc &DL, SDValue &LHS, SDValue &RHS, } break; case ISD::SETUGT: - if (Subtarget.hasVendorXqcibi() && C != INT64_MAX && isInt<16>(C + 1) && - C != -1) { + if (Subtarget.hasVendorXqcibi() && C != INT64_MAX && isUInt<16>(C + 1)) { // We have a branch immediate instruction for SETUGE but not SETUGT. - // Convert X > C to X >= C + 1, if (C + 1) is a 16-bit signed immediate. - RHS = DAG.getSignedConstant(C + 1, DL, RHS.getValueType()); + // Convert X > C to X >= C + 1, if (C + 1) is a 16-bit unsigned + // immediate. + RHS = DAG.getConstant(C + 1, DL, RHS.getValueType()); CC = ISD::SETUGE; return; } @@ -8931,10 +8942,7 @@ SDValue RISCVTargetLowering::getDynamicTLSAddr(GlobalAddressSDNode *N, // Prepare argument list to generate call. ArgListTy Args; - ArgListEntry Entry; - Entry.Node = Load; - Entry.Ty = CallTy; - Args.push_back(Entry); + Args.emplace_back(Load, CallTy); // Setup call to __tls_get_addr. 
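For the SETUGT change earlier in this hunk series, the rewrite relies on a simple unsigned identity: X > C and X >= C + 1 agree whenever C + 1 does not wrap, hence the guard against the maximal constant before forming the 16-bit unsigned immediate. A tiny sanity check with illustrative values only:

    #include <cassert>
    #include <cstdint>

    int main() {
      uint64_t C = 41; // assume C + 1 still fits the 16-bit unsigned immediate
      for (uint64_t X : {uint64_t(0), uint64_t(41), uint64_t(42), uint64_t(1000)})
        assert((X > C) == (X >= C + 1));
      return 0;
    }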
TargetLowering::CallLoweringInfo CLI(DAG); @@ -11084,69 +11092,118 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, return lowerVectorIntrinsicScalars(Op, DAG, Subtarget); } -SDValue RISCVTargetLowering::LowerINTRINSIC_VOID(SDValue Op, - SelectionDAG &DAG) const { - unsigned IntNo = Op.getConstantOperandVal(1); +static SDValue +lowerFixedVectorSegStoreIntrinsics(unsigned IntNo, SDValue Op, + const RISCVSubtarget &Subtarget, + SelectionDAG &DAG) { + bool IsStrided; switch (IntNo) { - default: - break; case Intrinsic::riscv_seg2_store_mask: case Intrinsic::riscv_seg3_store_mask: case Intrinsic::riscv_seg4_store_mask: case Intrinsic::riscv_seg5_store_mask: case Intrinsic::riscv_seg6_store_mask: case Intrinsic::riscv_seg7_store_mask: - case Intrinsic::riscv_seg8_store_mask: { - SDLoc DL(Op); - static const Intrinsic::ID VssegInts[] = { - Intrinsic::riscv_vsseg2_mask, Intrinsic::riscv_vsseg3_mask, - Intrinsic::riscv_vsseg4_mask, Intrinsic::riscv_vsseg5_mask, - Intrinsic::riscv_vsseg6_mask, Intrinsic::riscv_vsseg7_mask, - Intrinsic::riscv_vsseg8_mask}; + case Intrinsic::riscv_seg8_store_mask: + IsStrided = false; + break; + case Intrinsic::riscv_sseg2_store_mask: + case Intrinsic::riscv_sseg3_store_mask: + case Intrinsic::riscv_sseg4_store_mask: + case Intrinsic::riscv_sseg5_store_mask: + case Intrinsic::riscv_sseg6_store_mask: + case Intrinsic::riscv_sseg7_store_mask: + case Intrinsic::riscv_sseg8_store_mask: + IsStrided = true; + break; + default: + llvm_unreachable("unexpected intrinsic ID"); + } - // Operands: (chain, int_id, vec*, ptr, mask, vl) - unsigned NF = Op->getNumOperands() - 5; - assert(NF >= 2 && NF <= 8 && "Unexpected seg number"); - MVT XLenVT = Subtarget.getXLenVT(); - MVT VT = Op->getOperand(2).getSimpleValueType(); - MVT ContainerVT = getContainerForFixedLengthVector(VT); - unsigned Sz = NF * ContainerVT.getVectorMinNumElements() * - ContainerVT.getScalarSizeInBits(); - EVT VecTupTy = MVT::getRISCVVectorTupleVT(Sz, NF); + SDLoc DL(Op); + static const Intrinsic::ID VssegInts[] = { + Intrinsic::riscv_vsseg2_mask, Intrinsic::riscv_vsseg3_mask, + Intrinsic::riscv_vsseg4_mask, Intrinsic::riscv_vsseg5_mask, + Intrinsic::riscv_vsseg6_mask, Intrinsic::riscv_vsseg7_mask, + Intrinsic::riscv_vsseg8_mask}; + static const Intrinsic::ID VsssegInts[] = { + Intrinsic::riscv_vssseg2_mask, Intrinsic::riscv_vssseg3_mask, + Intrinsic::riscv_vssseg4_mask, Intrinsic::riscv_vssseg5_mask, + Intrinsic::riscv_vssseg6_mask, Intrinsic::riscv_vssseg7_mask, + Intrinsic::riscv_vssseg8_mask}; + + // Operands: (chain, int_id, vec*, ptr, mask, vl) or + // (chain, int_id, vec*, ptr, stride, mask, vl) + unsigned NF = Op->getNumOperands() - (IsStrided ? 
6 : 5); + assert(NF >= 2 && NF <= 8 && "Unexpected seg number"); + MVT XLenVT = Subtarget.getXLenVT(); + MVT VT = Op->getOperand(2).getSimpleValueType(); + MVT ContainerVT = ::getContainerForFixedLengthVector(DAG, VT, Subtarget); + unsigned Sz = NF * ContainerVT.getVectorMinNumElements() * + ContainerVT.getScalarSizeInBits(); + EVT VecTupTy = MVT::getRISCVVectorTupleVT(Sz, NF); - SDValue VL = Op.getOperand(Op.getNumOperands() - 1); - SDValue Mask = Op.getOperand(Op.getNumOperands() - 2); - MVT MaskVT = Mask.getSimpleValueType(); - MVT MaskContainerVT = - ::getContainerForFixedLengthVector(DAG, MaskVT, Subtarget); - Mask = convertToScalableVector(MaskContainerVT, Mask, DAG, Subtarget); + SDValue VL = Op.getOperand(Op.getNumOperands() - 1); + SDValue Mask = Op.getOperand(Op.getNumOperands() - 2); + MVT MaskVT = Mask.getSimpleValueType(); + MVT MaskContainerVT = + ::getContainerForFixedLengthVector(DAG, MaskVT, Subtarget); + Mask = convertToScalableVector(MaskContainerVT, Mask, DAG, Subtarget); - SDValue IntID = DAG.getTargetConstant(VssegInts[NF - 2], DL, XLenVT); - SDValue Ptr = Op->getOperand(NF + 2); + SDValue IntID = DAG.getTargetConstant( + IsStrided ? VsssegInts[NF - 2] : VssegInts[NF - 2], DL, XLenVT); + SDValue Ptr = Op->getOperand(NF + 2); - auto *FixedIntrinsic = cast<MemIntrinsicSDNode>(Op); + auto *FixedIntrinsic = cast<MemIntrinsicSDNode>(Op); - SDValue StoredVal = DAG.getUNDEF(VecTupTy); - for (unsigned i = 0; i < NF; i++) - StoredVal = DAG.getNode( - RISCVISD::TUPLE_INSERT, DL, VecTupTy, StoredVal, - convertToScalableVector( - ContainerVT, FixedIntrinsic->getOperand(2 + i), DAG, Subtarget), - DAG.getTargetConstant(i, DL, MVT::i32)); + SDValue StoredVal = DAG.getUNDEF(VecTupTy); + for (unsigned i = 0; i < NF; i++) + StoredVal = DAG.getNode( + RISCVISD::TUPLE_INSERT, DL, VecTupTy, StoredVal, + convertToScalableVector(ContainerVT, FixedIntrinsic->getOperand(2 + i), + DAG, Subtarget), + DAG.getTargetConstant(i, DL, MVT::i32)); + + SmallVector<SDValue, 10> Ops = { + FixedIntrinsic->getChain(), + IntID, + StoredVal, + Ptr, + Mask, + VL, + DAG.getTargetConstant(Log2_64(VT.getScalarSizeInBits()), DL, XLenVT)}; + // Insert the stride operand. 
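To make the operand shuffling easier to follow: the strided form being built here differs from the unit-stride form only by one extra operand spliced in right after the pointer, at index 4. A trivial stand-alone model of that splice (the string names are placeholders, not real operand names):

    #include <cassert>
    #include <iterator>
    #include <string>
    #include <vector>

    int main() {
      std::vector<std::string> Ops = {"chain", "intid", "tuple", "ptr",
                                      "mask", "vl", "log2sew"};
      bool IsStrided = true;
      if (IsStrided)
        Ops.insert(std::next(Ops.begin(), 4), "stride");
      assert(Ops[3] == "ptr" && Ops[4] == "stride" && Ops[5] == "mask");
      return 0;
    }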
+ if (IsStrided) + Ops.insert(std::next(Ops.begin(), 4), + Op.getOperand(Op.getNumOperands() - 3)); + + return DAG.getMemIntrinsicNode( + ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Ops, + FixedIntrinsic->getMemoryVT(), FixedIntrinsic->getMemOperand()); +} + +SDValue RISCVTargetLowering::LowerINTRINSIC_VOID(SDValue Op, + SelectionDAG &DAG) const { + unsigned IntNo = Op.getConstantOperandVal(1); + switch (IntNo) { + default: + break; + case Intrinsic::riscv_seg2_store_mask: + case Intrinsic::riscv_seg3_store_mask: + case Intrinsic::riscv_seg4_store_mask: + case Intrinsic::riscv_seg5_store_mask: + case Intrinsic::riscv_seg6_store_mask: + case Intrinsic::riscv_seg7_store_mask: + case Intrinsic::riscv_seg8_store_mask: + case Intrinsic::riscv_sseg2_store_mask: + case Intrinsic::riscv_sseg3_store_mask: + case Intrinsic::riscv_sseg4_store_mask: + case Intrinsic::riscv_sseg5_store_mask: + case Intrinsic::riscv_sseg6_store_mask: + case Intrinsic::riscv_sseg7_store_mask: + case Intrinsic::riscv_sseg8_store_mask: + return lowerFixedVectorSegStoreIntrinsics(IntNo, Op, Subtarget, DAG); - SDValue Ops[] = { - FixedIntrinsic->getChain(), - IntID, - StoredVal, - Ptr, - Mask, - VL, - DAG.getTargetConstant(Log2_64(VT.getScalarSizeInBits()), DL, XLenVT)}; - - return DAG.getMemIntrinsicNode( - ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Ops, - FixedIntrinsic->getMemoryVT(), FixedIntrinsic->getMemOperand()); - } case Intrinsic::riscv_sf_vc_xv_se: return getVCIXISDNodeVOID(Op, DAG, RISCVISD::SF_VC_XV_SE); case Intrinsic::riscv_sf_vc_iv_se: @@ -14273,7 +14330,7 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N, LC = RTLIB::getFPTOUINT(Op0.getValueType(), N->getValueType(0)); MakeLibCallOptions CallOptions; EVT OpVT = Op0.getValueType(); - CallOptions.setTypeListBeforeSoften(OpVT, N->getValueType(0), true); + CallOptions.setTypeListBeforeSoften(OpVT, N->getValueType(0)); SDValue Chain = IsStrict ? N->getOperand(0) : SDValue(); SDValue Result; std::tie(Result, Chain) = @@ -14308,7 +14365,7 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N, Op0.getValueType() == MVT::f64 ? RTLIB::LROUND_F64 : RTLIB::LROUND_F32; MakeLibCallOptions CallOptions; EVT OpVT = Op0.getValueType(); - CallOptions.setTypeListBeforeSoften(OpVT, MVT::i64, true); + CallOptions.setTypeListBeforeSoften(OpVT, MVT::i64); SDValue Result = makeLibCall(DAG, LC, MVT::i64, Op0, CallOptions, DL).first; Result = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Result); Results.push_back(Result); @@ -16531,8 +16588,10 @@ combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y, ISD::CondCode CC, // (seteq (i64 (sext_inreg (X, i32)), C1')) where C1' is C1 sign extended from // bit 31. Same for setne. C1' may be cheaper to materialize and the sext_inreg // can become a sext.w instead of a shift pair. -static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG, +static SDValue performSETCCCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, const RISCVSubtarget &Subtarget) { + SelectionDAG &DAG = DCI.DAG; SDLoc dl(N); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -16548,6 +16607,20 @@ static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG, combineVectorSizedSetCCEquality(VT, N0, N1, Cond, dl, DAG, Subtarget)) return V; + // (X & -4096) == 0 -> (X >> 12) == 0 if the AND constant can't use ANDI. 
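The transform described in that comment is sound because -4096 is a negated power of two: the AND keeps exactly the bits at or above the trailing-zero count, and those are all zero iff the logical right shift by that count is zero. The shift amount always encodes, whereas the mask does not fit ANDI's 12-bit signed immediate. A quick check with plain integers (GCC/Clang builtin used for the trailing-zero count):

    #include <cassert>
    #include <cstdint>

    int main() {
      uint64_t M = ~uint64_t(0xFFF);   // the bit pattern of -4096
      unsigned K = __builtin_ctzll(M); // 12
      for (uint64_t X : {uint64_t(0), uint64_t(0xFFF), uint64_t(0x1000),
                         uint64_t(0xDEADBEEF)})
        assert(((X & M) == 0) == ((X >> K) == 0));
      return 0;
    }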
+ if (DCI.isAfterLegalizeDAG() && isNullConstant(N1) && + N0.getOpcode() == ISD::AND && N0.hasOneUse() && + isa<ConstantSDNode>(N0.getOperand(1))) { + const APInt &AndRHSC = + cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue(); + if (!isInt<12>(AndRHSC.getSExtValue()) && AndRHSC.isNegatedPowerOf2()) { + unsigned ShiftBits = AndRHSC.countr_zero(); + SDValue Shift = DAG.getNode(ISD::SRL, dl, VT, N0.getOperand(0), + DAG.getConstant(ShiftBits, dl, VT)); + return DAG.getSetCC(dl, VT, Shift, N1, Cond); + } + } + if (OpVT != MVT::i64 || !Subtarget.is64Bit()) return SDValue(); @@ -16582,27 +16655,39 @@ static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG, } static SDValue -performSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG, +performSIGN_EXTEND_INREGCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const RISCVSubtarget &Subtarget) { + SelectionDAG &DAG = DCI.DAG; SDValue Src = N->getOperand(0); EVT VT = N->getValueType(0); EVT SrcVT = cast<VTSDNode>(N->getOperand(1))->getVT(); unsigned Opc = Src.getOpcode(); + SDLoc DL(N); // Fold (sext_inreg (fmv_x_anyexth X), i16) -> (fmv_x_signexth X) // Don't do this with Zhinx. We need to explicitly sign extend the GPR. if (Opc == RISCVISD::FMV_X_ANYEXTH && SrcVT.bitsGE(MVT::i16) && Subtarget.hasStdExtZfhmin()) - return DAG.getNode(RISCVISD::FMV_X_SIGNEXTH, SDLoc(N), VT, - Src.getOperand(0)); + return DAG.getNode(RISCVISD::FMV_X_SIGNEXTH, DL, VT, Src.getOperand(0)); // Fold (sext_inreg (shl X, Y), i32) -> (sllw X, Y) iff Y u< 32 if (Opc == ISD::SHL && Subtarget.is64Bit() && SrcVT == MVT::i32 && VT == MVT::i64 && !isa<ConstantSDNode>(Src.getOperand(1)) && DAG.computeKnownBits(Src.getOperand(1)).countMaxActiveBits() <= 5) - return DAG.getNode(RISCVISD::SLLW, SDLoc(N), VT, Src.getOperand(0), + return DAG.getNode(RISCVISD::SLLW, DL, VT, Src.getOperand(0), Src.getOperand(1)); + // Fold (sext_inreg (setcc), i1) -> (sub 0, (setcc)) + if (Opc == ISD::SETCC && SrcVT == MVT::i1 && DCI.isAfterLegalizeDAG()) + return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Src); + + // Fold (sext_inreg (xor (setcc), -1), i1) -> (add (setcc), -1) + if (Opc == ISD::XOR && SrcVT == MVT::i1 && + isAllOnesConstant(Src.getOperand(1)) && + Src.getOperand(0).getOpcode() == ISD::SETCC && DCI.isAfterLegalizeDAG()) + return DAG.getNode(ISD::ADD, DL, VT, Src.getOperand(0), + DAG.getAllOnesConstant(DL, VT)); + return SDValue(); } @@ -17461,7 +17546,7 @@ static SDValue combineOp_VLToVWOp_VL(SDNode *N, return SDValue(); SmallVector<SDNode *> Worklist; - SmallSet<SDNode *, 8> Inserted; + SmallPtrSet<SDNode *, 8> Inserted; Worklist.push_back(N); Inserted.insert(N); SmallVector<CombineResult> CombinesToApply; @@ -20022,9 +20107,9 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, return SDValue(); } case ISD::SETCC: - return performSETCCCombine(N, DAG, Subtarget); + return performSETCCCombine(N, DCI, Subtarget); case ISD::SIGN_EXTEND_INREG: - return performSIGN_EXTEND_INREGCombine(N, DAG, Subtarget); + return performSIGN_EXTEND_INREGCombine(N, DCI, Subtarget); case ISD::ZERO_EXTEND: // Fold (zero_extend (fp_to_uint X)) to prevent forming fcvt+zexti32 during // type legalization. This is safe because fp_to_uint produces poison if @@ -20580,10 +20665,11 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, // Combine store of vmv.x.s/vfmv.f.s to vse with VL of 1. // vfmv.f.s is represented as extract element from 0. Match it late to avoid // any illegal types. 
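The two new i1 folds in performSIGN_EXTEND_INREGCombine above follow from setcc producing only 0 or 1: sign-extending that single bit is the same as negating it, and sign-extending its bitwise complement is the same as adding -1. A small model (the helper name is invented):

    #include <cassert>
    #include <cstdint>

    // Sign-extend from bit 0, i.e. sext_inreg(..., i1).
    int64_t sextI1(int64_t V) { return (V & 1) ? -1 : 0; }

    int main() {
      for (int64_t B : {int64_t(0), int64_t(1)}) {
        assert(sextI1(B) == 0 - B);       // (sext_inreg (setcc), i1) -> (sub 0, (setcc))
        assert(sextI1(B ^ -1) == B + -1); // xor-with-allones variant -> (add (setcc), -1)
      }
      return 0;
    }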
- if (Val.getOpcode() == RISCVISD::VMV_X_S || - (DCI.isAfterLegalizeDAG() && - Val.getOpcode() == ISD::EXTRACT_VECTOR_ELT && - isNullConstant(Val.getOperand(1)))) { + if ((Val.getOpcode() == RISCVISD::VMV_X_S || + (DCI.isAfterLegalizeDAG() && + Val.getOpcode() == ISD::EXTRACT_VECTOR_ELT && + isNullConstant(Val.getOperand(1)))) && + Val.hasOneUse()) { SDValue Src = Val.getOperand(0); MVT VecVT = Src.getSimpleValueType(); // VecVT should be scalable and memory VT should match the element type. @@ -20673,12 +20759,22 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, isNullConstant(Src.getOperand(1)) && Src.getOperand(0).getValueType().isScalableVector()) { EVT VT = N->getValueType(0); - EVT SrcVT = Src.getOperand(0).getValueType(); - assert(SrcVT.getVectorElementType() == VT.getVectorElementType()); + SDValue EVSrc = Src.getOperand(0); + EVT EVSrcVT = EVSrc.getValueType(); + assert(EVSrcVT.getVectorElementType() == VT.getVectorElementType()); // Widths match, just return the original vector. - if (SrcVT == VT) - return Src.getOperand(0); - // TODO: Use insert_subvector/extract_subvector to change widen/narrow? + if (EVSrcVT == VT) + return EVSrc; + SDLoc DL(N); + // Width is narrower, using insert_subvector. + if (EVSrcVT.getVectorMinNumElements() < VT.getVectorMinNumElements()) { + return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), + EVSrc, + DAG.getConstant(0, DL, Subtarget.getXLenVT())); + } + // Width is wider, using extract_subvector. + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, EVSrc, + DAG.getConstant(0, DL, Subtarget.getXLenVT())); } [[fallthrough]]; } @@ -22270,20 +22366,12 @@ void RISCVTargetLowering::analyzeInputArgs( MachineFunction &MF, CCState &CCInfo, const SmallVectorImpl<ISD::InputArg> &Ins, bool IsRet, RISCVCCAssignFn Fn) const { - FunctionType *FType = MF.getFunction().getFunctionType(); - for (const auto &[Idx, In] : enumerate(Ins)) { MVT ArgVT = In.VT; ISD::ArgFlagsTy ArgFlags = In.Flags; - Type *ArgTy = nullptr; - if (IsRet) - ArgTy = FType->getReturnType(); - else if (In.isOrigArg()) - ArgTy = FType->getParamType(In.getOrigArgIndex()); - if (Fn(Idx, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo, IsRet, - ArgTy)) { + In.OrigTy)) { LLVM_DEBUG(dbgs() << "InputArg #" << Idx << " has unhandled type " << ArgVT << '\n'); llvm_unreachable(nullptr); @@ -22298,10 +22386,9 @@ void RISCVTargetLowering::analyzeOutputArgs( for (const auto &[Idx, Out] : enumerate(Outs)) { MVT ArgVT = Out.VT; ISD::ArgFlagsTy ArgFlags = Out.Flags; - Type *OrigTy = CLI ? 
CLI->getArgs()[Out.OrigArgIndex].Ty : nullptr; if (Fn(Idx, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo, IsRet, - OrigTy)) { + Out.OrigTy)) { LLVM_DEBUG(dbgs() << "OutputArg #" << Idx << " has unhandled type " << ArgVT << "\n"); llvm_unreachable(nullptr); @@ -23083,7 +23170,7 @@ bool RISCVTargetLowering::CanLowerReturn( MVT VT = Outs[i].VT; ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; if (CC_RISCV(i, VT, VT, CCValAssign::Full, ArgFlags, CCInfo, - /*IsRet=*/true, nullptr)) + /*IsRet=*/true, Outs[i].OrigTy)) return false; } return true; @@ -23343,6 +23430,12 @@ RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, &RISCV::VRN2M4RegClass}) { if (TRI->isTypeLegalForClass(*RC, VT.SimpleTy)) return std::make_pair(0U, RC); + + if (VT.isFixedLengthVector() && useRVVForFixedLengthVectorVT(VT)) { + MVT ContainerVT = getContainerForFixedLengthVector(VT); + if (TRI->isTypeLegalForClass(*RC, ContainerVT)) + return std::make_pair(0U, RC); + } } } else if (Constraint == "vd") { for (const auto *RC : @@ -23356,10 +23449,24 @@ RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, &RISCV::VRN2M4NoV0RegClass}) { if (TRI->isTypeLegalForClass(*RC, VT.SimpleTy)) return std::make_pair(0U, RC); + + if (VT.isFixedLengthVector() && useRVVForFixedLengthVectorVT(VT)) { + MVT ContainerVT = getContainerForFixedLengthVector(VT); + if (TRI->isTypeLegalForClass(*RC, ContainerVT)) + return std::make_pair(0U, RC); + } } } else if (Constraint == "vm") { if (TRI->isTypeLegalForClass(RISCV::VMV0RegClass, VT.SimpleTy)) return std::make_pair(0U, &RISCV::VMV0RegClass); + + if (VT.isFixedLengthVector() && useRVVForFixedLengthVectorVT(VT)) { + MVT ContainerVT = getContainerForFixedLengthVector(VT); + // VT here might be coerced to vector with i8 elements, so we need to + // check if this is a M1 register here instead of checking VMV0RegClass. 
+ if (TRI->isTypeLegalForClass(RISCV::VRRegClass, ContainerVT)) + return std::make_pair(0U, &RISCV::VMV0RegClass); + } } else if (Constraint == "cr") { if (VT == MVT::f16 && Subtarget.hasStdExtZhinxmin()) return std::make_pair(0U, &RISCV::GPRF16CRegClass); @@ -24237,7 +24344,12 @@ bool RISCVTargetLowering::splitValueIntoRegisterParts( return true; } - if (ValueVT.isScalableVector() && PartVT.isScalableVector()) { + if ((ValueVT.isScalableVector() || ValueVT.isFixedLengthVector()) && + PartVT.isScalableVector()) { + if (ValueVT.isFixedLengthVector()) { + ValueVT = getContainerForFixedLengthVector(ValueVT.getSimpleVT()); + Val = convertToScalableVector(ValueVT, Val, DAG, Subtarget); + } LLVMContext &Context = *DAG.getContext(); EVT ValueEltVT = ValueVT.getVectorElementType(); EVT PartEltVT = PartVT.getVectorElementType(); @@ -24307,12 +24419,17 @@ SDValue RISCVTargetLowering::joinRegisterPartsIntoValue( return Val; } - if (ValueVT.isScalableVector() && PartVT.isScalableVector()) { + if ((ValueVT.isScalableVector() || ValueVT.isFixedLengthVector()) && + PartVT.isScalableVector()) { LLVMContext &Context = *DAG.getContext(); SDValue Val = Parts[0]; EVT ValueEltVT = ValueVT.getVectorElementType(); EVT PartEltVT = PartVT.getVectorElementType(); unsigned ValueVTBitSize = ValueVT.getSizeInBits().getKnownMinValue(); + if (ValueVT.isFixedLengthVector()) + ValueVTBitSize = getContainerForFixedLengthVector(ValueVT.getSimpleVT()) + .getSizeInBits() + .getKnownMinValue(); unsigned PartVTBitSize = PartVT.getSizeInBits().getKnownMinValue(); if (PartVTBitSize % ValueVTBitSize == 0) { assert(PartVTBitSize >= ValueVTBitSize); @@ -24330,7 +24447,10 @@ SDValue RISCVTargetLowering::joinRegisterPartsIntoValue( EVT::getVectorVT(Context, ValueEltVT, Count, /*IsScalable=*/true); Val = DAG.getNode(ISD::BITCAST, DL, SameEltTypeVT, Val); } - Val = DAG.getExtractSubvector(DL, ValueVT, Val, 0); + if (ValueVT.isFixedLengthVector()) + Val = convertFromScalableVector(ValueVT, Val, DAG, Subtarget); + else + Val = DAG.getExtractSubvector(DL, ValueVT, Val, 0); return Val; } } diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index 433b8be..fb63ebc 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -431,8 +431,8 @@ public: bool lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles, - ArrayRef<unsigned> Indices, - unsigned Factor) const override; + ArrayRef<unsigned> Indices, unsigned Factor, + const APInt &GapMask) const override; bool lowerInterleavedStore(Instruction *Store, Value *Mask, ShuffleVectorInst *SVI, diff --git a/llvm/lib/Target/RISCV/RISCVInstrFormats.td b/llvm/lib/Target/RISCV/RISCVInstrFormats.td index d9c6101..878a0ec 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrFormats.td +++ b/llvm/lib/Target/RISCV/RISCVInstrFormats.td @@ -261,6 +261,12 @@ class RVInstCommon<dag outs, dag ins, string opcodestr, string argstr, // Indicates the EEW of a vector instruction's destination operand. EEW DestEEW = EEWSEWx1; let TSFlags{25-24} = DestEEW.Value; + + // Some vector instructions like vslidedown/vrgather will read elements past + // VL, and should be marked to make sure RISCVVLOptimizer doesn't reduce its + // operands' VLs. 
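To make the comment above concrete, here is a rough scalar model of vslidedown.vx under stated assumptions (elements past VLMAX read as 0; all names are invented): even though only VL destination elements are written, source indices run up to VL + offset - 1, so shrinking the source operand's VL would change the result, which is exactly what the new flag tells RISCVVLOptimizer not to do.

    #include <cassert>
    #include <cstdint>
    #include <vector>

    std::vector<uint32_t> slideDown(const std::vector<uint32_t> &Src, unsigned VL,
                                    unsigned Offset) {
      std::vector<uint32_t> Dst(VL, 0);
      for (unsigned I = 0; I < VL; ++I)
        Dst[I] = (I + Offset < Src.size()) ? Src[I + Offset] : 0;
      return Dst;
    }

    int main() {
      std::vector<uint32_t> Src = {10, 11, 12, 13, 14, 15, 16, 17}; // VLMAX = 8
      auto Dst = slideDown(Src, /*VL=*/4, /*Offset=*/2);
      assert(Dst[3] == Src[5]); // reads source element 5, which is past VL = 4
      return 0;
    }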
+ bit ReadsPastVL = 0; + let TSFlags{26} = ReadsPastVL; } class RVInst<dag outs, dag ins, string opcodestr, string argstr, diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td index 8bd3830..836a2b1 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td @@ -1694,6 +1694,16 @@ multiclass SelectCC_GPR_riirr<DAGOperand valty, DAGOperand imm> { valty:$truev, valty:$falsev), []>; } +let Predicates = [IsRV32] in { +def : Pat<(i32 (setlt (i32 GPR:$rs1), 0)), (SRLI GPR:$rs1, 31)>; // compressible +} +let Predicates = [IsRV64] in { +def : Pat<(i64 (seteq (i64 (and GPR:$rs1, 0x0000000080000000)), 0)), + (XORI (i64 (SRLIW GPR:$rs1, 31)), 1)>; +def : Pat<(i64 (setlt (i64 GPR:$rs1), 0)), (SRLI GPR:$rs1, 63)>; // compressible +def : Pat<(i64 (setlt (sext_inreg GPR:$rs1, i32), 0)), (SRLIW GPR:$rs1, 31)>; +} + /// Branches and jumps // Match `riscv_brcc` and lower to the appropriate RISC-V branch instruction. @@ -2367,6 +2377,7 @@ include "RISCVInstrInfoXqccmp.td" include "RISCVInstrInfoXMips.td" include "RISCVInstrInfoXRivos.td" include "RISCVInstrInfoXAndes.td" +include "RISCVInstrInfoXSpacemiT.td" //===----------------------------------------------------------------------===// // Global ISel diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td index 8297d50..1e22c2d 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td @@ -18,7 +18,26 @@ // Operand and SDNode transformation definitions. //===----------------------------------------------------------------------===// -def simm10 : RISCVSImmLeafOp<10>; +def simm10 : RISCVSImmOp<10>; + +def SImm8UnsignedAsmOperand : SImmAsmOperand<8, "Unsigned"> { + let RenderMethod = "addSImm8UnsignedOperands"; +} + +// A 8-bit signed immediate allowing range [-128, 255] +// but represented as [-128, 255]. +def simm8_unsigned : RISCVOp { + let ParserMatchClass = SImm8UnsignedAsmOperand; + let EncoderMethod = "getImmOpValue"; + let DecoderMethod = "decodeSImmOperand<8>"; + let OperandType = "OPERAND_SIMM10"; + let MCOperandPredicate = [{ + int64_t Imm; + if (!MCOp.evaluateAsConstantImm(Imm)) + return false; + return isInt<8>(Imm); + }]; +} def SImm10UnsignedAsmOperand : SImmAsmOperand<10, "Unsigned"> { let RenderMethod = "addSImm10UnsignedOperands"; @@ -43,49 +62,40 @@ def simm10_unsigned : RISCVOp { // Instruction class templates //===----------------------------------------------------------------------===// -let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in -class PLI_i<bits<7> funct7, string opcodestr> - : RVInst<(outs GPR:$rd), (ins simm10:$imm10), opcodestr, "$rd, $imm10", [], +// Common base for pli.b/h/w and plui.h/w +class RVPLoadImm_i<bits<7> funct7, dag ins, string opcodestr, + string argstr> + : RVInst<(outs GPR:$rd), ins, opcodestr, argstr, [], InstFormatOther> { - bits<10> imm10; bits<5> rd; let Inst{31-25} = funct7; - let Inst{24-16} = imm10{8-0}; - let Inst{15} = imm10{9}; let Inst{14-12} = 0b010; let Inst{11-7} = rd; let Inst{6-0} = OPC_OP_IMM_32.Value; + + let hasSideEffects = 0; + let mayLoad = 0; + let mayStore = 0; } -let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in -class PLUI_i<bits<7> funct7, string opcodestr> - : RVInst<(outs GPR:$rd), (ins simm10_unsigned:$imm10), opcodestr, - "$rd, $imm10", [], InstFormatOther> { +// Base for pli.h/w. 
+class PLI_i<bits<7> funct7, string opcodestr> + : RVPLoadImm_i<funct7, (ins simm10:$imm10), opcodestr, "$rd, $imm10"> { bits<10> imm10; - bits<5> rd; - let Inst{31-25} = funct7; - let Inst{24} = imm10{0}; - let Inst{23-15} = imm10{9-1}; - let Inst{14-12} = 0b010; - let Inst{11-7} = rd; - let Inst{6-0} = OPC_OP_IMM_32.Value; + let Inst{24-16} = imm10{8-0}; + let Inst{15} = imm10{9}; } -let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in -class PLI_B_i<bits<8> funct8, string opcodestr> - : RVInst<(outs GPR:$rd), (ins uimm8:$uimm8), opcodestr, "$rd, $uimm8", [], - InstFormatOther> { - bits<8> uimm8; - bits<5> rd; +// Base for plui.h/w. +class PLUI_i<bits<7> funct7, string opcodestr> + : RVPLoadImm_i<funct7, (ins simm10_unsigned:$imm10), opcodestr, + "$rd, $imm10"> { + bits<10> imm10; - let Inst{31-24} = funct8; - let Inst{23-16} = uimm8; - let Inst{15} = 0b0; - let Inst{14-12} = 0b010; - let Inst{11-7} = rd; - let Inst{6-0} = OPC_OP_IMM_32.Value; + let Inst{24} = imm10{0}; + let Inst{23-15} = imm10{9-1}; } let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in @@ -98,6 +108,14 @@ class RVPShift_ri<bits<3> f, bits<3> funct3, string opcodestr, Operand ImmType> let Inst{27} = 0b0; } +class RVPShiftD_ri<bits<3> f, bits<3> funct3, string opcodestr> + : RVPShift_ri<f, funct3, opcodestr, uimm6> { + bits<6> shamt; + + let Inst{26} = 0b1; + let Inst{25-20} = shamt; +} + class RVPShiftW_ri<bits<3> f, bits<3> funct3, string opcodestr> : RVPShift_ri<f, funct3, opcodestr, uimm5> { bits<5> shamt; @@ -131,59 +149,477 @@ class RVPUnary_ri<bits<2> w, bits<5> uf, string opcodestr> let Inst{24-20} = uf; } +let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in +class RVPBinaryScalar_rr<bits<3> f, bits<2> w, bits<3> funct3, string opcodestr> + : RVInstRBase<funct3, OPC_OP_IMM_32, (outs GPR:$rd), + (ins GPR:$rs1, GPR:$rs2), opcodestr, "$rd, $rs1, $rs2"> { + let Inst{31} = 0b1; + let Inst{30-28} = f; + let Inst{27} = 0b1; + let Inst{26-25} = w; +} + +let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in +class RVPBinary_rr<bits<4> f, bits<2> w, bits<3> funct3, string opcodestr> + : RVInstRBase<funct3, OPC_OP_32, (outs GPR:$rd), + (ins GPR:$rs1, GPR:$rs2), opcodestr, "$rd, $rs1, $rs2"> { + let Inst{31} = 0b1; + let Inst{30-27} = f; + let Inst{26-25} = w; +} + +let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in +class RVPTernary_rrr<bits<4> f, bits<2> w, bits<3> funct3, string opcodestr> + : RVInstRBase<funct3, OPC_OP_32, (outs GPR:$rd_wb), + (ins GPR:$rd, GPR:$rs1, GPR:$rs2), opcodestr, + "$rd, $rs1, $rs2"> { + let Inst{31} = 0b1; + let Inst{30-27} = f; + let Inst{26-25} = w; + + let Constraints = "$rd = $rd_wb"; +} + +// Common base for pli.db/h/w and plui.dh/w +class RVPPairLoadImm_i<bits<7> funct7, dag ins, string opcodestr, + string argstr> + : RVInst<(outs GPRPairRV32:$rd), ins, opcodestr, argstr, [], + InstFormatOther> { + bits<5> rd; + + let Inst{31-25} = funct7; + let Inst{14-12} = 0b010; + let Inst{11-8} = rd{4-1}; + let Inst{7} = 0b0; + let Inst{6-0} = OPC_OP_IMM_32.Value; + + let hasSideEffects = 0; + let mayLoad = 0; + let mayStore = 0; +} + //===----------------------------------------------------------------------===// // Instructions //===----------------------------------------------------------------------===// let Predicates = [HasStdExtP] in { -let IsSignExtendingOpW = 1 in -def CLS : Unary_r<0b011000000011, 0b001, "cls">; -def ABS : Unary_r<0b011000000111, 0b001, "abs">; + let IsSignExtendingOpW = 1 in + def CLS : Unary_r<0b011000000011, 0b001, "cls">; + def ABS : Unary_r<0b011000000111, 0b001, 
"abs">; } // Predicates = [HasStdExtP] -let Predicates = [HasStdExtP, IsRV32] in -def REV_RV32 : Unary_r<0b011010011111, 0b101, "rev">; + +let Predicates = [HasStdExtP, IsRV32] in { + def REV_RV32 : Unary_r<0b011010011111, 0b101, "rev">; +} // Predicates = [HasStdExtP, IsRV32] let Predicates = [HasStdExtP, IsRV64] in { -def REV16 : Unary_r<0b011010110000, 0b101, "rev16">; -def REV_RV64 : Unary_r<0b011010111111, 0b101, "rev">; + def REV16 : Unary_r<0b011010110000, 0b101, "rev16">; + def REV_RV64 : Unary_r<0b011010111111, 0b101, "rev">; -let IsSignExtendingOpW = 1 in { -def CLSW : UnaryW_r<0b011000000011, 0b001, "clsw">; -def ABSW : UnaryW_r<0b011000000111, 0b001, "absw">; -} + let IsSignExtendingOpW = 1 in { + def CLSW : UnaryW_r<0b011000000011, 0b001, "clsw">; + def ABSW : UnaryW_r<0b011000000111, 0b001, "absw">; + } } // Predicates = [HasStdExtP, IsRV64] let Predicates = [HasStdExtP] in { -def PSLLI_B : RVPShiftB_ri<0b000, 0b010, "pslli.b">; -def PSLLI_H : RVPShiftH_ri<0b000, 0b010, "pslli.h">; -def PSSLAI_H : RVPShiftH_ri<0b101, 0b010, "psslai.h">; + def PSLLI_B : RVPShiftB_ri<0b000, 0b010, "pslli.b">; + def PSLLI_H : RVPShiftH_ri<0b000, 0b010, "pslli.h">; + def PSSLAI_H : RVPShiftH_ri<0b101, 0b010, "psslai.h">; } // Predicates = [HasStdExtP] -let DecoderNamespace = "RV32Only", - Predicates = [HasStdExtP, IsRV32] in -def SSLAI : RVPShiftW_ri<0b101, 0b010, "sslai">; +let Predicates = [HasStdExtP, IsRV32], DecoderNamespace = "RV32Only" in { + def SSLAI : RVPShiftW_ri<0b101, 0b010, "sslai">; +} // Predicates = [HasStdExtP, IsRV32], DecoderNamespace = "RV32Only" let Predicates = [HasStdExtP, IsRV64] in { -def PSLLI_W : RVPShiftW_ri<0b000, 0b010, "pslli.w">; -def PSSLAI_W : RVPShiftW_ri<0b101, 0b010, "psslai.w">; + def PSLLI_W : RVPShiftW_ri<0b000, 0b010, "pslli.w">; + def PSSLAI_W : RVPShiftW_ri<0b101, 0b010, "psslai.w">; } // Predicates = [HasStdExtP, IsRV64] let Predicates = [HasStdExtP] in def PLI_H : PLI_i<0b1011000, "pli.h">; let Predicates = [HasStdExtP, IsRV64] in def PLI_W : PLI_i<0b1011001, "pli.w">; -let Predicates = [HasStdExtP] in -def PLI_B : PLI_B_i<0b10110100, "pli.b">; +let Predicates = [HasStdExtP] in { + def PLI_B : RVPLoadImm_i<0b1011010, (ins simm8_unsigned:$imm8), "pli.b", + "$rd, $imm8"> { + bits<8> imm8; + + let Inst{24} = 0b0; + let Inst{23-16} = imm8; + let Inst{15} = 0b0; + } +} let Predicates = [HasStdExtP] in { -def PSEXT_H_B : RVPUnary_ri<0b00, 0b00100, "psext.h.b">; -def PSABS_H : RVPUnary_ri<0b00, 0b00111, "psabs.h">; -def PSABS_B : RVPUnary_ri<0b10, 0b00111, "psabs.b">; + def PSEXT_H_B : RVPUnary_ri<0b00, 0b00100, "psext.h.b">; + def PSABS_H : RVPUnary_ri<0b00, 0b00111, "psabs.h">; + def PSABS_B : RVPUnary_ri<0b10, 0b00111, "psabs.b">; } // Predicates = [HasStdExtP] let Predicates = [HasStdExtP, IsRV64] in { -def PSEXT_W_B : RVPUnary_ri<0b01, 0b00100, "psext.w.b">; -def PSEXT_W_H : RVPUnary_ri<0b01, 0b00101, "psext.w.h">; + def PSEXT_W_B : RVPUnary_ri<0b01, 0b00100, "psext.w.b">; + def PSEXT_W_H : RVPUnary_ri<0b01, 0b00101, "psext.w.h">; } // Predicates = [HasStdExtP, IsRV64] let Predicates = [HasStdExtP] in def PLUI_H : PLUI_i<0b1111000, "plui.h">; let Predicates = [HasStdExtP, IsRV64] in def PLUI_W : PLUI_i<0b1111001, "plui.w">; + +let Predicates = [HasStdExtP] in { + def PSLL_HS : RVPBinaryScalar_rr<0b000, 0b00, 0b010, "psll.hs">; + def PSLL_BS : RVPBinaryScalar_rr<0b000, 0b10, 0b010, "psll.bs">; + + def PADD_HS : RVPBinaryScalar_rr<0b001, 0b00, 0b010, "padd.hs">; + def PADD_BS : RVPBinaryScalar_rr<0b001, 0b10, 0b010, "padd.bs">; + + def PSSHA_HS : 
RVPBinaryScalar_rr<0b110, 0b00, 0b010, "pssha.hs">; + + def PSSHAR_HS : RVPBinaryScalar_rr<0b111, 0b00, 0b010, "psshar.hs">; +} // Predicates = [HasStdExtP] +let Predicates = [HasStdExtP, IsRV32], DecoderNamespace = "RV32Only" in { + def SSHA : RVPBinaryScalar_rr<0b110, 0b01, 0b010, "ssha">; + + def SSHAR : RVPBinaryScalar_rr<0b111, 0b01, 0b010, "sshar">; +} // Predicates = [HasStdExtP, IsRV32] +let Predicates = [HasStdExtP, IsRV64] in { + def PSLL_WS : RVPBinaryScalar_rr<0b000, 0b01, 0b010, "psll.ws">; + + def PADD_WS : RVPBinaryScalar_rr<0b001, 0b01, 0b010, "padd.ws">; + + def PSSHA_WS : RVPBinaryScalar_rr<0b110, 0b01, 0b010, "pssha.ws">; + def SHA : RVPBinaryScalar_rr<0b110, 0b11, 0b010, "sha">; + + def PSSHAR_WS : RVPBinaryScalar_rr<0b111, 0b01, 0b010, "psshar.ws">; + def SHAR : RVPBinaryScalar_rr<0b111, 0b11, 0b010, "shar">; +} // Predicates = [HasStdExtP, IsRV64] + +let Predicates = [HasStdExtP] in { + def PSRLI_B : RVPShiftB_ri<0b000, 0b100, "psrli.b">; + def PSRLI_H : RVPShiftH_ri<0b000, 0b100, "psrli.h">; + + def PUSATI_H : RVPShiftH_ri<0b010, 0b100, "pusati.h">; + + def PSRAI_B : RVPShiftB_ri<0b100, 0b100, "psrai.b">; + def PSRAI_H : RVPShiftH_ri<0b100, 0b100, "psrai.h">; + + def PSRARI_H : RVPShiftH_ri<0b101, 0b100, "psrari.h">; + + def PSATI_H : RVPShiftH_ri<0b110, 0b100, "psati.h">; +} // Predicates = [HasStdExtP] +let Predicates = [HasStdExtP, IsRV32], DecoderNamespace = "RV32Only" in { + def USATI_RV32 : RVPShiftW_ri<0b010, 0b100, "usati">; + + def SRARI_RV32 : RVPShiftW_ri<0b101, 0b100, "srari">; + + def SATI_RV32 : RVPShiftW_ri<0b110, 0b100, "sati">; +} // Predicates = [HasStdExtP, IsRV32] +let Predicates = [HasStdExtP, IsRV64] in { + def PSRLI_W : RVPShiftW_ri<0b000, 0b100, "psrli.w">; + def PSRAI_W : RVPShiftW_ri<0b100, 0b100, "psrai.w">; + + def PUSATI_W : RVPShiftW_ri<0b010, 0b100, "pusati.w">; + def USATI_RV64 : RVPShiftD_ri<0b010, 0b100, "usati">; + + def PSRARI_W : RVPShiftW_ri<0b101, 0b100, "psrari.w">; + def SRARI_RV64 : RVPShiftD_ri<0b101, 0b100, "srari">; + + def PSATI_W : RVPShiftW_ri<0b110, 0b100, "psati.w">; + def SATI_RV64 : RVPShiftD_ri<0b110, 0b100, "sati">; +} // Predicates = [HasStdExtP, IsRV64] + +let Predicates = [HasStdExtP] in { + def PSRL_HS : RVPBinaryScalar_rr<0b000, 0b00, 0b100, "psrl.hs">; + def PSRL_BS : RVPBinaryScalar_rr<0b000, 0b10, 0b100, "psrl.bs">; + + def PREDSUM_HS : RVPBinaryScalar_rr<0b001, 0b00, 0b100, "predsum.hs">; + def PREDSUM_BS : RVPBinaryScalar_rr<0b001, 0b10, 0b100, "predsum.bs">; + + def PREDSUMU_HS : RVPBinaryScalar_rr<0b011, 0b00, 0b100, "predsumu.hs">; + def PREDSUMU_BS : RVPBinaryScalar_rr<0b011, 0b10, 0b100, "predsumu.bs">; + + def PSRA_HS : RVPBinaryScalar_rr<0b100, 0b00, 0b100, "psra.hs">; + def PSRA_BS : RVPBinaryScalar_rr<0b100, 0b10, 0b100, "psra.bs">; +} // Predicates = [HasStdExtP] +let Predicates = [HasStdExtP, IsRV64] in { + def PSRL_WS : RVPBinaryScalar_rr<0b000, 0b01, 0b100, "psrl.ws">; + + def PREDSUM_WS : RVPBinaryScalar_rr<0b001, 0b01, 0b100, "predsum.ws">; + + def PREDSUMU_WS : RVPBinaryScalar_rr<0b011, 0b01, 0b100, "predsumu.ws">; + + def PSRA_WS : RVPBinaryScalar_rr<0b100, 0b01, 0b100, "psra.ws">; +} // Predicates = [HasStdExtP, IsRV64] + +let Predicates = [HasStdExtP] in { + def PADD_H : RVPBinary_rr<0b0000, 0b00, 0b000, "padd.h">; + def PADD_B : RVPBinary_rr<0b0000, 0b10, 0b000, "padd.b">; + + def PSADD_H : RVPBinary_rr<0b0010, 0b00, 0b000, "psadd.h">; + def PSADD_B : RVPBinary_rr<0b0010, 0b10, 0b000, "psadd.b">; + + def PAADD_H : RVPBinary_rr<0b0011, 0b00, 0b000, "paadd.h">; + def PAADD_B : 
RVPBinary_rr<0b0011, 0b10, 0b000, "paadd.b">; + + def PSADDU_H : RVPBinary_rr<0b0110, 0b00, 0b000, "psaddu.h">; + def PSADDU_B : RVPBinary_rr<0b0110, 0b10, 0b000, "psaddu.b">; + + def PAADDU_H : RVPBinary_rr<0b0111, 0b00, 0b000, "paaddu.h">; + def PAADDU_B : RVPBinary_rr<0b0111, 0b10, 0b000, "paaddu.b">; + + def PSUB_H : RVPBinary_rr<0b1000, 0b00, 0b000, "psub.h">; + def PSUB_B : RVPBinary_rr<0b1000, 0b10, 0b000, "psub.b">; + + def PDIF_H : RVPBinary_rr<0b1001, 0b00, 0b000, "pdif.h">; + def PDIF_B : RVPBinary_rr<0b1001, 0b10, 0b000, "pdif.b">; + + def PSSUB_H : RVPBinary_rr<0b1010, 0b00, 0b000, "pssub.h">; + def PSSUB_B : RVPBinary_rr<0b1010, 0b10, 0b000, "pssub.b">; + + def PASUB_H : RVPBinary_rr<0b1011, 0b00, 0b000, "pasub.h">; + def PASUB_B : RVPBinary_rr<0b1011, 0b10, 0b000, "pasub.b">; + + def PDIFU_H : RVPBinary_rr<0b1101, 0b00, 0b000, "pdifu.h">; + def PDIFU_B : RVPBinary_rr<0b1101, 0b10, 0b000, "pdifu.b">; + + def PSSUBU_H : RVPBinary_rr<0b1110, 0b00, 0b000, "pssubu.h">; + def PSSUBU_B : RVPBinary_rr<0b1110, 0b10, 0b000, "pssubu.b">; + + def PASUBU_H : RVPBinary_rr<0b1111, 0b00, 0b000, "pasubu.h">; + def PASUBU_B : RVPBinary_rr<0b1111, 0b10, 0b000, "pasubu.b">; +} // Predicates = [HasStdExtP] +let Predicates = [HasStdExtP, IsRV32], DecoderNamespace = "RV32Only" in { + def SADD : RVPBinary_rr<0b0010, 0b01, 0b000, "sadd">; + + def AADD : RVPBinary_rr<0b0011, 0b01, 0b000, "aadd">; + + def SADDU : RVPBinary_rr<0b0110, 0b01, 0b000, "saddu">; + + def AADDU : RVPBinary_rr<0b0111, 0b01, 0b000, "aaddu">; + + def SSUB : RVPBinary_rr<0b1010, 0b01, 0b000, "ssub">; + + def ASUB : RVPBinary_rr<0b1011, 0b01, 0b000, "asub">; + + def SSUBU : RVPBinary_rr<0b1110, 0b01, 0b000, "ssubu">; + + def ASUBU : RVPBinary_rr<0b1111, 0b01, 0b000, "asubu">; +} // Predicates = [HasStdExtP, IsRV32], DecoderNamespace = "RV32Only" +let Predicates = [HasStdExtP, IsRV64] in { + def PADD_W : RVPBinary_rr<0b0000, 0b01, 0b000, "padd.w">; + + def PSADD_W : RVPBinary_rr<0b0010, 0b01, 0b000, "psadd.w">; + + def PAADD_W : RVPBinary_rr<0b0011, 0b01, 0b000, "paadd.w">; + + def PSADDU_W : RVPBinary_rr<0b0110, 0b01, 0b000, "psaddu.w">; + + def PAADDU_W : RVPBinary_rr<0b0111, 0b01, 0b000, "paaddu.w">; + + def PSUB_W : RVPBinary_rr<0b1000, 0b01, 0b000, "psub.w">; + + def PSSUB_W : RVPBinary_rr<0b1010, 0b01, 0b000, "pssub.w">; + + def PASUB_W : RVPBinary_rr<0b1011, 0b01, 0b000, "pasub.w">; + + def PSSUBU_W : RVPBinary_rr<0b1110, 0b01, 0b000, "pssubu.w">; + + def PASUBU_W : RVPBinary_rr<0b1111, 0b01, 0b000, "pasubu.w">; +} // Predicates = [HasStdExtP, IsRV64] + +let Predicates = [HasStdExtP] in { + def SLX : RVPBinary_rr<0b0001, 0b11, 0b001, "slx">; + + def PMUL_H_B01 : RVPBinary_rr<0b0010, 0b00, 0b001, "pmul.h.b01">; + + def MVM : RVPTernary_rrr<0b0101, 0b00, 0b001, "mvm">; + def MVMN : RVPTernary_rrr<0b0101, 0b01, 0b001, "mvmn">; + def MERGE : RVPTernary_rrr<0b0101, 0b10, 0b001, "merge">; + def SRX : RVPTernary_rrr<0b0101, 0b11, 0b001, "srx">; + + def PMULU_H_B01 : RVPBinary_rr<0b0110, 0b00, 0b001, "pmulu.h.b01">; + def PDIFSUMU_B : RVPBinary_rr<0b0110, 0b10, 0b001, "pdifsumu.b">; + + def PDIFSUMAU_B : RVPTernary_rrr<0b0111, 0b10, 0b001, "pdifsumau.b">; +} // Predicates = [HasStdExtP] +let Predicates = [HasStdExtP, IsRV32], DecoderNamespace = "RV32Only" in { + def MUL_H01 : RVPBinary_rr<0b0010, 0b01, 0b001, "mul.h01">; + + def MACC_H01 : RVPTernary_rrr<0b0011, 0b01, 0b001, "macc.h01">; + + def MULU_H01 : RVPBinary_rr<0b0110, 0b01, 0b001, "mulu.h01">; + + def MACCU_H01 : RVPTernary_rrr<0b0111, 0b01, 0b001, "maccu.h01">; +} // 
Predicates = [HasStdExtP, IsRV32], DecoderNamespace = "RV32Only" +let Predicates = [HasStdExtP, IsRV64] in { + def PMUL_W_H01 : RVPBinary_rr<0b0010, 0b01, 0b001, "pmul.w.h01">; + def MUL_W01 : RVPBinary_rr<0b0010, 0b11, 0b001, "mul.w01">; + + def PMACC_W_H01 : RVPTernary_rrr<0b0011, 0b01, 0b001, "pmacc.w.h01">; + def MACC_W01 : RVPTernary_rrr<0b0011, 0b11, 0b001, "macc.w01">; + + def PMULU_W_H01 : RVPBinary_rr<0b0110, 0b01, 0b001, "pmulu.w.h01">; + def MULU_W01 : RVPBinary_rr<0b0110, 0b11, 0b001, "mulu.w01">; + + def PMACCU_W_H01 : RVPTernary_rrr<0b0111, 0b01, 0b001, "pmaccu.w.h01">; + def MACCU_W01 : RVPTernary_rrr<0b0111, 0b11, 0b001, "maccu.w01">; +} // Predicates = [HasStdExtP, IsRV64] + +// Note the spec has a 3-bit f field in bits 30:28 with 0 in bit 27. +// Here we include the 0 in the f field to reduce number of tablegen classes. +let Predicates = [HasStdExtP] in { + def PSH1ADD_H : RVPBinary_rr<0b0100, 0b00, 0b010, "psh1add.h">; + + def PSSH1SADD_H : RVPBinary_rr<0b0110, 0b00, 0b010, "pssh1sadd.h">; +} // Predicates = [HasStdExtP] +let Predicates = [HasStdExtP, IsRV32], DecoderNamespace = "RV32Only" in { + def SSH1SADD : RVPBinary_rr<0b0110, 0b01, 0b010, "ssh1sadd">; +} // Predicates = [HasStdExtP, IsRV32], DecoderNamespace = "RV32Only" +let Predicates = [HasStdExtP, IsRV64] in { + def PSH1ADD_W : RVPBinary_rr<0b0100, 0b01, 0b010, "psh1add.w">; + + def PSSH1SADD_W : RVPBinary_rr<0b0110, 0b01, 0b010, "pssh1sadd.w">; + + def UNZIP8P : RVPBinary_rr<0b1100, 0b00, 0b010, "unzip8p">; + def UNZIP16P : RVPBinary_rr<0b1100, 0b01, 0b010, "unzip16p">; + def UNZIP8HP : RVPBinary_rr<0b1100, 0b10, 0b010, "unzip8hp">; + def UNZIP16HP : RVPBinary_rr<0b1100, 0b11, 0b010, "unzip16hp">; + + def ZIP8P : RVPBinary_rr<0b1110, 0b00, 0b010, "zip8p">; + def ZIP16P : RVPBinary_rr<0b1110, 0b01, 0b010, "zip16p">; + def ZIP8HP : RVPBinary_rr<0b1110, 0b10, 0b010, "zip8hp">; + def ZIP16HP : RVPBinary_rr<0b1110, 0b11, 0b010, "zip16hp">; +} // Predicates = [HasStdExtP, IsRV64] + +let Predicates = [HasStdExtP] in { + def PMUL_H_B00 : RVPBinary_rr<0b0000, 0b00, 0b011, "pmul.h.b00">; + + def PMUL_H_B11 : RVPBinary_rr<0b0010, 0b00, 0b011, "pmul.h.b11">; + + def PMULU_H_B00 : RVPBinary_rr<0b0100, 0b00, 0b011, "pmulu.h.b00">; + + def PMULU_H_B11 : RVPBinary_rr<0b0110, 0b00, 0b011, "pmulu.h.b11">; + + def PMULSU_H_B00 : RVPBinary_rr<0b1100, 0b00, 0b011, "pmulsu.h.b00">; + + def PMULSU_H_B11 : RVPBinary_rr<0b1110, 0b00, 0b011, "pmulsu.h.b11">; +} // Predicates = [HasStdExtP] +let Predicates = [HasStdExtP, IsRV32], DecoderNamespace = "RV32Only" in { + def MUL_H00 : RVPBinary_rr<0b0000, 0b01, 0b011, "mul.h00">; + + def MACC_H00 : RVPTernary_rrr<0b0001, 0b01, 0b011, "macc.h00">; + + def MUL_H11 : RVPBinary_rr<0b0010, 0b01, 0b011, "mul.h11">; + + def MACC_H11 : RVPTernary_rrr<0b0011, 0b01, 0b011, "macc.h11">; + + def MULU_H00 : RVPBinary_rr<0b0100, 0b01, 0b011, "mulu.h00">; + + def MACCU_H00 : RVPTernary_rrr<0b0101, 0b01, 0b011, "maccu.h00">; + + def MULU_H11 : RVPBinary_rr<0b0110, 0b01, 0b011, "mulu.h11">; + + def MACCU_H11 : RVPTernary_rrr<0b0111, 0b01, 0b011, "maccu.h11">; + + def MULSU_H00 : RVPBinary_rr<0b1100, 0b01, 0b011, "mulsu.h00">; + + def MACCSU_H00 : RVPTernary_rrr<0b1101, 0b01, 0b011, "maccsu.h00">; + + def MULSU_H11 : RVPBinary_rr<0b1110, 0b01, 0b011, "mulsu.h11">; + + def MACCSU_H11 : RVPTernary_rrr<0b1111, 0b01, 0b011, "maccsu.h11">; +} // Predicates = [HasStdExtP, IsRV32], DecoderNamespace = "RV32Only" +let Predicates = [HasStdExtP, IsRV64] in { + def PMUL_W_H00 : RVPBinary_rr<0b0000, 0b01, 0b011, 
"pmul.w.h00">; + def MUL_W00 : RVPBinary_rr<0b0000, 0b11, 0b011, "mul.w00">; + + def PMACC_W_H00 : RVPTernary_rrr<0b0001, 0b01, 0b011, "pmacc.w.h00">; + def MACC_W00 : RVPTernary_rrr<0b0001, 0b11, 0b011, "macc.w00">; + + def PMUL_W_H11 : RVPBinary_rr<0b0010, 0b01, 0b011, "pmul.w.h11">; + def MUL_W11 : RVPBinary_rr<0b0010, 0b11, 0b011, "mul.w11">; + + def PMACC_W_H11 : RVPTernary_rrr<0b0011, 0b01, 0b011, "pmacc.w.h11">; + def MACC_W11 : RVPTernary_rrr<0b0011, 0b11, 0b011, "macc.w11">; + + def PMULU_W_H00 : RVPBinary_rr<0b0100, 0b01, 0b011, "pmulu.w.h00">; + def MULU_W00 : RVPBinary_rr<0b0100, 0b11, 0b011, "mulu.w00">; + + def PMACCU_W_H00 : RVPTernary_rrr<0b0101, 0b01, 0b011, "pmaccu.w.h00">; + def MACCU_W00 : RVPTernary_rrr<0b0101, 0b11, 0b011, "maccu.w00">; + + def PMULU_W_H11 : RVPBinary_rr<0b0110, 0b01, 0b011, "pmulu.w.h11">; + def MULU_W11 : RVPBinary_rr<0b0110, 0b11, 0b011, "mulu.w11">; + + def PMACCU_W_H11 : RVPTernary_rrr<0b0111, 0b01, 0b011, "pmaccu.w.h11">; + def MACCU_W11 : RVPTernary_rrr<0b0111, 0b11, 0b011, "maccu.w11">; + + def PMULSU_W_H00 : RVPBinary_rr<0b1100, 0b01, 0b011, "pmulsu.w.h00">; + def MULSU_W00 : RVPBinary_rr<0b1100, 0b11, 0b011, "mulsu.w00">; + + def PMACCSU_W_H00 : RVPTernary_rrr<0b1101, 0b01, 0b011, "pmaccsu.w.h00">; + def MACCSU_W00 : RVPTernary_rrr<0b1101, 0b11, 0b011, "maccsu.w00">; + + def PMULSU_W_H11 : RVPBinary_rr<0b1110, 0b01, 0b011, "pmulsu.w.h11">; + def MULSU_W11 : RVPBinary_rr<0b1110, 0b11, 0b011, "mulsu.w11">; + + def PMACCSU_W_H11 : RVPTernary_rrr<0b1111, 0b01, 0b011, "pmaccsu.w.h11">; + def MACCSU_W11 : RVPTernary_rrr<0b1111, 0b11, 0b011, "maccsu.w11">; +} // Predicates = [HasStdExtP, IsRV64] + +// Note the spec has a 3-bit f field in bits 30:28 with 0 in bit 27. +// Here we include the 0 in the f field to reduce number of tablegen classes. 
+let Predicates = [HasStdExtP] in { + def PPACK_H : RVPBinary_rr<0b0000, 0b00, 0b100, "ppack.h">; + + def PPACKBT_H : RVPBinary_rr<0b0010, 0b00, 0b100, "ppackbt.h">; + + def PPACKTB_H : RVPBinary_rr<0b0100, 0b00, 0b100, "ppacktb.h">; + + def PPACKT_H : RVPBinary_rr<0b0110, 0b00, 0b100, "ppackt.h">; +} // Predicates = [HasStdExtP] +let Predicates = [HasStdExtP, IsRV32], DecoderNamespace = "RV32Only" in { + def PACKBT_RV32 : RVPBinary_rr<0b0010, 0b01, 0b100, "packbt">; + + def PACKTB_RV32 : RVPBinary_rr<0b0100, 0b01, 0b100, "packtb">; + + def PACKT_RV32 : RVPBinary_rr<0b0110, 0b01, 0b100, "packt">; +} // Predicates = [HasStdExtP, IsRV32], DecoderNamespace = "RV32Only" +let Predicates = [HasStdExtP, IsRV64] in { + def PPACK_W : RVPBinary_rr<0b0000, 0b01, 0b100, "ppack.w">; + + def PPACKBT_W : RVPBinary_rr<0b0010, 0b01, 0b100, "ppackbt.w">; + def PACKBT_RV64 : RVPBinary_rr<0b0010, 0b11, 0b100, "packbt">; + + def PPACKTB_W : RVPBinary_rr<0b0100, 0b01, 0b100, "ppacktb.w">; + def PACKTB_RV64 : RVPBinary_rr<0b0100, 0b11, 0b100, "packtb">; + + def PPACKT_W : RVPBinary_rr<0b0110, 0b01, 0b100, "ppackt.w">; + def PACKT_RV64 : RVPBinary_rr<0b0110, 0b11, 0b100, "packt">; +} // Predicates = [HasStdExtP, IsRV64] + +let Predicates = [HasStdExtP, IsRV32] in { + def PLI_DH : RVPPairLoadImm_i<0b0011000, (ins simm10:$imm10), "pli.dh", + "$rd, $imm10"> { + bits<10> imm10; + + let Inst{24-16} = imm10{8-0}; + let Inst{15} = imm10{9}; + } + + def PLI_DB : RVPPairLoadImm_i<0b0011010, (ins simm8_unsigned:$imm8), "pli.db", + "$rd, $imm8"> { + bits<8> imm8; + + let Inst{24} = 0b0; + let Inst{23-16} = imm8; + let Inst{15} = 0b0; + } + + def PLUI_DH : RVPPairLoadImm_i<0b0111000, (ins simm10_unsigned:$imm10), + "plui.dh", "$rd, $imm10"> { + bits<10> imm10; + + let Inst{24} = imm10{0}; + let Inst{23-15} = imm10{9-1}; + } +} diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoV.td b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td index 33c7138..cebab21 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoV.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td @@ -1703,8 +1703,9 @@ let Constraints = "@earlyclobber $vd", RVVConstraint = SlideUp in { defm VSLIDEUP_V : VSLD_IV_X_I<"vslideup", 0b001110, /*slidesUp=*/true>; defm VSLIDE1UP_V : VSLD1_MV_X<"vslide1up", 0b001110>; } // Constraints = "@earlyclobber $vd", RVVConstraint = SlideUp +let ReadsPastVL = 1 in defm VSLIDEDOWN_V : VSLD_IV_X_I<"vslidedown", 0b001111, /*slidesUp=*/false>; -let ElementsDependOn = EltDepsVL in +let ElementsDependOn = EltDepsVL, ReadsPastVL = 1 in defm VSLIDE1DOWN_V : VSLD1_MV_X<"vslide1down", 0b001111>; } // Predicates = [HasVInstructions] @@ -1712,19 +1713,19 @@ let Predicates = [HasVInstructionsAnyF] in { let Constraints = "@earlyclobber $vd", RVVConstraint = SlideUp in { defm VFSLIDE1UP_V : VSLD1_FV_F<"vfslide1up", 0b001110>; } // Constraints = "@earlyclobber $vd", RVVConstraint = SlideUp -let ElementsDependOn = EltDepsVL in +let ElementsDependOn = EltDepsVL, ReadsPastVL = 1 in defm VFSLIDE1DOWN_V : VSLD1_FV_F<"vfslide1down", 0b001111>; } // Predicates = [HasVInstructionsAnyF] let Predicates = [HasVInstructions] in { // Vector Register Gather Instruction -let Constraints = "@earlyclobber $vd", RVVConstraint = Vrgather in { +let Constraints = "@earlyclobber $vd", RVVConstraint = Vrgather, ReadsPastVL = 1 in { defm VRGATHER_V : VGTR_IV_V_X_I<"vrgather", 0b001100>; def VRGATHEREI16_VV : VALUVV<0b001110, OPIVV, "vrgatherei16.vv">, SchedBinaryMC<"WriteVRGatherEI16VV", "ReadVRGatherEI16VV_data", "ReadVRGatherEI16VV_index">; -} // Constraints = "@earlyclobber $vd", 
RVVConstraint = Vrgather +} // Constraints = "@earlyclobber $vd", RVVConstraint = Vrgather, ReadsPastVL = 1 // Vector Compress Instruction let Constraints = "@earlyclobber $vd", RVVConstraint = Vcompress, ElementsDependOn = EltDepsVLMask in { diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td index c75addd9..1fb30a0b 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td @@ -420,7 +420,7 @@ class NDSRVInstVD4DOT<bits<6> funct6, string opcodestr> } class NDSRVInstVBFHCvt<bits<5> vs1, string opcodestr> - : RVInst<(outs VR:$vd), (ins VR:$vs2, VMaskOp:$vm), + : RVInst<(outs VR:$vd), (ins VR:$vs2), opcodestr, "$vd, $vs2", [], InstFormatR> { bits<5> vs2; bits<5> vd; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXMips.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXMips.td index 0c8487c..889ea98 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXMips.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXMips.td @@ -129,20 +129,20 @@ class Mips_prefetch_ri<dag outs, dag ins, string opcodestr, string argstr> // MIPS extensions //===----------------------------------------------------------------------===// let Predicates = [HasVendorXMIPSCBOP] ,DecoderNamespace = "Xmipscbop" in { - def MIPS_PREFETCH : Mips_prefetch_ri<(outs), (ins GPR:$rs1, uimm9:$imm9, uimm5:$hint), - "mips.pref", "$hint, ${imm9}(${rs1})">, - Sched<[]>; + def MIPS_PREF : Mips_prefetch_ri<(outs), (ins GPR:$rs1, uimm9:$imm9, uimm5:$hint), + "mips.pref", "$hint, ${imm9}(${rs1})">, + Sched<[]>; } let Predicates = [HasVendorXMIPSCBOP] in { // Prefetch Data Write. def : Pat<(prefetch (AddrRegImm9 (XLenVT GPR:$rs1), uimm9:$imm9), (i32 1), timm, (i32 1)), - (MIPS_PREFETCH GPR:$rs1, uimm9:$imm9, 9)>; + (MIPS_PREF GPR:$rs1, uimm9:$imm9, 9)>; // Prefetch Data Read. 
def : Pat<(prefetch (AddrRegImm9 (XLenVT GPR:$rs1), uimm9:$imm9), (i32 0), timm, (i32 1)), - (MIPS_PREFETCH GPR:$rs1, uimm9:$imm9, 8)>; + (MIPS_PREF GPR:$rs1, uimm9:$imm9, 8)>; } let Predicates = [HasVendorXMIPSCMov], hasSideEffects = 0, mayLoad = 0, mayStore = 0, diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXRivos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXRivos.td index ebcf079..3a6ce3c 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXRivos.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXRivos.td @@ -58,7 +58,7 @@ class CustomRivosXVI<bits<6> funct6, RISCVVFormat opv, dag outs, dag ins, let Predicates = [HasVendorXRivosVizip], DecoderNamespace = "XRivos", Constraints = "@earlyclobber $vd", RVVConstraint = Vrgather, - Inst<6-0> = OPC_CUSTOM_2.Value in { + Inst<6-0> = OPC_CUSTOM_2.Value, ReadsPastVL = 1 in { defm RI_VZIPEVEN_V : VALU_IV_V<"ri.vzipeven", 0b001100>; defm RI_VZIPODD_V : VALU_IV_V<"ri.vzipodd", 0b011100>; defm RI_VZIP2A_V : VALU_IV_V<"ri.vzip2a", 0b000100>; @@ -126,6 +126,7 @@ def RI_VINSERT : CustomRivosVXI<0b010000, OPMVX, (outs VR:$vd_wb), (ins VR:$vd, GPR:$rs1, uimm5:$imm), "ri.vinsert.v.x", "$vd, $rs1, $imm">; +let ReadsPastVL = 1 in def RI_VEXTRACT : CustomRivosXVI<0b010111, OPMVV, (outs GPR:$rd), (ins VR:$vs2, uimm5:$imm), "ri.vextract.x.v", "$rd, $vs2, $imm">; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td index a47dfe3..b546339 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td @@ -74,6 +74,7 @@ class RVInstVCCustom2<bits<4> funct6_hi4, bits<3> funct3, dag outs, dag ins, let Uses = [VL, VTYPE]; let RVVConstraint = NoConstraint; let ElementsDependOn = EltDepsVLMask; + let ReadsPastVL = 1; } class RVInstVCFCustom2<bits<4> funct6_hi4, bits<3> funct3, dag outs, dag ins, @@ -98,6 +99,7 @@ class RVInstVCFCustom2<bits<4> funct6_hi4, bits<3> funct3, dag outs, dag ins, let Uses = [VL, VTYPE]; let RVVConstraint = NoConstraint; let ElementsDependOn = EltDepsVLMask; + let ReadsPastVL = 1; } class VCIXInfo<string suffix, VCIXType type, DAGOperand TyRd, diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXSfmm.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXSfmm.td index 66cb2d5..a5ee701 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXSfmm.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXSfmm.td @@ -65,6 +65,7 @@ class SFInstTileMemOp<dag outs, dag ins, bits<3> nf, RISCVOpcode opcode, let Inst{6-0} = opcode.Value; let Uses = [VTYPE, VL]; + let ReadsPastVL = 1; } let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in @@ -94,6 +95,7 @@ class SFInstTileMoveOp<bits<6> funct6, dag outs, dag ins, string opcodestr, let Inst{6-0} = OPC_OP_V.Value; let Uses = [VTYPE, VL]; + let ReadsPastVL = 1; } let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in @@ -113,6 +115,7 @@ class SFInstMatmulF<dag outs, dag ins, string opcodestr, string argstr> let Inst{6-0} = OPC_OP_VE.Value; let Uses = [VTYPE, VL]; + let ReadsPastVL = 1; } let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in @@ -135,6 +138,7 @@ class SFInstMatmulF8<bit a, bit b, dag outs, dag ins, let Inst{6-0} = OPC_OP_VE.Value; let Uses = [VTYPE, VL]; + let ReadsPastVL = 1; } @@ -167,6 +171,7 @@ class SFInstMatmulI8<bit funct6_1, bit a, bit b, dag outs, dag ins, let Inst{6-0} = OPC_OP_VE.Value; let Uses = [VTYPE, VL]; + let ReadsPastVL = 1; } class I8Encode<bit encoding, string name> { diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXSpacemiT.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXSpacemiT.td new file mode 100644 index 0000000..980931e --- 
/dev/null +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXSpacemiT.td @@ -0,0 +1,139 @@ +//===-- RISCVInstrInfoXSpacemiT.td -------------------------*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file describes the vendor extensions defined by SpacemiT. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Operand definitions. +//===----------------------------------------------------------------------===// + +class SMTVDotOpcode<bits<7> val> { + bits<7> Value = val; +} + +class SMTVEncoding2<bits<2> val> { + bits<2> Value = val; +} + +def OPMMA : SMTVDotOpcode<0b1110001>; +def OPMMA_SLIDE : SMTVDotOpcode<0b1110011>; + +//===----------------------------------------------------------------------===// +// Vector Dot-Product Sign Encoding +// Defines the signed/unsigned mixing modes for vector dot-product operations. +// Encoding format: [1:0] bits +// 00: UU (Unsigned x Unsigned) +// 01: US (Unsigned x Signed) +// 10: SU (Signed x Unsigned) +// 11: SS (Signed x Signed) +//===----------------------------------------------------------------------===// +def SMT_VDot_UU : SMTVEncoding2<0b00>; +def SMT_VDot_US : SMTVEncoding2<0b01>; +def SMT_VDot_SU : SMTVEncoding2<0b10>; +def SMT_VDot_SS : SMTVEncoding2<0b11>; + +//===----------------------------------------------------------------------===// +// Vector Dot-Product Sliding Window Modes +// Encoding format: [1:0] bits +// 00: Slide1 (1-element sliding stride) +// 01: Slide2 (2-element sliding stride) +// 10: Slide3 (3-element sliding stride) +// 11: Reserved +// +// Used in sliding-window dot-product operations: +// vd = vs1 • vs2.slide{1|2|3} // • = dot product +//===----------------------------------------------------------------------===// +def SMT_VDot_Slide1 : SMTVEncoding2<0b00>; +def SMT_VDot_Slide2 : SMTVEncoding2<0b01>; +def SMT_VDot_Slide3 : SMTVEncoding2<0b10>; + +//===----------------------------------------------------------------------===// +// Instruction formats +//===----------------------------------------------------------------------===// + +let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in { +// Base vector dot product (no slide) format. +class RVInstSMTVDot<SMTVEncoding2 sign, string opcodestr, string argstr> + : RVInst<(outs VRM2:$vd), (ins VR:$vs1, VR:$vs2), opcodestr, argstr, [], InstFormatR> { + bits<5> vd; + bits<5> vs1; + bits<5> vs2; + + let Inst{31-25} = OPMMA.Value; + let Inst{24-20} = vs2; + let Inst{19-15} = vs1; + let Inst{14} = 0b0; + let Inst{13-12} = sign.Value; + let Inst{11-8} = vd{4-1}; + let Inst{7} = 0b0; + let Inst{6-0} = OPC_CUSTOM_1.Value; +} + +// Sliding-window vector dot product format. 
+class RVInstSMTVDotSlide<SMTVEncoding2 funct2, SMTVEncoding2 sign, string opcodestr, string argstr> + : RVInst<(outs VRM2:$vd), (ins VRM2:$vs1, VR:$vs2), opcodestr, argstr, [], InstFormatR> { + bits<5> vd; + bits<5> vs1; + bits<5> vs2; + + let Inst{31-25} = OPMMA_SLIDE.Value; + let Inst{24-20} = vs2; + let Inst{19-16} = vs1{4-1}; + let Inst{15-14} = funct2.Value; + let Inst{13-12} = sign.Value; + let Inst{11-8} = vd{4-1}; + let Inst{7} = 0b0; + let Inst{6-0} = OPC_CUSTOM_1.Value; +} +} + +//===----------------------------------------------------------------------===// +// Instructions +//===----------------------------------------------------------------------===// + +let DecoderNamespace = "XSMT" in { + +let Predicates = [HasVendorXSMTVDot], ElementsDependOn = EltDepsVL in { +// Base vector dot product (no slide) instructions +// NOTE: Destination registers (vd) MUST be even-numbered (v0, v2, ..., v30) +// due to hardware alignment constraints. Using odd registers may cause undefined behavior. +def VMADOT : RVInstSMTVDot<SMT_VDot_SS, "smt.vmadot", "$vd, $vs1, $vs2">; +def VMADOTU : RVInstSMTVDot<SMT_VDot_UU, "smt.vmadotu", "$vd, $vs1, $vs2">; +def VMADOTSU : RVInstSMTVDot<SMT_VDot_SU, "smt.vmadotsu", "$vd, $vs1, $vs2">; +def VMADOTUS : RVInstSMTVDot<SMT_VDot_US, "smt.vmadotus", "$vd, $vs1, $vs2">; + +//===----------------------------------------------------------------------===// +// Sliding-window Vector Dot Product Instructions +// +// The numeric suffix (1, 2, 3) specifies the stride of the sliding window: +// 1: Window slides by 1 element per operation +// 2: Window slides by 2 elements per operation +// 3: Window slides by 3 elements per operation +// +// These instructions compute dot products with overlapping operand windows +// where the window position increments by <N> elements between computations. +//===----------------------------------------------------------------------===// +// NOTE: Destination registers (vd) and first source register (vs1) MUST be +// even-numbered (v0, v2, ..., v30) due to hardware alignment constraints. +// Using odd registers may cause undefined behavior. +def VMADOT1 : RVInstSMTVDotSlide<SMT_VDot_Slide1, SMT_VDot_SS, "smt.vmadot1", "$vd, $vs1, $vs2">; +def VMADOT1U : RVInstSMTVDotSlide<SMT_VDot_Slide1, SMT_VDot_UU, "smt.vmadot1u", "$vd, $vs1, $vs2">; +def VMADOT1SU : RVInstSMTVDotSlide<SMT_VDot_Slide1, SMT_VDot_SU, "smt.vmadot1su", "$vd, $vs1, $vs2">; +def VMADOT1US : RVInstSMTVDotSlide<SMT_VDot_Slide1, SMT_VDot_US, "smt.vmadot1us", "$vd, $vs1, $vs2">; +def VMADOT2 : RVInstSMTVDotSlide<SMT_VDot_Slide2, SMT_VDot_SS, "smt.vmadot2", "$vd, $vs1, $vs2">; +def VMADOT2U : RVInstSMTVDotSlide<SMT_VDot_Slide2, SMT_VDot_UU, "smt.vmadot2u", "$vd, $vs1, $vs2">; +def VMADOT2SU : RVInstSMTVDotSlide<SMT_VDot_Slide2, SMT_VDot_SU, "smt.vmadot2su", "$vd, $vs1, $vs2">; +def VMADOT2US : RVInstSMTVDotSlide<SMT_VDot_Slide2, SMT_VDot_US, "smt.vmadot2us", "$vd, $vs1, $vs2">; +def VMADOT3 : RVInstSMTVDotSlide<SMT_VDot_Slide3, SMT_VDot_SS, "smt.vmadot3", "$vd, $vs1, $vs2">; +def VMADOT3U : RVInstSMTVDotSlide<SMT_VDot_Slide3, SMT_VDot_UU, "smt.vmadot3u", "$vd, $vs1, $vs2">; +def VMADOT3SU : RVInstSMTVDotSlide<SMT_VDot_Slide3, SMT_VDot_SU, "smt.vmadot3su", "$vd, $vs1, $vs2">; +def VMADOT3US : RVInstSMTVDotSlide<SMT_VDot_Slide3, SMT_VDot_US, "smt.vmadot3us", "$vd, $vs1, $vs2">; +} +}
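To make the encoding layout above concrete, here is a small self-contained C++ sketch (an editor's illustration, not part of the patch) that assembles the 32-bit word for the base smt.vmadot form. It assumes OPC_CUSTOM_1 is the standard RISC-V custom-1 major opcode (0b0101011) and takes the 2-bit sign mode directly (0b11 corresponds to SMT_VDot_SS).

#include <cassert>
#include <cstdint>

// Field layout mirrors RVInstSMTVDot: OPMMA | vs2 | vs1 | 0 | sign | vd{4-1} | 0 | custom-1.
uint32_t encodeSMTVMADot(unsigned Vd, unsigned Vs1, unsigned Vs2, unsigned Sign) {
  assert(Vd % 2 == 0 && "vd must be an even-numbered vector register");
  uint32_t Inst = 0;
  Inst |= 0b1110001u << 25;        // Inst{31-25}: OPMMA
  Inst |= (Vs2 & 0x1Fu) << 20;     // Inst{24-20}: vs2
  Inst |= (Vs1 & 0x1Fu) << 15;     // Inst{19-15}: vs1
                                   // Inst{14}: fixed 0
  Inst |= (Sign & 0x3u) << 12;     // Inst{13-12}: sign mode (UU/US/SU/SS)
  Inst |= ((Vd >> 1) & 0xFu) << 8; // Inst{11-8}: vd{4-1}; bit 7 stays 0, forcing an even vd
  Inst |= 0b0101011u;              // Inst{6-0}: OPC_CUSTOM_1 (assumed value)
  return Inst;
}

// e.g. encodeSMTVMADot(2, 3, 4, 0b11) would correspond to "smt.vmadot v2, v3, v4".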
\ No newline at end of file diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td index 413ad8b..a31afaa 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td @@ -692,6 +692,21 @@ def : Pat<(binop_allwusers<or> (shl GPR:$op1rs1, (XLenVT 24))), (shl (zexti8 (XLenVT GPR:$op1rs2)), (XLenVT 16))), (PACKW GPR:$rs1, (XLenVT (PACKH GPR:$op1rs1, GPR:$op1rs2)))>; + +def : Pat<(i64 (or (or (zexti16 (XLenVT GPR:$rs1)), + (shl (zexti8 (XLenVT GPR:$op1rs2)), (XLenVT 16))), + (sext_inreg (shl GPR:$op1rs1, (XLenVT 24)), i32))), + (PACKW GPR:$rs1, (XLenVT (PACKH GPR:$op1rs1, GPR:$op1rs2)))>; + +// Match a pattern of 2 halfwords being inserted into bits [63:32], with +// bits [31:0] coming from a zero extended value. We can use pack with packw for +// bits [63:32]. If bits [31:0] can also be a packw, it can be matched +// separately. +def : Pat<(or (or (shl GPR:$op1rs2, (i64 48)), + (shl (zexti16 (i64 GPR:$op1rs1)), (i64 32))), + (zexti32 (i64 GPR:$rs1))), + (PACK (XLenVT GPR:$rs1), + (XLenVT (PACKW GPR:$op1rs1, GPR:$op1rs2)))>; } // Predicates = [HasStdExtZbkb, IsRV64] let Predicates = [HasStdExtZbb, IsRV32] in diff --git a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp index 726920e..c7b96f5 100644 --- a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp +++ b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp @@ -63,6 +63,12 @@ static const Intrinsic::ID FixedVlsegIntrIds[] = { Intrinsic::riscv_seg6_load_mask, Intrinsic::riscv_seg7_load_mask, Intrinsic::riscv_seg8_load_mask}; +static const Intrinsic::ID FixedVlssegIntrIds[] = { + Intrinsic::riscv_sseg2_load_mask, Intrinsic::riscv_sseg3_load_mask, + Intrinsic::riscv_sseg4_load_mask, Intrinsic::riscv_sseg5_load_mask, + Intrinsic::riscv_sseg6_load_mask, Intrinsic::riscv_sseg7_load_mask, + Intrinsic::riscv_sseg8_load_mask}; + static const Intrinsic::ID ScalableVlsegIntrIds[] = { Intrinsic::riscv_vlseg2_mask, Intrinsic::riscv_vlseg3_mask, Intrinsic::riscv_vlseg4_mask, Intrinsic::riscv_vlseg5_mask, @@ -197,9 +203,15 @@ static bool getMemOperands(unsigned Factor, VectorType *VTy, Type *XLenTy, /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1 bool RISCVTargetLowering::lowerInterleavedLoad( Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles, - ArrayRef<unsigned> Indices, unsigned Factor) const { + ArrayRef<unsigned> Indices, unsigned Factor, const APInt &GapMask) const { assert(Indices.size() == Shuffles.size()); + assert(GapMask.getBitWidth() == Factor); + // We only support cases where the skipped fields are the trailing ones. + // TODO: Lower to strided load if there is only a single active field.
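  // Worked example (an editor's illustration, not taken from this patch): for
  // a factor-4 interleaved group where only fields 0 and 1 are used, GapMask
  // is 0b0011, so MaskFactor == 2 and GapMask.isMask() holds. The group is
  // then lowered through FixedVlssegIntrIds[0] (a two-field strided segment
  // load) with a byte stride of Factor * sizeof(element), which simply skips
  // the two trailing fields in memory. A gap pattern such as 0b0101 fails
  // isMask() and is rejected here, leaving the access to the existing
  // shuffle-based lowering.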
+ unsigned MaskFactor = GapMask.popcount(); + if (MaskFactor < 2 || !GapMask.isMask()) + return false; IRBuilder<> Builder(Load); const DataLayout &DL = Load->getDataLayout(); @@ -208,20 +220,37 @@ bool RISCVTargetLowering::lowerInterleavedLoad( Value *Ptr, *VL; Align Alignment; - if (!getMemOperands(Factor, VTy, XLenTy, Load, Ptr, Mask, VL, Alignment)) + if (!getMemOperands(MaskFactor, VTy, XLenTy, Load, Ptr, Mask, VL, Alignment)) return false; Type *PtrTy = Ptr->getType(); unsigned AS = PtrTy->getPointerAddressSpace(); - if (!isLegalInterleavedAccessType(VTy, Factor, Alignment, AS, DL)) + if (!isLegalInterleavedAccessType(VTy, MaskFactor, Alignment, AS, DL)) return false; - CallInst *VlsegN = Builder.CreateIntrinsic( - FixedVlsegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy}, {Ptr, Mask, VL}); + CallInst *SegLoad = nullptr; + if (MaskFactor < Factor) { + // Lower to strided segmented load. + unsigned ScalarSizeInBytes = DL.getTypeStoreSize(VTy->getElementType()); + Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes); + SegLoad = Builder.CreateIntrinsic(FixedVlssegIntrIds[MaskFactor - 2], + {VTy, PtrTy, XLenTy, XLenTy}, + {Ptr, Stride, Mask, VL}); + } else { + // Lower to normal segmented load. + SegLoad = Builder.CreateIntrinsic(FixedVlsegIntrIds[Factor - 2], + {VTy, PtrTy, XLenTy}, {Ptr, Mask, VL}); + } for (unsigned i = 0; i < Shuffles.size(); i++) { - Value *SubVec = Builder.CreateExtractValue(VlsegN, Indices[i]); - Shuffles[i]->replaceAllUsesWith(SubVec); + unsigned FactorIdx = Indices[i]; + if (FactorIdx >= MaskFactor) { + // Replace masked-off factors (that are still extracted) with poison. + Shuffles[i]->replaceAllUsesWith(PoisonValue::get(VTy)); + } else { + Value *SubVec = Builder.CreateExtractValue(SegLoad, FactorIdx); + Shuffles[i]->replaceAllUsesWith(SubVec); + } } return true; diff --git a/llvm/lib/Target/RISCV/RISCVMoveMerger.cpp b/llvm/lib/Target/RISCV/RISCVMoveMerger.cpp index 7a2541a..d234dcf 100644 --- a/llvm/lib/Target/RISCV/RISCVMoveMerger.cpp +++ b/llvm/lib/Target/RISCV/RISCVMoveMerger.cpp @@ -26,6 +26,7 @@ struct RISCVMoveMerge : public MachineFunctionPass { RISCVMoveMerge() : MachineFunctionPass(ID) {} + const RISCVSubtarget *ST; const RISCVInstrInfo *TII; const TargetRegisterInfo *TRI; @@ -37,15 +38,15 @@ struct RISCVMoveMerge : public MachineFunctionPass { // Merge the two instructions indicated into a single pair instruction. MachineBasicBlock::iterator mergePairedInsns(MachineBasicBlock::iterator I, - MachineBasicBlock::iterator Paired, unsigned Opcode); + MachineBasicBlock::iterator Paired, bool MoveFromSToA); // Look for C.MV instruction that can be combined with // the given instruction into CM.MVA01S or CM.MVSA01. Return the matching // instruction if one exists. 
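  // (Zcmp/Xqccmp recap, for orientation: "cm.mva01s sa, sb" copies sa into a0
  // and sb into a1, while "cm.mvsa01 sa, sb" copies a0 into sa and a1 into sb;
  // each folds a pair of register moves into a single compressed instruction.)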
MachineBasicBlock::iterator - findMatchingInst(MachineBasicBlock::iterator &MBBI, unsigned InstOpcode, + findMatchingInst(MachineBasicBlock::iterator &MBBI, bool MoveFromSToA, const DestSourcePair &RegPair); - bool mergeMoveSARegPair(const RISCVSubtarget &STI, MachineBasicBlock &MBB); + bool mergeMoveSARegPair(MachineBasicBlock &MBB); bool runOnMachineFunction(MachineFunction &Fn) override; StringRef getPassName() const override { return RISCV_MOVE_MERGE_NAME; } @@ -58,41 +59,21 @@ char RISCVMoveMerge::ID = 0; INITIALIZE_PASS(RISCVMoveMerge, "riscv-move-merge", RISCV_MOVE_MERGE_NAME, false, false) -static bool isMoveFromAToS(unsigned Opcode) { - switch (Opcode) { - case RISCV::CM_MVA01S: - case RISCV::QC_CM_MVA01S: - return true; - default: - return false; - } -} - -static unsigned getMoveFromAToSOpcode(const RISCVSubtarget &STI) { - if (STI.hasStdExtZcmp()) +static unsigned getMoveFromSToAOpcode(const RISCVSubtarget &ST) { + if (ST.hasStdExtZcmp()) return RISCV::CM_MVA01S; - if (STI.hasVendorXqccmp()) + if (ST.hasVendorXqccmp()) return RISCV::QC_CM_MVA01S; llvm_unreachable("Unhandled subtarget with paired A to S move."); } -static bool isMoveFromSToA(unsigned Opcode) { - switch (Opcode) { - case RISCV::CM_MVSA01: - case RISCV::QC_CM_MVSA01: - return true; - default: - return false; - } -} - -static unsigned getMoveFromSToAOpcode(const RISCVSubtarget &STI) { - if (STI.hasStdExtZcmp()) +static unsigned getMoveFromAToSOpcode(const RISCVSubtarget &ST) { + if (ST.hasStdExtZcmp()) return RISCV::CM_MVSA01; - if (STI.hasVendorXqccmp()) + if (ST.hasVendorXqccmp()) return RISCV::QC_CM_MVSA01; llvm_unreachable("Unhandled subtarget with paired S to A move"); @@ -123,20 +104,24 @@ bool RISCVMoveMerge::isCandidateToMergeMVSA01(const DestSourcePair &RegPair) { MachineBasicBlock::iterator RISCVMoveMerge::mergePairedInsns(MachineBasicBlock::iterator I, MachineBasicBlock::iterator Paired, - unsigned Opcode) { + bool MoveFromSToA) { const MachineOperand *Sreg1, *Sreg2; MachineBasicBlock::iterator E = I->getParent()->end(); MachineBasicBlock::iterator NextI = next_nodbg(I, E); DestSourcePair FirstPair = TII->isCopyInstrImpl(*I).value(); DestSourcePair PairedRegs = TII->isCopyInstrImpl(*Paired).value(); - Register ARegInFirstPair = isMoveFromAToS(Opcode) - ? FirstPair.Destination->getReg() - : FirstPair.Source->getReg(); + Register ARegInFirstPair = MoveFromSToA ? FirstPair.Destination->getReg() + : FirstPair.Source->getReg(); if (NextI == Paired) NextI = next_nodbg(NextI, E); DebugLoc DL = I->getDebugLoc(); + // Make a copy so we can update the kill flag in the MoveFromSToA case. The + // copied operand needs to be scoped outside the if since we make a pointer + // to it. + MachineOperand PairedSource = *PairedRegs.Source; + // The order of S-reg depends on which instruction holds A0, instead of // the order of register pair. // e,g. @@ -146,10 +131,20 @@ RISCVMoveMerge::mergePairedInsns(MachineBasicBlock::iterator I, // mv a0, s2 // mv a1, s1 => cm.mva01s s2,s1 bool StartWithX10 = ARegInFirstPair == RISCV::X10; - if (isMoveFromAToS(Opcode)) { - Sreg1 = StartWithX10 ? FirstPair.Source : PairedRegs.Source; - Sreg2 = StartWithX10 ? PairedRegs.Source : FirstPair.Source; + unsigned Opcode; + if (MoveFromSToA) { + // We are moving one of the copies earlier so its kill flag may become + // invalid. Clear the copied kill flag if there are any reads of the + // register between the new location and the old location. 
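  // Illustrative hazard (assumed RISC-V pseudo-assembly, not from this patch):
  //   mv   a0, s1
  //   use  s2            ; some instruction that reads s2
  //   mv   a1, s2        ; s2 is marked <kill> here
  // Merging to "cm.mva01s s1, s2" moves the read of s2 up to the first mv, so
  // a kill flag copied verbatim from the second mv would wrongly mark s2 dead
  // before the intervening read; the loop below drops the flag in that case.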
+ for (auto It = std::next(I); It != Paired && PairedSource.isKill(); ++It) + if (It->readsRegister(PairedSource.getReg(), TRI)) + PairedSource.setIsKill(false); + + Opcode = getMoveFromSToAOpcode(*ST); + Sreg1 = StartWithX10 ? FirstPair.Source : &PairedSource; + Sreg2 = StartWithX10 ? &PairedSource : FirstPair.Source; } else { + Opcode = getMoveFromAToSOpcode(*ST); Sreg1 = StartWithX10 ? FirstPair.Destination : PairedRegs.Destination; Sreg2 = StartWithX10 ? PairedRegs.Destination : FirstPair.Destination; } @@ -163,7 +158,7 @@ RISCVMoveMerge::mergePairedInsns(MachineBasicBlock::iterator I, MachineBasicBlock::iterator RISCVMoveMerge::findMatchingInst(MachineBasicBlock::iterator &MBBI, - unsigned InstOpcode, + bool MoveFromSToA, const DestSourcePair &RegPair) { MachineBasicBlock::iterator E = MBBI->getParent()->end(); @@ -181,7 +176,7 @@ RISCVMoveMerge::findMatchingInst(MachineBasicBlock::iterator &MBBI, Register SourceReg = SecondPair->Source->getReg(); Register DestReg = SecondPair->Destination->getReg(); - if (isMoveFromAToS(InstOpcode) && isCandidateToMergeMVA01S(*SecondPair)) { + if (MoveFromSToA && isCandidateToMergeMVA01S(*SecondPair)) { // If register pair is valid and destination registers are different. if ((RegPair.Destination->getReg() == DestReg)) return E; @@ -195,8 +190,7 @@ RISCVMoveMerge::findMatchingInst(MachineBasicBlock::iterator &MBBI, return E; return I; - } else if (isMoveFromSToA(InstOpcode) && - isCandidateToMergeMVSA01(*SecondPair)) { + } else if (!MoveFromSToA && isCandidateToMergeMVSA01(*SecondPair)) { if ((RegPair.Source->getReg() == SourceReg) || (RegPair.Destination->getReg() == DestReg)) return E; @@ -217,8 +211,7 @@ RISCVMoveMerge::findMatchingInst(MachineBasicBlock::iterator &MBBI, // Finds instructions, which could be represented as C.MV instructions and // merged into CM.MVA01S or CM.MVSA01. -bool RISCVMoveMerge::mergeMoveSARegPair(const RISCVSubtarget &STI, - MachineBasicBlock &MBB) { +bool RISCVMoveMerge::mergeMoveSARegPair(MachineBasicBlock &MBB) { bool Modified = false; for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); @@ -227,22 +220,17 @@ bool RISCVMoveMerge::mergeMoveSARegPair(const RISCVSubtarget &STI, // can, return Dest/Src register pair. auto RegPair = TII->isCopyInstrImpl(*MBBI); if (RegPair.has_value()) { - unsigned Opcode = 0; - - if (isCandidateToMergeMVA01S(*RegPair)) - Opcode = getMoveFromAToSOpcode(STI); - else if (isCandidateToMergeMVSA01(*RegPair)) - Opcode = getMoveFromSToAOpcode(STI); - else { + bool MoveFromSToA = isCandidateToMergeMVA01S(*RegPair); + if (!MoveFromSToA && !isCandidateToMergeMVSA01(*RegPair)) { ++MBBI; continue; } MachineBasicBlock::iterator Paired = - findMatchingInst(MBBI, Opcode, RegPair.value()); + findMatchingInst(MBBI, MoveFromSToA, RegPair.value()); // If matching instruction can be found merge them. 
if (Paired != E) { - MBBI = mergePairedInsns(MBBI, Paired, Opcode); + MBBI = mergePairedInsns(MBBI, Paired, MoveFromSToA); Modified = true; continue; } @@ -256,12 +244,12 @@ bool RISCVMoveMerge::runOnMachineFunction(MachineFunction &Fn) { if (skipFunction(Fn.getFunction())) return false; - const RISCVSubtarget *Subtarget = &Fn.getSubtarget<RISCVSubtarget>(); - if (!(Subtarget->hasStdExtZcmp() || Subtarget->hasVendorXqccmp())) + ST = &Fn.getSubtarget<RISCVSubtarget>(); + if (!ST->hasStdExtZcmp() && !ST->hasVendorXqccmp()) return false; - TII = Subtarget->getInstrInfo(); - TRI = Subtarget->getRegisterInfo(); + TII = ST->getInstrInfo(); + TRI = ST->getRegisterInfo(); // Resize the modified and used register unit trackers. We do this once // per function and then clear the register units each time we optimize a // move. @@ -269,7 +257,7 @@ bool RISCVMoveMerge::runOnMachineFunction(MachineFunction &Fn) { UsedRegUnits.init(*TRI); bool Modified = false; for (auto &MBB : Fn) - Modified |= mergeMoveSARegPair(*Subtarget, MBB); + Modified |= mergeMoveSARegPair(MBB); return Modified; } diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td index 31d2b3a..f89d94f 100644 --- a/llvm/lib/Target/RISCV/RISCVProcessors.td +++ b/llvm/lib/Target/RISCV/RISCVProcessors.td @@ -673,6 +673,7 @@ def SPACEMIT_X60 : RISCVProcessorModel<"spacemit-x60", FeatureStdExtZvfh, FeatureStdExtZvkt, FeatureStdExtZvl256b, + FeatureVendorXSMTVDot, FeatureUnalignedScalarMem]), [TuneDLenFactor2, TuneOptimizedNF2SegmentLoadStore, diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp index 7e58b6f..8a3c8e2 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp @@ -589,7 +589,7 @@ bool RISCVRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, (Lo12 & 0b11111) != 0) { // Prefetch instructions require the offset to be 32 byte aligned. MI.getOperand(FIOperandNum + 1).ChangeToImmediate(0); - } else if (Opc == RISCV::MIPS_PREFETCH && !isUInt<9>(Val)) { + } else if (Opc == RISCV::MIPS_PREF && !isUInt<9>(Val)) { // MIPS Prefetch instructions require the offset to be 9 bits encoded. 
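    // (That is, an offset that does not fit MIPS_PREF's unsigned 9-bit uimm9
    // field is left to the base-address computation and the immediate is
    // reset to 0, mirroring the Zicbop prefetch case above.)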
MI.getOperand(FIOperandNum + 1).ChangeToImmediate(0); } else if ((Opc == RISCV::PseudoRV32ZdinxLD || diff --git a/llvm/lib/Target/RISCV/RISCVSchedAndes45.td b/llvm/lib/Target/RISCV/RISCVSchedAndes45.td index 5ef858a..8cf15fa 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedAndes45.td +++ b/llvm/lib/Target/RISCV/RISCVSchedAndes45.td @@ -24,7 +24,7 @@ let SchedModel = Andes45Model in { //===----------------------------------------------------------------------===// // Andes 45 series CPU -// - 2 Interger Arithmetic and Logical Units (ALU) +// - 2 Integer Arithmetic and Logical Units (ALU) // - Multiply / Divide Unit (MDU) // - Load Store Unit (LSU) // - Control and Status Register Unit (CSR) diff --git a/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td b/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td index 5541506..24ebbc3 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td +++ b/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td @@ -524,16 +524,33 @@ foreach mx = SchedMxListW in { foreach mx = SchedMxList in { defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c; - defm "" : LMULWriteResMX<"WriteVSALUV", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVSALUX", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVSALUI", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVAALUV", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVAALUX", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVSMulV", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVSMulX", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVSShiftV", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVSShiftX", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVSShiftI", [SMX60_VIEU], mx, IsWorstCase>; + let Latency = Get4458Latency<mx>.c, ReleaseAtCycles = [ConstOneUntilM1ThenDouble<mx>.c] in { + defm "" : LMULWriteResMX<"WriteVSALUV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSALUX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSALUI", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVAALUV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVAALUX", [SMX60_VIEU], mx, IsWorstCase>; + } + + // Latency of vsmul: e8/e16 = 4/4/5/8, e32 = 5/5/5/8, e64 = 7/8/16/32 + // We use the worst-case until we can split the SEW. + defvar VSMulLat = ConstValueUntilLMULThenDoubleBase<"M2", 7, 8, mx>.c; + // Latency of vsmul: e8/e16/e32 = 1/2/4/8, e64 = 4/8/16/32 + // We use the worst-case until we can split the SEW. + defvar VSMulOcc = ConstValueUntilLMULThenDoubleBase<"M1", 1, 4, mx>.c; + // TODO: change WriteVSMulV/X to be defined with LMULSEWSchedWrites + let Latency = VSMulLat, ReleaseAtCycles = [VSMulOcc] in { + defm "" : LMULWriteResMX<"WriteVSMulV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSMulX", [SMX60_VIEU], mx, IsWorstCase>; + } + + defvar VSShiftLat = ConstValueUntilLMULThenDouble<"M2", 4, mx>.c; + defvar VSShiftOcc = ConstOneUntilMF2ThenDouble<mx>.c; + let Latency = VSShiftLat, ReleaseAtCycles = [VSShiftOcc] in { + defm "" : LMULWriteResMX<"WriteVSShiftV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSShiftX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSShiftI", [SMX60_VIEU], mx, IsWorstCase>; + } } // 13. 
Vector Floating-Point Instructions diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp index 66ce134..c70571c 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp @@ -38,7 +38,6 @@ #include "llvm/Target/TargetOptions.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Vectorize/EVLIndVarSimplify.h" #include "llvm/Transforms/Vectorize/LoopIdiomVectorize.h" #include <optional> using namespace llvm; diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index 67f924a..c707fb1 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -1431,7 +1431,7 @@ RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, case Intrinsic::ctlz: case Intrinsic::ctpop: { auto LT = getTypeLegalizationCost(RetTy); - if (ST->hasVInstructions() && ST->hasStdExtZvbb() && LT.second.isVector()) { + if (ST->hasStdExtZvbb() && LT.second.isVector()) { unsigned Op; switch (ICA.getID()) { case Intrinsic::cttz: @@ -1629,6 +1629,7 @@ InstructionCost RISCVTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, // scalarized if the legalized Src and Dst are not equal sized. const DataLayout &DL = this->getDataLayout(); if (!SrcLT.second.isVector() || !DstLT.second.isVector() || + !SrcLT.first.isValid() || !DstLT.first.isValid() || !TypeSize::isKnownLE(DL.getTypeSizeInBits(Src), SrcLT.second.getSizeInBits()) || !TypeSize::isKnownLE(DL.getTypeSizeInBits(Dst), @@ -2414,6 +2415,24 @@ InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, return BaseCost + SlideCost; } +InstructionCost +RISCVTTIImpl::getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, + TTI::TargetCostKind CostKind, + unsigned Index) const { + if (isa<FixedVectorType>(Val)) + return BaseT::getIndexedVectorInstrCostFromEnd(Opcode, Val, CostKind, + Index); + + // TODO: This code replicates what LoopVectorize.cpp used to do when asking + // for the cost of extracting the last lane of a scalable vector. It probably + // needs a more accurate cost. + ElementCount EC = cast<VectorType>(Val)->getElementCount(); + assert(Index < EC.getKnownMinValue() && "Unexpected reverse index"); + return getVectorInstrCost(Opcode, Val, CostKind, + EC.getKnownMinValue() - 1 - Index, nullptr, + nullptr); +} + InstructionCost RISCVTTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info, diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h index 05d504c..b632f25 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -114,6 +114,9 @@ public: bool enableScalableVectorization() const override { return ST->hasVInstructions(); } + bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const override { + return ST->hasVInstructions(); + } TailFoldingStyle getPreferredTailFoldingStyle(bool IVUpdateMayOverflow) const override { return ST->hasVInstructions() ? 
TailFoldingStyle::DataWithEVL @@ -240,6 +243,11 @@ public: unsigned Index, const Value *Op0, const Value *Op1) const override; + InstructionCost + getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, + TTI::TargetCostKind CostKind, + unsigned Index) const override; + InstructionCost getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info = {TTI::OK_AnyValue, TTI::OP_None}, diff --git a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp index 37a71e8..f973e75 100644 --- a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp +++ b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp @@ -491,8 +491,42 @@ getOperandLog2EEW(const MachineOperand &MO, const MachineRegisterInfo *MRI) { // vfirst find-first-set mask bit case RISCV::VCPOP_M: case RISCV::VFIRST_M: + // Vector Bit-manipulation Instructions (Zvbb) + // Vector And-Not + case RISCV::VANDN_VV: + case RISCV::VANDN_VX: + // Vector Reverse Bits in Elements + case RISCV::VBREV_V: + // Vector Reverse Bits in Bytes + case RISCV::VBREV8_V: + // Vector Reverse Bytes + case RISCV::VREV8_V: + // Vector Count Leading Zeros + case RISCV::VCLZ_V: + // Vector Count Trailing Zeros + case RISCV::VCTZ_V: + // Vector Population Count + case RISCV::VCPOP_V: + // Vector Rotate Left + case RISCV::VROL_VV: + case RISCV::VROL_VX: + // Vector Rotate Right + case RISCV::VROR_VI: + case RISCV::VROR_VV: + case RISCV::VROR_VX: + // Vector Carry-less Multiplication Instructions (Zvbc) + // Vector Carry-less Multiply + case RISCV::VCLMUL_VV: + case RISCV::VCLMUL_VX: + // Vector Carry-less Multiply Return High Half + case RISCV::VCLMULH_VV: + case RISCV::VCLMULH_VX: return MILog2SEW; + // Vector Widening Shift Left Logical (Zvbb) + case RISCV::VWSLL_VI: + case RISCV::VWSLL_VX: + case RISCV::VWSLL_VV: // Vector Widening Integer Add/Subtract // Def uses EEW=2*SEW . Operands use EEW=SEW. case RISCV::VWADDU_VV: @@ -503,9 +537,6 @@ getOperandLog2EEW(const MachineOperand &MO, const MachineRegisterInfo *MRI) { case RISCV::VWADD_VX: case RISCV::VWSUB_VV: case RISCV::VWSUB_VX: - case RISCV::VWSLL_VI: - case RISCV::VWSLL_VX: - case RISCV::VWSLL_VV: // Vector Widening Integer Multiply Instructions // Destination EEW=2*SEW. Source EEW=SEW. 
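  // (Orientation: with SEW=e16, vwsll.vv and vwmul.vv both write EEW=32
  // destinations from EEW=16 sources, which is why the VWSLL_* cases added
  // above sit in this widening group rather than with the SEW-preserving
  // Zvbb/Zvbc cases that simply return MILog2SEW.)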
case RISCV::VWMUL_VV: @@ -1020,12 +1051,40 @@ static bool isSupportedInstr(const MachineInstr &MI) { case RISCV::VNCLIP_WV: case RISCV::VNCLIP_WX: case RISCV::VNCLIP_WI: - - // Vector Crypto + // Vector Bit-manipulation Instructions (Zvbb) + // Vector And-Not + case RISCV::VANDN_VV: + case RISCV::VANDN_VX: + // Vector Reverse Bits in Elements + case RISCV::VBREV_V: + // Vector Reverse Bits in Bytes + case RISCV::VBREV8_V: + // Vector Reverse Bytes + case RISCV::VREV8_V: + // Vector Count Leading Zeros + case RISCV::VCLZ_V: + // Vector Count Trailing Zeros + case RISCV::VCTZ_V: + // Vector Population Count + case RISCV::VCPOP_V: + // Vector Rotate Left + case RISCV::VROL_VV: + case RISCV::VROL_VX: + // Vector Rotate Right + case RISCV::VROR_VI: + case RISCV::VROR_VV: + case RISCV::VROR_VX: + // Vector Widening Shift Left Logical case RISCV::VWSLL_VI: case RISCV::VWSLL_VX: case RISCV::VWSLL_VV: - + // Vector Carry-less Multiplication Instructions (Zvbc) + // Vector Carry-less Multiply + case RISCV::VCLMUL_VV: + case RISCV::VCLMUL_VX: + // Vector Carry-less Multiply Return High Half + case RISCV::VCLMULH_VV: + case RISCV::VCLMULH_VX: // Vector Mask Instructions // Vector Mask-Register Logical Instructions // vmsbf.m set-before-first mask bit @@ -1213,34 +1272,6 @@ static bool isVectorOpUsedAsScalarOp(const MachineOperand &MO) { } } -/// Return true if MI may read elements past VL. -static bool mayReadPastVL(const MachineInstr &MI) { - const RISCVVPseudosTable::PseudoInfo *RVV = - RISCVVPseudosTable::getPseudoInfo(MI.getOpcode()); - if (!RVV) - return true; - - switch (RVV->BaseInstr) { - // vslidedown instructions may read elements past VL. They are handled - // according to current tail policy. - case RISCV::VSLIDEDOWN_VI: - case RISCV::VSLIDEDOWN_VX: - case RISCV::VSLIDE1DOWN_VX: - case RISCV::VFSLIDE1DOWN_VF: - - // vrgather instructions may read the source vector at any index < VLMAX, - // regardless of VL. 
- case RISCV::VRGATHER_VI: - case RISCV::VRGATHER_VV: - case RISCV::VRGATHER_VX: - case RISCV::VRGATHEREI16_VV: - return true; - - default: - return false; - } -} - bool RISCVVLOptimizer::isCandidate(const MachineInstr &MI) const { const MCInstrDesc &Desc = MI.getDesc(); if (!RISCVII::hasVLOp(Desc.TSFlags) || !RISCVII::hasSEWOp(Desc.TSFlags)) @@ -1301,7 +1332,8 @@ RISCVVLOptimizer::getMinimumVLForUser(const MachineOperand &UserOp) const { return std::nullopt; } - if (mayReadPastVL(UserMI)) { + if (RISCVII::readsPastVL( + TII->get(RISCV::getRVVMCOpcode(UserMI.getOpcode())).TSFlags)) { LLVM_DEBUG(dbgs() << " Abort because used by unsafe instruction\n"); return std::nullopt; } diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.cpp b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.cpp index 0ed97f5..d6b6079 100644 --- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.cpp +++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.cpp @@ -38,8 +38,15 @@ struct CapabilityEntry { Capability::Capability ReqCapability; }; +struct EnvironmentEntry { + OperandCategory::OperandCategory Category; + uint32_t Value; + Environment::Environment AllowedEnvironment; +}; + using namespace OperandCategory; using namespace Extension; +using namespace Environment; using namespace Capability; using namespace InstructionSet; #define GET_SymbolicOperands_DECL @@ -48,6 +55,8 @@ using namespace InstructionSet; #define GET_ExtensionEntries_IMPL #define GET_CapabilityEntries_DECL #define GET_CapabilityEntries_IMPL +#define GET_EnvironmentEntries_DECL +#define GET_EnvironmentEntries_IMPL #define GET_ExtendedBuiltins_DECL #define GET_ExtendedBuiltins_IMPL #include "SPIRVGenTables.inc" @@ -133,6 +142,23 @@ getSymbolicOperandCapabilities(SPIRV::OperandCategory::OperandCategory Category, return Capabilities; } +EnvironmentList getSymbolicOperandAllowedEnvironments( + SPIRV::OperandCategory::OperandCategory Category, uint32_t Value) { + EnvironmentList Environments; + const SPIRV::EnvironmentEntry *Environment = + SPIRV::lookupEnvironmentByCategoryAndValue(Category, Value); + auto TableEnd = ArrayRef(SPIRV::EnvironmentEntries).end(); + while (Environment && Environment->Category == Category && + Environment->Value == Value) { + Environments.push_back(static_cast<SPIRV::Environment::Environment>( + Environment->AllowedEnvironment)); + if (++Environment == TableEnd) + break; + } + + return Environments; +} + CapabilityList getCapabilitiesEnabledByExtension(SPIRV::Extension::Extension Extension) { const SPIRV::ExtensionEntry *Entry = diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h index b8c467f..c2c08f8 100644 --- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h +++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h @@ -37,6 +37,11 @@ namespace Capability { #include "SPIRVGenTables.inc" } // namespace Capability +namespace Environment { +#define GET_Environment_DECL +#include "SPIRVGenTables.inc" +} // namespace Environment + namespace SourceLanguage { #define GET_SourceLanguage_DECL #include "SPIRVGenTables.inc" @@ -241,6 +246,7 @@ enum InstFlags { using CapabilityList = SmallVector<SPIRV::Capability::Capability, 8>; using ExtensionList = SmallVector<SPIRV::Extension::Extension, 8>; +using EnvironmentList = SmallVector<SPIRV::Environment::Environment, 8>; std::string getSymbolicOperandMnemonic(SPIRV::OperandCategory::OperandCategory Category, @@ -254,6 +260,8 @@ getSymbolicOperandMaxVersion(SPIRV::OperandCategory::OperandCategory Category, 
CapabilityList getSymbolicOperandCapabilities(SPIRV::OperandCategory::OperandCategory Category, uint32_t Value); +EnvironmentList getSymbolicOperandAllowedEnvironments( + SPIRV::OperandCategory::OperandCategory Category, uint32_t Value); CapabilityList getCapabilitiesEnabledByExtension(SPIRV::Extension::Extension Extension); ExtensionList diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp index 4ec31bf..1e3f7fc 100644 --- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp +++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp @@ -375,9 +375,17 @@ void SPIRVInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, const MCOperand &Op = MI->getOperand(OpNo); if (Op.isReg()) O << '%' << (getIDFromRegister(Op.getReg().id()) + 1); - else if (Op.isImm()) - O << formatImm(Op.getImm()); - else if (Op.isDFPImm()) + else if (Op.isImm()) { + int64_t Imm = Op.getImm(); + // For OpVectorShuffle: + // A Component literal may also be FFFFFFFF, which means the corresponding + // result component has no source and is undefined. + // LLVM representation of poison/undef becomes -1 when lowered to MI. + if (MI->getOpcode() == SPIRV::OpVectorShuffle && Imm == -1) + O << "0xFFFFFFFF"; + else + O << formatImm(Imm); + } else if (Op.isDFPImm()) O << formatImm((double)Op.getDFPImm()); else if (Op.isExpr()) MAI.printExpr(O, *Op.getExpr()); diff --git a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp index d9265f4..5a5860a 100644 --- a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp @@ -12,7 +12,8 @@ //===----------------------------------------------------------------------===// #include "SPIRVCommandLine.h" -#include "llvm/ADT/StringRef.h" +#include "MCTargetDesc/SPIRVBaseInfo.h" +#include "llvm/TargetParser/Triple.h" #include <algorithm> #include <map> @@ -171,3 +172,23 @@ StringRef SPIRVExtensionsParser::checkExtensions( } return StringRef(); } + +std::set<SPIRV::Extension::Extension> +SPIRVExtensionsParser::getValidExtensions(const Triple &TT) { + std::set<SPIRV::Extension::Extension> R; + SPIRV::Environment::Environment CurrentEnvironment = + SPIRV::Environment::Environment::EnvOpenCL; + if (TT.getOS() == Triple::Vulkan) + CurrentEnvironment = SPIRV::Environment::Environment::EnvVulkan; + + for (const auto &[ExtensionName, ExtensionEnum] : SPIRVExtensionMap) { + EnvironmentList AllowedEnv = getSymbolicOperandAllowedEnvironments( + SPIRV::OperandCategory::OperandCategory::ExtensionOperand, + ExtensionEnum); + + if (std::count(AllowedEnv.begin(), AllowedEnv.end(), CurrentEnvironment)) + R.insert(ExtensionEnum); + } + + return R; +} diff --git a/llvm/lib/Target/SPIRV/SPIRVCommandLine.h b/llvm/lib/Target/SPIRV/SPIRVCommandLine.h index 3e3b22b..02e847b3 100644 --- a/llvm/lib/Target/SPIRV/SPIRVCommandLine.h +++ b/llvm/lib/Target/SPIRV/SPIRVCommandLine.h @@ -21,6 +21,7 @@ namespace llvm { class StringRef; +class Triple; /// Command line parser for toggling SPIR-V extensions. struct SPIRVExtensionsParser @@ -42,6 +43,11 @@ public: static StringRef checkExtensions(const std::vector<std::string> &ExtNames, std::set<SPIRV::Extension::Extension> &AllowedExtensions); + + /// Returns the list of extensions that are valid for a particular + /// target environment (i.e., OpenCL or Vulkan). 
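  /// For example (per the SPIRVCommandLine.cpp implementation earlier in this
  /// patch), a triple whose OS component is Vulkan maps to EnvVulkan and drops
  /// OpenCL-only extensions such as SPV_INTEL_subgroups, while all other
  /// triples currently default to EnvOpenCL.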
+ static std::set<SPIRV::Extension::Extension> + getValidExtensions(const Triple &TT); }; } // namespace llvm diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index 5259db1..98c7709 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -220,8 +220,10 @@ private: bool selectConst(Register ResVReg, const SPIRVType *ResType, MachineInstr &I) const; - bool selectSelect(Register ResVReg, const SPIRVType *ResType, MachineInstr &I, - bool IsSigned) const; + bool selectSelect(Register ResVReg, const SPIRVType *ResType, + MachineInstr &I) const; + bool selectSelectDefaultArgs(Register ResVReg, const SPIRVType *ResType, + MachineInstr &I, bool IsSigned) const; bool selectIToF(Register ResVReg, const SPIRVType *ResType, MachineInstr &I, bool IsSigned, unsigned Opcode) const; bool selectExt(Register ResVReg, const SPIRVType *ResType, MachineInstr &I, @@ -510,7 +512,18 @@ bool SPIRVInstructionSelector::select(MachineInstr &I) { if (isTypeFoldingSupported(Def->getOpcode()) && Def->getOpcode() != TargetOpcode::G_CONSTANT && Def->getOpcode() != TargetOpcode::G_FCONSTANT) { - bool Res = selectImpl(I, *CoverageInfo); + bool Res = false; + if (Def->getOpcode() == TargetOpcode::G_SELECT) { + Register SelectDstReg = Def->getOperand(0).getReg(); + Res = selectSelect(SelectDstReg, GR.getSPIRVTypeForVReg(SelectDstReg), + *Def); + GR.invalidateMachineInstr(Def); + Def->removeFromParent(); + MRI->replaceRegWith(DstReg, SelectDstReg); + GR.invalidateMachineInstr(&I); + I.removeFromParent(); + } else + Res = selectImpl(I, *CoverageInfo); LLVM_DEBUG({ if (!Res && Def->getOpcode() != TargetOpcode::G_CONSTANT) { dbgs() << "Unexpected pattern in ASSIGN_TYPE.\nInstruction: "; @@ -2565,8 +2578,52 @@ Register SPIRVInstructionSelector::buildOnesVal(bool AllOnes, bool SPIRVInstructionSelector::selectSelect(Register ResVReg, const SPIRVType *ResType, - MachineInstr &I, - bool IsSigned) const { + MachineInstr &I) const { + Register SelectFirstArg = I.getOperand(2).getReg(); + Register SelectSecondArg = I.getOperand(3).getReg(); + assert(ResType == GR.getSPIRVTypeForVReg(SelectFirstArg) && + ResType == GR.getSPIRVTypeForVReg(SelectSecondArg)); + + bool IsFloatTy = + GR.isScalarOrVectorOfType(SelectFirstArg, SPIRV::OpTypeFloat); + bool IsPtrTy = + GR.isScalarOrVectorOfType(SelectFirstArg, SPIRV::OpTypePointer); + bool IsVectorTy = GR.getSPIRVTypeForVReg(SelectFirstArg)->getOpcode() == + SPIRV::OpTypeVector; + + bool IsScalarBool = + GR.isScalarOfType(I.getOperand(1).getReg(), SPIRV::OpTypeBool); + unsigned Opcode; + if (IsVectorTy) { + if (IsFloatTy) { + Opcode = IsScalarBool ? SPIRV::OpSelectVFSCond : SPIRV::OpSelectVFVCond; + } else if (IsPtrTy) { + Opcode = IsScalarBool ? SPIRV::OpSelectVPSCond : SPIRV::OpSelectVPVCond; + } else { + Opcode = IsScalarBool ? SPIRV::OpSelectVISCond : SPIRV::OpSelectVIVCond; + } + } else { + if (IsFloatTy) { + Opcode = IsScalarBool ? SPIRV::OpSelectSFSCond : SPIRV::OpSelectVFVCond; + } else if (IsPtrTy) { + Opcode = IsScalarBool ? SPIRV::OpSelectSPSCond : SPIRV::OpSelectVPVCond; + } else { + Opcode = IsScalarBool ? 
SPIRV::OpSelectSISCond : SPIRV::OpSelectVIVCond; + } + } + return BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(Opcode)) + .addDef(ResVReg) + .addUse(GR.getSPIRVTypeID(ResType)) + .addUse(I.getOperand(1).getReg()) + .addUse(SelectFirstArg) + .addUse(SelectSecondArg) + .constrainAllUses(TII, TRI, RBI); +} + +bool SPIRVInstructionSelector::selectSelectDefaultArgs(Register ResVReg, + const SPIRVType *ResType, + MachineInstr &I, + bool IsSigned) const { // To extend a bool, we need to use OpSelect between constants. Register ZeroReg = buildZerosVal(ResType, I); Register OneReg = buildOnesVal(IsSigned, ResType, I); @@ -2598,7 +2655,7 @@ bool SPIRVInstructionSelector::selectIToF(Register ResVReg, TmpType = GR.getOrCreateSPIRVVectorType(TmpType, NumElts, I, TII); } SrcReg = createVirtualRegister(TmpType, &GR, MRI, MRI->getMF()); - selectSelect(SrcReg, TmpType, I, false); + selectSelectDefaultArgs(SrcReg, TmpType, I, false); } return selectOpWithSrcs(ResVReg, ResType, I, {SrcReg}, Opcode); } @@ -2608,7 +2665,7 @@ bool SPIRVInstructionSelector::selectExt(Register ResVReg, MachineInstr &I, bool IsSigned) const { Register SrcReg = I.getOperand(1).getReg(); if (GR.isScalarOrVectorOfType(SrcReg, SPIRV::OpTypeBool)) - return selectSelect(ResVReg, ResType, I, IsSigned); + return selectSelectDefaultArgs(ResVReg, ResType, I, IsSigned); SPIRVType *SrcType = GR.getSPIRVTypeForVReg(SrcReg); if (SrcType == ResType) diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizeImplicitBinding.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizeImplicitBinding.cpp index 0398e52..aea3397 100644 --- a/llvm/lib/Target/SPIRV/SPIRVLegalizeImplicitBinding.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVLegalizeImplicitBinding.cpp @@ -15,7 +15,6 @@ #include "SPIRV.h" #include "llvm/ADT/BitVector.h" -#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstVisitor.h" diff --git a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp index b62db7f..1a08c6a 100644 --- a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp @@ -441,13 +441,10 @@ void insertAssignInstr(Register Reg, Type *Ty, SPIRVType *SpvType, // Tablegen definition assumes SPIRV::ASSIGN_TYPE pseudo-instruction is // present after each auto-folded instruction to take a type reference from. Register NewReg = MRI.createGenericVirtualRegister(MRI.getType(Reg)); - if (auto *RC = MRI.getRegClassOrNull(Reg)) { - MRI.setRegClass(NewReg, RC); - } else { - auto RegClass = GR->getRegClass(SpvType); - MRI.setRegClass(NewReg, RegClass); - MRI.setRegClass(Reg, RegClass); - } + const auto *RegClass = GR->getRegClass(SpvType); + MRI.setRegClass(NewReg, RegClass); + MRI.setRegClass(Reg, RegClass); + GR->assignSPIRVTypeToVReg(SpvType, Reg, MIB.getMF()); // This is to make it convenient for Legalizer to get the SPIRVType // when processing the actual MI (i.e. not pseudo one). 
diff --git a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp index 74aec4f..2b34f61 100644 --- a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp @@ -359,18 +359,15 @@ static void lowerExpectAssume(IntrinsicInst *II) { } } -static bool toSpvOverloadedIntrinsic(IntrinsicInst *II, Intrinsic::ID NewID, - ArrayRef<unsigned> OpNos) { - Function *F = nullptr; - if (OpNos.empty()) { - F = Intrinsic::getOrInsertDeclaration(II->getModule(), NewID); - } else { - SmallVector<Type *, 4> Tys; - for (unsigned OpNo : OpNos) - Tys.push_back(II->getOperand(OpNo)->getType()); - F = Intrinsic::getOrInsertDeclaration(II->getModule(), NewID, Tys); - } - II->setCalledFunction(F); +static bool toSpvLifetimeIntrinsic(IntrinsicInst *II, Intrinsic::ID NewID) { + IRBuilder<> Builder(II); + auto *Alloca = cast<AllocaInst>(II->getArgOperand(0)); + std::optional<TypeSize> Size = + Alloca->getAllocationSize(Alloca->getDataLayout()); + Value *SizeVal = Builder.getInt64(Size ? *Size : -1); + Builder.CreateIntrinsic(NewID, Alloca->getType(), + {SizeVal, II->getArgOperand(0)}); + II->eraseFromParent(); return true; } @@ -406,8 +403,8 @@ bool SPIRVPrepareFunctions::substituteIntrinsicCalls(Function *F) { break; case Intrinsic::lifetime_start: if (!STI.isShader()) { - Changed |= toSpvOverloadedIntrinsic( - II, Intrinsic::SPVIntrinsics::spv_lifetime_start, {1}); + Changed |= toSpvLifetimeIntrinsic( + II, Intrinsic::SPVIntrinsics::spv_lifetime_start); } else { II->eraseFromParent(); Changed = true; @@ -415,8 +412,8 @@ bool SPIRVPrepareFunctions::substituteIntrinsicCalls(Function *F) { break; case Intrinsic::lifetime_end: if (!STI.isShader()) { - Changed |= toSpvOverloadedIntrinsic( - II, Intrinsic::SPVIntrinsics::spv_lifetime_end, {1}); + Changed |= toSpvLifetimeIntrinsic( + II, Intrinsic::SPVIntrinsics::spv_lifetime_end); } else { II->eraseFromParent(); Changed = true; diff --git a/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp b/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp index cdf3c62..690493fb 100644 --- a/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp @@ -166,7 +166,13 @@ void SPIRVSubtarget::initAvailableExtInstSets() { void SPIRVSubtarget::initAvailableExtensions( const std::set<SPIRV::Extension::Extension> &AllowedExtIds) { AvailableExtensions.clear(); - AvailableExtensions.insert_range(AllowedExtIds); + const std::set<SPIRV::Extension::Extension> &ValidExtensions = + SPIRVExtensionsParser::getValidExtensions(TargetTriple); + + for (const auto &Ext : AllowedExtIds) { + if (ValidExtensions.count(Ext)) + AvailableExtensions.insert(Ext); + } accountForAMDShaderTrinaryMinmax(); } diff --git a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td index 614e83a..d2824ee 100644 --- a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td +++ b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td @@ -110,22 +110,58 @@ def CapabilityEntries : GenericTable { } //===----------------------------------------------------------------------===// +// Lookup table for matching symbolic operands (category + 32-bit value) to +// SPIR-V environments. If an operand is allows in more than one environment, +// there will be multiple consecutive entries present in the table. 
+//===----------------------------------------------------------------------===// + +// Forward-declare classes used in ExtensionEntry +class Environment; + +class EnvironmentEntry<OperandCategory category, bits<32> value, + Environment allowedEnvironment> { + OperandCategory Category = category; + bits<32> Value = value; + Environment AllowedEnvironment = allowedEnvironment; +} + +def EnvironmentEntries : GenericTable { + let FilterClass = "EnvironmentEntry"; + let Fields = ["Category", "Value", "AllowedEnvironment"]; + string TypeOf_Category = "OperandCategory"; + string TypeOf_AllowedEnvironment = "Environment"; + let PrimaryKey = ["Category", "Value"]; + // Function for looking up a (the first) environment by category + value. Next + // environment should be consecutive. + let PrimaryKeyName = "lookupEnvironmentByCategoryAndValue"; +} + +//===----------------------------------------------------------------------===// // Multiclass used to define a SymbolicOperand and at the same time declare // required extension and capabilities. //===----------------------------------------------------------------------===// -multiclass SymbolicOperandWithRequirements<OperandCategory category, bits<32> value, string mnemonic, bits<32> minVersion, bits<32> maxVersion, list<Extension> reqExtensions, list<Capability> reqCapabilities> { - assert !ge(!size(mnemonic), 1), "No mnemonic/string representation provided for symbolic operand with value " # value; - def : SymbolicOperand<category, value, mnemonic, minVersion, maxVersion>; +multiclass SymbolicOperandWithRequirements< + OperandCategory category, bits<32> value, string mnemonic, + bits<32> minVersion, bits<32> maxVersion, list<Extension> reqExtensions, + list<Capability> reqCapabilities, list<Environment> allowedEnvironments> { + assert !ge(!size(mnemonic), 1), "No mnemonic/string representation provided " + "for symbolic operand with value "#value; + def : SymbolicOperand<category, value, mnemonic, minVersion, maxVersion>; + + assert !le(!size(reqExtensions), 1), + "Too many required extensions for a symbolic/named operand: "#mnemonic; + if !eq(!size(reqExtensions), 1) then { + def : ExtensionEntry<category, value, reqExtensions[0]>; + } - assert !le(!size(reqExtensions), 1), "Too many required extensions for a symbolic/named operand: " # mnemonic; - if !eq(!size(reqExtensions), 1) then { - def : ExtensionEntry<category, value, reqExtensions[0]>; - } + foreach capability = reqCapabilities in { + def : CapabilityEntry<category, value, capability>; + } - foreach capability = reqCapabilities in { - def : CapabilityEntry<category, value, capability>; - } + foreach environment = allowedEnvironments in { + def : EnvironmentEntry<category, value, environment>; + } } //===----------------------------------------------------------------------===// @@ -176,6 +212,20 @@ def SpecConstantOpOperandsOperand : OperandCategory; def MatrixMultiplyAccumulateOperandsOperand : OperandCategory; //===----------------------------------------------------------------------===// +// Definition of the Environments +//===----------------------------------------------------------------------===// + +def Environment : GenericEnum, Operand<i32> { + let FilterClass = "Environment"; + let ValueField = "Value"; +} + +class Environment<bits<32> value> { bits<32> Value = value; } + +def EnvOpenCL : Environment<0>; +def EnvVulkan : Environment<1>; + +//===----------------------------------------------------------------------===// // Multiclass used to define Extesions enum values and 
at the same time // SymbolicOperand entries. //===----------------------------------------------------------------------===// @@ -192,135 +242,146 @@ class Extension<string name, bits<32> value> { bits<32> Value = value; } -multiclass ExtensionOperand<bits<32> value> { +multiclass ExtensionOperand<bits<32> value, + list<Environment> allowedEnvironments> { def NAME : Extension<NAME, value>; - defm : SymbolicOperandWithRequirements<ExtensionOperand, value, NAME, 0, 0, [], []>; -} - -defm SPV_AMD_shader_explicit_vertex_parameter : ExtensionOperand<1>; -defm SPV_AMD_shader_trinary_minmax_extension : ExtensionOperand<2>; -defm SPV_AMD_gcn_shader : ExtensionOperand<3>; -defm SPV_KHR_shader_ballot : ExtensionOperand<4>; -defm SPV_AMD_shader_ballot : ExtensionOperand<5>; -defm SPV_AMD_gpu_shader_half_float : ExtensionOperand<6>; -defm SPV_KHR_shader_draw_parameters : ExtensionOperand<7>; -defm SPV_KHR_subgroup_vote : ExtensionOperand<8>; -defm SPV_KHR_16bit_storage : ExtensionOperand<9>; -defm SPV_KHR_device_group : ExtensionOperand<10>; -defm SPV_KHR_multiview : ExtensionOperand<11>; -defm SPV_NVX_multiview_per_view_attributes : ExtensionOperand<12>; -defm SPV_NV_viewport_array2 : ExtensionOperand<13>; -defm SPV_NV_stereo_view_rendering : ExtensionOperand<14>; -defm SPV_NV_sample_mask_override_coverage : ExtensionOperand<15>; -defm SPV_NV_geometry_shader_passthrough : ExtensionOperand<16>; -defm SPV_AMD_texture_gather_bias_lod : ExtensionOperand<17>; -defm SPV_KHR_storage_buffer_storage_class : ExtensionOperand<18>; -defm SPV_KHR_variable_pointers : ExtensionOperand<19>; -defm SPV_AMD_gpu_shader_int16 : ExtensionOperand<20>; -defm SPV_KHR_post_depth_coverage : ExtensionOperand<21>; -defm SPV_KHR_shader_atomic_counter_ops : ExtensionOperand<22>; -defm SPV_EXT_shader_stencil_export : ExtensionOperand<23>; -defm SPV_EXT_shader_viewport_index_layer : ExtensionOperand<24>; -defm SPV_AMD_shader_image_load_store_lod : ExtensionOperand<25>; -defm SPV_AMD_shader_fragment_mask : ExtensionOperand<26>; -defm SPV_EXT_fragment_fully_covered : ExtensionOperand<27>; -defm SPV_AMD_gpu_shader_half_float_fetch : ExtensionOperand<28>; -defm SPV_GOOGLE_decorate_string : ExtensionOperand<29>; -defm SPV_GOOGLE_hlsl_functionality1 : ExtensionOperand<30>; -defm SPV_NV_shader_subgroup_partitioned : ExtensionOperand<31>; -defm SPV_EXT_descriptor_indexing : ExtensionOperand<32>; -defm SPV_KHR_8bit_storage : ExtensionOperand<33>; -defm SPV_KHR_vulkan_memory_model : ExtensionOperand<34>; -defm SPV_NV_ray_tracing : ExtensionOperand<35>; -defm SPV_NV_compute_shader_derivatives : ExtensionOperand<36>; -defm SPV_NV_fragment_shader_barycentric : ExtensionOperand<37>; -defm SPV_NV_mesh_shader : ExtensionOperand<38>; -defm SPV_NV_shader_image_footprint : ExtensionOperand<39>; -defm SPV_NV_shading_rate : ExtensionOperand<40>; -defm SPV_INTEL_subgroups : ExtensionOperand<41>; -defm SPV_INTEL_media_block_io : ExtensionOperand<42>; -defm SPV_EXT_fragment_invocation_density : ExtensionOperand<44>; -defm SPV_KHR_no_integer_wrap_decoration : ExtensionOperand<45>; -defm SPV_KHR_float_controls : ExtensionOperand<46>; -defm SPV_EXT_physical_storage_buffer : ExtensionOperand<47>; -defm SPV_INTEL_fpga_memory_attributes : ExtensionOperand<48>; -defm SPV_NV_cooperative_matrix : ExtensionOperand<49>; -defm SPV_INTEL_shader_integer_functions2 : ExtensionOperand<50>; -defm SPV_INTEL_fpga_loop_controls : ExtensionOperand<51>; -defm SPV_EXT_fragment_shader_interlock : ExtensionOperand<52>; -defm SPV_NV_shader_sm_builtins : ExtensionOperand<53>; -defm 
SPV_KHR_shader_clock : ExtensionOperand<54>; -defm SPV_INTEL_unstructured_loop_controls : ExtensionOperand<55>; -defm SPV_EXT_demote_to_helper_invocation : ExtensionOperand<56>; -defm SPV_INTEL_fpga_reg : ExtensionOperand<57>; -defm SPV_INTEL_blocking_pipes : ExtensionOperand<58>; -defm SPV_GOOGLE_user_type : ExtensionOperand<59>; -defm SPV_KHR_physical_storage_buffer : ExtensionOperand<60>; -defm SPV_INTEL_kernel_attributes : ExtensionOperand<61>; -defm SPV_KHR_non_semantic_info : ExtensionOperand<62>; -defm SPV_INTEL_io_pipes : ExtensionOperand<63>; -defm SPV_KHR_ray_tracing : ExtensionOperand<64>; -defm SPV_KHR_ray_query : ExtensionOperand<65>; -defm SPV_INTEL_fpga_memory_accesses : ExtensionOperand<66>; -defm SPV_INTEL_arbitrary_precision_integers : ExtensionOperand<67>; -defm SPV_EXT_shader_atomic_float_add : ExtensionOperand<68>; -defm SPV_KHR_terminate_invocation : ExtensionOperand<69>; -defm SPV_KHR_fragment_shading_rate : ExtensionOperand<70>; -defm SPV_EXT_shader_image_int64 : ExtensionOperand<71>; -defm SPV_INTEL_fp_fast_math_mode : ExtensionOperand<72>; -defm SPV_INTEL_fpga_cluster_attributes : ExtensionOperand<73>; -defm SPV_INTEL_loop_fuse : ExtensionOperand<74>; -defm SPV_EXT_shader_atomic_float_min_max : ExtensionOperand<75>; -defm SPV_KHR_workgroup_memory_explicit_layout : ExtensionOperand<76>; -defm SPV_KHR_linkonce_odr : ExtensionOperand<77>; -defm SPV_KHR_expect_assume : ExtensionOperand<78>; -defm SPV_INTEL_fpga_dsp_control : ExtensionOperand<79>; -defm SPV_NV_bindless_texture : ExtensionOperand<80>; -defm SPV_INTEL_fpga_invocation_pipelining_attributes : ExtensionOperand<81>; -defm SPV_KHR_subgroup_uniform_control_flow : ExtensionOperand<82>; -defm SPV_HUAWEI_subpass_shading : ExtensionOperand<83>; -defm SPV_KHR_integer_dot_product : ExtensionOperand<84>; -defm SPV_EXT_shader_atomic_float16_add : ExtensionOperand<85>; -defm SPV_INTEL_runtime_aligned : ExtensionOperand<86>; -defm SPV_KHR_bit_instructions : ExtensionOperand<87>; -defm SPV_NV_ray_tracing_motion_blur : ExtensionOperand<88>; -defm SPV_KHR_uniform_group_instructions : ExtensionOperand<89>; -defm SPV_KHR_subgroup_rotate : ExtensionOperand<90>; -defm SPV_INTEL_split_barrier : ExtensionOperand<91>; -defm SPV_KHR_ray_cull_mask : ExtensionOperand<92>; -defm SPV_KHR_fragment_shader_barycentric : ExtensionOperand<93>; -defm SPV_EXT_relaxed_printf_string_address_space : ExtensionOperand<94>; -defm SPV_EXT_ycbcr_attachments : ExtensionOperand<95>; -defm SPV_EXT_mesh_shader : ExtensionOperand<96>; -defm SPV_ARM_core_builtins : ExtensionOperand<97>; -defm SPV_EXT_opacity_micromap : ExtensionOperand<98>; -defm SPV_NV_shader_invocation_reorder : ExtensionOperand<99>; -defm SPV_INTEL_usm_storage_classes : ExtensionOperand<100>; -defm SPV_INTEL_fpga_latency_control : ExtensionOperand<101>; -defm SPV_INTEL_fpga_argument_interfaces : ExtensionOperand<102>; -defm SPV_INTEL_optnone : ExtensionOperand<103>; -defm SPV_INTEL_function_pointers : ExtensionOperand<104>; -defm SPV_INTEL_variable_length_array : ExtensionOperand<105>; -defm SPV_INTEL_bfloat16_conversion : ExtensionOperand<106>; -defm SPV_INTEL_inline_assembly : ExtensionOperand<107>; -defm SPV_INTEL_cache_controls : ExtensionOperand<108>; -defm SPV_INTEL_global_variable_host_access : ExtensionOperand<109>; -defm SPV_INTEL_global_variable_fpga_decorations : ExtensionOperand<110>; -defm SPV_KHR_cooperative_matrix : ExtensionOperand<111>; -defm SPV_EXT_arithmetic_fence : ExtensionOperand<112>; -defm SPV_EXT_optnone : ExtensionOperand<113>; -defm SPV_INTEL_joint_matrix : 
ExtensionOperand<114>; -defm SPV_INTEL_float_controls2 : ExtensionOperand<115>; -defm SPV_INTEL_bindless_images : ExtensionOperand<116>; -defm SPV_INTEL_long_composites : ExtensionOperand<117>; -defm SPV_INTEL_memory_access_aliasing : ExtensionOperand<118>; -defm SPV_INTEL_fp_max_error : ExtensionOperand<119>; -defm SPV_INTEL_ternary_bitwise_function : ExtensionOperand<120>; -defm SPV_INTEL_subgroup_matrix_multiply_accumulate : ExtensionOperand<121>; -defm SPV_INTEL_2d_block_io : ExtensionOperand<122>; -defm SPV_INTEL_int4 : ExtensionOperand<123>; -defm SPV_KHR_float_controls2 : ExtensionOperand<124>; -defm SPV_INTEL_tensor_float32_conversion : ExtensionOperand<125>; + defm : SymbolicOperandWithRequirements<ExtensionOperand, value, NAME, 0, + 0, [], [], allowedEnvironments>; +} + +defm SPV_AMD_shader_explicit_vertex_parameter + : ExtensionOperand<1, [EnvVulkan]>; +defm SPV_AMD_shader_trinary_minmax_extension : ExtensionOperand<2, [EnvVulkan]>; +defm SPV_AMD_gcn_shader : ExtensionOperand<3, [EnvVulkan]>; +defm SPV_KHR_shader_ballot : ExtensionOperand<4, [EnvVulkan]>; +defm SPV_AMD_shader_ballot : ExtensionOperand<5, [EnvVulkan]>; +defm SPV_AMD_gpu_shader_half_float : ExtensionOperand<6, [EnvVulkan]>; +defm SPV_KHR_shader_draw_parameters : ExtensionOperand<7, [EnvVulkan]>; +defm SPV_KHR_subgroup_vote : ExtensionOperand<8, [EnvVulkan]>; +defm SPV_KHR_16bit_storage : ExtensionOperand<9, [EnvVulkan]>; +defm SPV_KHR_device_group : ExtensionOperand<10, [EnvVulkan]>; +defm SPV_KHR_multiview : ExtensionOperand<11, [EnvVulkan]>; +defm SPV_NVX_multiview_per_view_attributes : ExtensionOperand<12, [EnvVulkan]>; +defm SPV_NV_viewport_array2 : ExtensionOperand<13, [EnvVulkan]>; +defm SPV_NV_stereo_view_rendering : ExtensionOperand<14, [EnvVulkan]>; +defm SPV_NV_sample_mask_override_coverage : ExtensionOperand<15, [EnvVulkan]>; +defm SPV_NV_geometry_shader_passthrough : ExtensionOperand<16, [EnvVulkan]>; +defm SPV_AMD_texture_gather_bias_lod : ExtensionOperand<17, [EnvVulkan]>; +defm SPV_KHR_storage_buffer_storage_class : ExtensionOperand<18, [EnvVulkan]>; +defm SPV_KHR_variable_pointers : ExtensionOperand<19, [EnvVulkan]>; +defm SPV_AMD_gpu_shader_int16 : ExtensionOperand<20, [EnvVulkan]>; +defm SPV_KHR_post_depth_coverage : ExtensionOperand<21, [EnvVulkan]>; +defm SPV_KHR_shader_atomic_counter_ops : ExtensionOperand<22, []>; +defm SPV_EXT_shader_stencil_export : ExtensionOperand<23, [EnvVulkan]>; +defm SPV_EXT_shader_viewport_index_layer : ExtensionOperand<24, [EnvVulkan]>; +defm SPV_AMD_shader_image_load_store_lod : ExtensionOperand<25, [EnvVulkan]>; +defm SPV_AMD_shader_fragment_mask : ExtensionOperand<26, [EnvVulkan]>; +defm SPV_EXT_fragment_fully_covered : ExtensionOperand<27, [EnvVulkan]>; +defm SPV_AMD_gpu_shader_half_float_fetch : ExtensionOperand<28, [EnvVulkan]>; +defm SPV_GOOGLE_decorate_string : ExtensionOperand<29, [EnvVulkan]>; +defm SPV_GOOGLE_hlsl_functionality1 : ExtensionOperand<30, [EnvVulkan]>; +defm SPV_NV_shader_subgroup_partitioned : ExtensionOperand<31, [EnvVulkan]>; +defm SPV_EXT_descriptor_indexing : ExtensionOperand<32, [EnvVulkan]>; +defm SPV_KHR_8bit_storage : ExtensionOperand<33, [EnvVulkan]>; +defm SPV_KHR_vulkan_memory_model : ExtensionOperand<34, [EnvVulkan]>; +defm SPV_NV_ray_tracing : ExtensionOperand<35, [EnvVulkan]>; +defm SPV_NV_compute_shader_derivatives : ExtensionOperand<36, [EnvVulkan]>; +defm SPV_NV_fragment_shader_barycentric : ExtensionOperand<37, [EnvVulkan]>; +defm SPV_NV_mesh_shader : ExtensionOperand<38, [EnvVulkan]>; +defm SPV_NV_shader_image_footprint : 
ExtensionOperand<39, [EnvVulkan]>; +defm SPV_NV_shading_rate : ExtensionOperand<40, [EnvVulkan]>; +defm SPV_INTEL_subgroups : ExtensionOperand<41, [EnvOpenCL]>; +defm SPV_INTEL_media_block_io : ExtensionOperand<42, [EnvOpenCL]>; +defm SPV_EXT_fragment_invocation_density : ExtensionOperand<44, [EnvVulkan]>; +defm SPV_KHR_no_integer_wrap_decoration : ExtensionOperand<45, [EnvOpenCL]>; +defm SPV_KHR_float_controls : ExtensionOperand<46, [EnvVulkan, EnvOpenCL]>; +defm SPV_EXT_physical_storage_buffer : ExtensionOperand<47, [EnvVulkan]>; +defm SPV_INTEL_fpga_memory_attributes : ExtensionOperand<48, [EnvOpenCL]>; +defm SPV_NV_cooperative_matrix : ExtensionOperand<49, [EnvVulkan]>; +defm SPV_INTEL_shader_integer_functions2 + : ExtensionOperand<50, [EnvVulkan, EnvOpenCL]>; +defm SPV_INTEL_fpga_loop_controls : ExtensionOperand<51, [EnvOpenCL]>; +defm SPV_EXT_fragment_shader_interlock : ExtensionOperand<52, [EnvVulkan]>; +defm SPV_NV_shader_sm_builtins : ExtensionOperand<53, [EnvVulkan]>; +defm SPV_KHR_shader_clock : ExtensionOperand<54, [EnvVulkan, EnvOpenCL]>; +defm SPV_INTEL_unstructured_loop_controls : ExtensionOperand<55, [EnvOpenCL]>; +defm SPV_EXT_demote_to_helper_invocation : ExtensionOperand<56, [EnvVulkan]>; +defm SPV_INTEL_fpga_reg : ExtensionOperand<57, [EnvOpenCL]>; +defm SPV_INTEL_blocking_pipes : ExtensionOperand<58, [EnvOpenCL]>; +defm SPV_GOOGLE_user_type : ExtensionOperand<59, [EnvVulkan]>; +defm SPV_KHR_physical_storage_buffer : ExtensionOperand<60, [EnvVulkan]>; +defm SPV_INTEL_kernel_attributes : ExtensionOperand<61, [EnvOpenCL]>; +defm SPV_KHR_non_semantic_info : ExtensionOperand<62, [EnvVulkan, EnvOpenCL]>; +defm SPV_INTEL_io_pipes : ExtensionOperand<63, [EnvOpenCL]>; +defm SPV_KHR_ray_tracing : ExtensionOperand<64, [EnvVulkan]>; +defm SPV_KHR_ray_query : ExtensionOperand<65, [EnvVulkan]>; +defm SPV_INTEL_fpga_memory_accesses : ExtensionOperand<66, [EnvOpenCL]>; +defm SPV_INTEL_arbitrary_precision_integers : ExtensionOperand<67, [EnvOpenCL]>; +defm SPV_EXT_shader_atomic_float_add + : ExtensionOperand<68, [EnvVulkan, EnvOpenCL]>; +defm SPV_KHR_terminate_invocation : ExtensionOperand<69, [EnvVulkan]>; +defm SPV_KHR_fragment_shading_rate : ExtensionOperand<70, [EnvVulkan]>; +defm SPV_EXT_shader_image_int64 : ExtensionOperand<71, [EnvVulkan]>; +defm SPV_INTEL_fp_fast_math_mode : ExtensionOperand<72, [EnvOpenCL]>; +defm SPV_INTEL_fpga_cluster_attributes : ExtensionOperand<73, [EnvOpenCL]>; +defm SPV_INTEL_loop_fuse : ExtensionOperand<74, [EnvOpenCL]>; +defm SPV_EXT_shader_atomic_float_min_max + : ExtensionOperand<75, [EnvVulkan, EnvOpenCL]>; +defm SPV_KHR_workgroup_memory_explicit_layout + : ExtensionOperand<76, [EnvVulkan]>; +defm SPV_KHR_linkonce_odr : ExtensionOperand<77, [EnvOpenCL]>; +defm SPV_KHR_expect_assume : ExtensionOperand<78, [EnvVulkan, EnvOpenCL]>; +defm SPV_INTEL_fpga_dsp_control : ExtensionOperand<79, [EnvOpenCL]>; +defm SPV_NV_bindless_texture : ExtensionOperand<80, [EnvVulkan]>; +defm SPV_INTEL_fpga_invocation_pipelining_attributes + : ExtensionOperand<81, [EnvOpenCL]>; +defm SPV_KHR_subgroup_uniform_control_flow : ExtensionOperand<82, [EnvVulkan]>; +defm SPV_HUAWEI_subpass_shading : ExtensionOperand<83, [EnvVulkan]>; +defm SPV_KHR_integer_dot_product : ExtensionOperand<84, [EnvVulkan, EnvOpenCL]>; +defm SPV_EXT_shader_atomic_float16_add + : ExtensionOperand<85, [EnvVulkan, EnvOpenCL]>; +defm SPV_INTEL_runtime_aligned : ExtensionOperand<86, [EnvOpenCL]>; +defm SPV_KHR_bit_instructions : ExtensionOperand<87, [EnvOpenCL]>; +defm SPV_NV_ray_tracing_motion_blur : 
ExtensionOperand<88, [EnvVulkan]>; +defm SPV_KHR_uniform_group_instructions : ExtensionOperand<89, [EnvOpenCL]>; +defm SPV_KHR_subgroup_rotate : ExtensionOperand<90, [EnvVulkan, EnvOpenCL]>; +defm SPV_INTEL_split_barrier : ExtensionOperand<91, [EnvOpenCL]>; +defm SPV_KHR_ray_cull_mask : ExtensionOperand<92, [EnvVulkan]>; +defm SPV_KHR_fragment_shader_barycentric : ExtensionOperand<93, [EnvVulkan]>; +defm SPV_EXT_relaxed_printf_string_address_space + : ExtensionOperand<94, [EnvOpenCL]>; +defm SPV_EXT_mesh_shader : ExtensionOperand<96, [EnvVulkan]>; +defm SPV_ARM_core_builtins : ExtensionOperand<97, [EnvVulkan]>; +defm SPV_EXT_opacity_micromap : ExtensionOperand<98, [EnvVulkan]>; +defm SPV_NV_shader_invocation_reorder : ExtensionOperand<99, [EnvVulkan]>; +defm SPV_INTEL_usm_storage_classes : ExtensionOperand<100, [EnvOpenCL]>; +defm SPV_INTEL_fpga_latency_control : ExtensionOperand<101, [EnvOpenCL]>; +defm SPV_INTEL_fpga_argument_interfaces : ExtensionOperand<102, [EnvOpenCL]>; +defm SPV_INTEL_optnone : ExtensionOperand<103, [EnvOpenCL]>; +defm SPV_INTEL_function_pointers : ExtensionOperand<104, [EnvOpenCL]>; +defm SPV_INTEL_variable_length_array : ExtensionOperand<105, [EnvOpenCL]>; +defm SPV_INTEL_bfloat16_conversion : ExtensionOperand<106, [EnvOpenCL]>; +defm SPV_INTEL_inline_assembly : ExtensionOperand<107, [EnvOpenCL]>; +defm SPV_INTEL_cache_controls : ExtensionOperand<108, [EnvOpenCL]>; +defm SPV_INTEL_global_variable_host_access : ExtensionOperand<109, [EnvOpenCL]>; +defm SPV_INTEL_global_variable_fpga_decorations + : ExtensionOperand<110, [EnvOpenCL]>; +defm SPV_KHR_cooperative_matrix : ExtensionOperand<111, [EnvVulkan, EnvOpenCL]>; +defm SPV_EXT_arithmetic_fence : ExtensionOperand<112, [EnvOpenCL]>; +defm SPV_EXT_optnone : ExtensionOperand<113, [EnvOpenCL]>; +defm SPV_INTEL_joint_matrix : ExtensionOperand<114, [EnvOpenCL]>; +defm SPV_INTEL_float_controls2 : ExtensionOperand<115, [EnvOpenCL]>; +defm SPV_INTEL_bindless_images : ExtensionOperand<116, [EnvOpenCL]>; +defm SPV_INTEL_long_composites : ExtensionOperand<117, [EnvOpenCL]>; +defm SPV_INTEL_memory_access_aliasing : ExtensionOperand<118, [EnvOpenCL]>; +defm SPV_INTEL_fp_max_error : ExtensionOperand<119, [EnvOpenCL]>; +defm SPV_INTEL_ternary_bitwise_function : ExtensionOperand<120, [EnvOpenCL]>; +defm SPV_INTEL_subgroup_matrix_multiply_accumulate + : ExtensionOperand<121, [EnvOpenCL]>; +defm SPV_INTEL_2d_block_io : ExtensionOperand<122, [EnvOpenCL]>; +defm SPV_INTEL_int4 : ExtensionOperand<123, [EnvOpenCL]>; +defm SPV_KHR_float_controls2 : ExtensionOperand<124, [EnvVulkan, EnvOpenCL]>; +defm SPV_INTEL_tensor_float32_conversion : ExtensionOperand<125, [EnvOpenCL]>; //===----------------------------------------------------------------------===// // Multiclass used to define Capabilities enum values and at the same time @@ -342,7 +403,9 @@ class Capability<string name, bits<32> value> { multiclass CapabilityOperand<bits<32> value, bits<32> minVersion, bits<32> maxVersion, list<Extension> reqExtensions, list<Capability> reqCapabilities> { def NAME : Capability<NAME, value>; - defm : SymbolicOperandWithRequirements<CapabilityOperand, value, NAME, minVersion, maxVersion, reqExtensions, reqCapabilities>; + defm : SymbolicOperandWithRequirements<CapabilityOperand, value, NAME, + minVersion, maxVersion, reqExtensions, + reqCapabilities, []>; } defm Matrix : CapabilityOperand<0, 0, 0, [], []>; @@ -551,7 +614,8 @@ class SourceLanguage<string name, bits<32> value> { multiclass SourceLanguageOperand<bits<32> value> { def : 
SourceLanguage<NAME, value>; - defm : SymbolicOperandWithRequirements<SourceLanguageOperand, value, NAME, 0, 0, [], []>; + defm : SymbolicOperandWithRequirements<SourceLanguageOperand, value, NAME, 0, + 0, [], [], []>; } defm Unknown : SourceLanguageOperand<0>; @@ -580,7 +644,8 @@ class AddressingModel<string name, bits<32> value> { multiclass AddressingModelOperand<bits<32> value, list<Capability> reqCapabilities> { def : AddressingModel<NAME, value>; - defm : SymbolicOperandWithRequirements<AddressingModelOperand, value, NAME, 0, 0, [], reqCapabilities>; + defm : SymbolicOperandWithRequirements<AddressingModelOperand, value, NAME, 0, + 0, [], reqCapabilities, []>; } defm Logical : AddressingModelOperand<0, []>; @@ -607,7 +672,8 @@ class ExecutionModel<string name, bits<32> value> { multiclass ExecutionModelOperand<bits<32> value, list<Capability> reqCapabilities> { def : ExecutionModel<NAME, value>; - defm : SymbolicOperandWithRequirements<ExecutionModelOperand, value, NAME, 0, 0, [], reqCapabilities>; + defm : SymbolicOperandWithRequirements<ExecutionModelOperand, value, NAME, 0, + 0, [], reqCapabilities, []>; } defm Vertex : ExecutionModelOperand<0, [Shader]>; @@ -645,7 +711,8 @@ class MemoryModel<string name, bits<32> value> { multiclass MemoryModelOperand<bits<32> value, list<Capability> reqCapabilities> { def : MemoryModel<NAME, value>; - defm : SymbolicOperandWithRequirements<MemoryModelOperand, value, NAME, 0, 0, [], reqCapabilities>; + defm : SymbolicOperandWithRequirements<MemoryModelOperand, value, NAME, 0, + 0, [], reqCapabilities, []>; } defm Simple : MemoryModelOperand<0, [Shader]>; @@ -672,7 +739,8 @@ class ExecutionMode<string name, bits<32> value> { multiclass ExecutionModeOperand<bits<32> value, list<Capability> reqCapabilities> { def : ExecutionMode<NAME, value>; - defm : SymbolicOperandWithRequirements<ExecutionModeOperand, value, NAME, 0, 0, [], reqCapabilities>; + defm : SymbolicOperandWithRequirements<ExecutionModeOperand, value, NAME, 0, + 0, [], reqCapabilities, []>; } defm Invocations : ExecutionModeOperand<0, [Geometry]>; @@ -748,7 +816,8 @@ class StorageClass<string name, bits<32> value> { multiclass StorageClassOperand<bits<32> value, list<Extension> reqExtensions, list<Capability> reqCapabilities> { def : StorageClass<NAME, value>; - defm : SymbolicOperandWithRequirements<StorageClassOperand, value, NAME, 0, 0, reqExtensions, reqCapabilities>; + defm : SymbolicOperandWithRequirements<StorageClassOperand, value, NAME, 0, 0, + reqExtensions, reqCapabilities, []>; } defm UniformConstant : StorageClassOperand<0, [], []>; @@ -794,7 +863,8 @@ class Dim<string name, bits<32> value> { multiclass DimOperand<bits<32> value, string mnemonic, list<Capability> reqCapabilities> { def NAME : Dim<NAME, value>; - defm : SymbolicOperandWithRequirements<DimOperand, value, mnemonic, 0, 0, [], reqCapabilities>; + defm : SymbolicOperandWithRequirements<DimOperand, value, mnemonic, 0, 0, [], + reqCapabilities, []>; } defm DIM_1D : DimOperand<0, "1D", [Sampled1D, Image1D]>; @@ -824,7 +894,8 @@ class SamplerAddressingMode<string name, bits<32> value> { multiclass SamplerAddressingModeOperand<bits<32> value, list<Capability> reqCapabilities> { def : SamplerAddressingMode<NAME, value>; - defm : SymbolicOperandWithRequirements<SamplerAddressingModeOperand, value, NAME, 0, 0, [], reqCapabilities>; + defm : SymbolicOperandWithRequirements<SamplerAddressingModeOperand, value, + NAME, 0, 0, [], reqCapabilities, []>; } defm None : SamplerAddressingModeOperand<0, [Kernel]>; @@ -852,7 +923,8 
@@ class SamplerFilterMode<string name, bits<32> value> { multiclass SamplerFilterModeOperand<bits<32> value, list<Capability> reqCapabilities> { def : SamplerFilterMode<NAME, value>; - defm : SymbolicOperandWithRequirements<SamplerFilterModeOperand, value, NAME, 0, 0, [], reqCapabilities>; + defm : SymbolicOperandWithRequirements<SamplerFilterModeOperand, value, NAME, + 0, 0, [], reqCapabilities, []>; } defm Nearest : SamplerFilterModeOperand<0, [Kernel]>; @@ -877,7 +949,8 @@ class ImageFormat<string name, bits<32> value> { multiclass ImageFormatOperand<bits<32> value, list<Capability> reqCapabilities> { def NAME : ImageFormat<NAME, value>; - defm : SymbolicOperandWithRequirements<ImageFormatOperand, value, NAME, 0, 0, [], reqCapabilities>; + defm : SymbolicOperandWithRequirements<ImageFormatOperand, value, NAME, 0, + 0, [], reqCapabilities, []>; } defm Unknown : ImageFormatOperand<0, []>; @@ -940,7 +1013,8 @@ class ImageChannelOrder<string name, bits<32> value> { multiclass ImageChannelOrderOperand<bits<32> value, list<Capability> reqCapabilities> { def : ImageChannelOrder<NAME, value>; - defm : SymbolicOperandWithRequirements<ImageChannelOrderOperand, value, NAME, 0, 0, [], reqCapabilities>; + defm : SymbolicOperandWithRequirements<ImageChannelOrderOperand, value, NAME, + 0, 0, [], reqCapabilities, []>; } defm R : ImageChannelOrderOperand<0, [Kernel]>; @@ -983,7 +1057,8 @@ class ImageChannelDataType<string name, bits<32> value> { multiclass ImageChannelDataTypeOperand<bits<32> value, list<Capability> reqCapabilities> { def : ImageChannelDataType<NAME, value>; - defm : SymbolicOperandWithRequirements<ImageChannelDataTypeOperand, value, NAME, 0, 0, [], reqCapabilities>; + defm : SymbolicOperandWithRequirements<ImageChannelDataTypeOperand, value, + NAME, 0, 0, [], reqCapabilities, []>; } defm SnormInt8 : ImageChannelDataTypeOperand<0, []>; @@ -1023,7 +1098,8 @@ class ImageOperand<string name, bits<32> value> { multiclass ImageOperandOperand<bits<32> value, list<Capability> reqCapabilities> { def : ImageOperand<NAME, value>; - defm : SymbolicOperandWithRequirements<ImageOperandOperand, value, NAME, 0, 0, [], reqCapabilities>; + defm : SymbolicOperandWithRequirements<ImageOperandOperand, value, NAME, 0, + 0, [], reqCapabilities, []>; } defm None : ImageOperandOperand<0x0, []>; @@ -1061,7 +1137,8 @@ class FPFastMathMode<string name, bits<32> value> { multiclass FPFastMathModeOperand<bits<32> value, list<Capability> reqCapabilities> { def : FPFastMathMode<NAME, value>; - defm : SymbolicOperandWithRequirements<FPFastMathModeOperand, value, NAME, 0, 0, [], reqCapabilities>; + defm : SymbolicOperandWithRequirements<FPFastMathModeOperand, value, NAME, 0, + 0, [], reqCapabilities, []>; } defm None : FPFastMathModeOperand<0x0, []>; @@ -1090,7 +1167,8 @@ class FPRoundingMode<string name, bits<32> value> { multiclass FPRoundingModeOperand<bits<32> value> { def NAME : FPRoundingMode<NAME, value>; - defm : SymbolicOperandWithRequirements<FPRoundingModeOperand, value, NAME, 0, 0, [], []>; + defm : SymbolicOperandWithRequirements<FPRoundingModeOperand, value, NAME, 0, + 0, [], [], []>; } defm RTE : FPRoundingModeOperand<0>; @@ -1117,7 +1195,8 @@ class LinkageType<string name, bits<32> value> { multiclass LinkageTypeOperand<bits<32> value, list<Capability> reqCapabilities> { def : LinkageType<NAME, value>; - defm : SymbolicOperandWithRequirements<LinkageTypeOperand, value, NAME, 0, 0, [], reqCapabilities>; + defm : SymbolicOperandWithRequirements<LinkageTypeOperand, value, NAME, 0, + 0, [], 
reqCapabilities, []>; } defm Export : LinkageTypeOperand<0, [Linkage]>; @@ -1143,7 +1222,8 @@ class AccessQualifier<string name, bits<32> value> { multiclass AccessQualifierOperand<bits<32> value, list<Capability> reqCapabilities> { def NAME : AccessQualifier<NAME, value>; - defm : SymbolicOperandWithRequirements<AccessQualifierOperand, value, NAME, 0, 0, [], reqCapabilities>; + defm : SymbolicOperandWithRequirements<AccessQualifierOperand, value, NAME, 0, + 0, [], reqCapabilities, []>; } defm ReadOnly : AccessQualifierOperand<0, [Kernel]>; @@ -1170,7 +1250,9 @@ class FunctionParameterAttribute<string name, bits<32> value> { multiclass FunctionParameterAttributeOperand<bits<32> value, list<Capability> reqCapabilities> { def : FunctionParameterAttribute<NAME, value>; - defm : SymbolicOperandWithRequirements<FunctionParameterAttributeOperand, value, NAME, 0, 0, [], reqCapabilities>; + defm : SymbolicOperandWithRequirements<FunctionParameterAttributeOperand, + value, NAME, 0, 0, [], + reqCapabilities, []>; } defm Zext : FunctionParameterAttributeOperand<0, [Kernel]>; @@ -1202,7 +1284,9 @@ class Decoration<string name, bits<32> value> { multiclass DecorationOperand<bits<32> value, bits<32> minVersion, bits<32> maxVersion, list<Extension> reqExtensions, list<Capability> reqCapabilities> { def : Decoration<NAME, value>; - defm : SymbolicOperandWithRequirements<DecorationOperand, value, NAME, minVersion, maxVersion, reqExtensions, reqCapabilities>; + defm : SymbolicOperandWithRequirements<DecorationOperand, value, NAME, + minVersion, maxVersion, reqExtensions, + reqCapabilities, []>; } defm RelaxedPrecision : DecorationOperand<0, 0, 0, [], [Shader]>; @@ -1303,7 +1387,9 @@ class BuiltIn<string name, bits<32> value> { multiclass BuiltInOperand<bits<32> value, bits<32> minVersion, bits<32> maxVersion, list<Extension> reqExtensions, list<Capability> reqCapabilities> { def NAME : BuiltIn<NAME, value>; - defm : SymbolicOperandWithRequirements<BuiltInOperand, value, NAME, minVersion, maxVersion, reqExtensions, reqCapabilities>; + defm : SymbolicOperandWithRequirements<BuiltInOperand, value, NAME, + minVersion, maxVersion, reqExtensions, + reqCapabilities, []>; } defm Position : BuiltInOperand<0, 0, 0, [], [Shader]>; @@ -1417,7 +1503,8 @@ class SelectionControl<string name, bits<32> value> { multiclass SelectionControlOperand<bits<32> value> { def : SelectionControl<NAME, value>; - defm : SymbolicOperandWithRequirements<SelectionControlOperand, value, NAME, 0, 0, [], []>; + defm : SymbolicOperandWithRequirements<SelectionControlOperand, value, NAME, + 0, 0, [], [], []>; } defm None : SelectionControlOperand<0x0>; @@ -1443,7 +1530,8 @@ class LoopControl<string name, bits<32> value> { multiclass LoopControlOperand<bits<32> value> { def : LoopControl<NAME, value>; - defm : SymbolicOperandWithRequirements<LoopControlOperand, value, NAME, 0, 0, [], []>; + defm : SymbolicOperandWithRequirements<LoopControlOperand, value, NAME, 0, + 0, [], [], []>; } defm None : LoopControlOperand<0x0>; @@ -1476,7 +1564,8 @@ class FunctionControl<string name, bits<32> value> { multiclass FunctionControlOperand<bits<32> value> { def : FunctionControl<NAME, value>; - defm : SymbolicOperandWithRequirements<FunctionControlOperand, value, NAME, 0, 0, [], []>; + defm : SymbolicOperandWithRequirements<FunctionControlOperand, value, NAME, 0, + 0, [], [], []>; } defm None : FunctionControlOperand<0x0>; @@ -1506,7 +1595,9 @@ class MemorySemantics<string name, bits<32> value> { multiclass MemorySemanticsOperand<bits<32> value, bits<32> 
minVersion, bits<32> maxVersion, list<Extension> reqExtensions, list<Capability> reqCapabilities> { def : MemorySemantics<NAME, value>; - defm : SymbolicOperandWithRequirements<MemorySemanticsOperand, value, NAME, minVersion, maxVersion, reqExtensions, reqCapabilities>; + defm : SymbolicOperandWithRequirements<MemorySemanticsOperand, value, NAME, + minVersion, maxVersion, reqExtensions, + reqCapabilities, []>; } defm None : MemorySemanticsOperand<0x0, 0, 0, [], []>; @@ -1544,7 +1635,9 @@ class MemoryOperand<string name, bits<32> value> { multiclass MemoryOperandOperand<bits<32> value, bits<32> minVersion, bits<32> maxVersion, list<Extension> reqExtensions, list<Capability> reqCapabilities> { def : MemoryOperand<NAME, value>; - defm : SymbolicOperandWithRequirements<MemoryOperandOperand, value, NAME, minVersion, maxVersion, reqExtensions, reqCapabilities>; + defm : SymbolicOperandWithRequirements<MemoryOperandOperand, value, NAME, + minVersion, maxVersion, reqExtensions, + reqCapabilities, []>; } defm None : MemoryOperandOperand<0x0, 0, 0, [], []>; @@ -1577,7 +1670,9 @@ class Scope<string name, bits<32> value> { multiclass ScopeOperand<bits<32> value, bits<32> minVersion, bits<32> maxVersion, list<Extension> reqExtensions, list<Capability> reqCapabilities> { def : Scope<NAME, value>; - defm : SymbolicOperandWithRequirements<ScopeOperand, value, NAME, minVersion, maxVersion, reqExtensions, reqCapabilities>; + defm : SymbolicOperandWithRequirements<ScopeOperand, value, NAME, minVersion, + maxVersion, reqExtensions, + reqCapabilities, []>; } defm CrossDevice : ScopeOperand<0, 0, 0, [], []>; @@ -1607,7 +1702,9 @@ class GroupOperation<string name, bits<32> value> { multiclass GroupOperationOperand<bits<32> value, bits<32> minVersion, bits<32> maxVersion, list<Extension> reqExtensions, list<Capability> reqCapabilities> { def NAME : GroupOperation<NAME, value>; - defm : SymbolicOperandWithRequirements<GroupOperationOperand, value, NAME, minVersion, maxVersion, reqExtensions, reqCapabilities>; + defm : SymbolicOperandWithRequirements<GroupOperationOperand, value, NAME, + minVersion, maxVersion, reqExtensions, + reqCapabilities, []>; } defm Reduce : GroupOperationOperand<0, 0, 0, [], [Kernel, GroupNonUniformArithmetic, GroupNonUniformBallot]>; @@ -1638,7 +1735,9 @@ class KernelEnqueueFlags<string name, bits<32> value> { multiclass KernelEnqueueFlagsOperand<bits<32> value, bits<32> minVersion, bits<32> maxVersion, list<Extension> reqExtensions, list<Capability> reqCapabilities> { def : KernelEnqueueFlags<NAME, value>; - defm : SymbolicOperandWithRequirements<KernelEnqueueFlagsOperand, value, NAME, minVersion, maxVersion, reqExtensions, reqCapabilities>; + defm : SymbolicOperandWithRequirements<KernelEnqueueFlagsOperand, value, NAME, + minVersion, maxVersion, reqExtensions, + reqCapabilities, []>; } defm NoWait : KernelEnqueueFlagsOperand<0, 0, 0, [], [Kernel]>; @@ -1665,7 +1764,9 @@ class KernelProfilingInfo<string name, bits<32> value> { multiclass KernelProfilingInfoOperand<bits<32> value, bits<32> minVersion, bits<32> maxVersion, list<Extension> reqExtensions, list<Capability> reqCapabilities> { def : KernelProfilingInfo<NAME, value>; - defm : SymbolicOperandWithRequirements<KernelProfilingInfoOperand, value, NAME, minVersion, maxVersion, reqExtensions, reqCapabilities>; + defm : SymbolicOperandWithRequirements<KernelProfilingInfoOperand, value, + NAME, minVersion, maxVersion, + reqExtensions, reqCapabilities, []>; } defm None : KernelProfilingInfoOperand<0x0, 0, 0, [], []>; @@ -1690,7 +1791,8 @@ 
class Opcode<string name, bits<32> value> { multiclass OpcodeOperand<bits<32> value> { def : Opcode<NAME, value>; - defm : SymbolicOperandWithRequirements<OpcodeOperand, value, NAME, 0, 0, [], []>; + defm : SymbolicOperandWithRequirements<OpcodeOperand, value, NAME, 0, + 0, [], [], []>; } // TODO: implement other mnemonics. defm InBoundsAccessChain : OpcodeOperand<66>; @@ -1720,7 +1822,9 @@ class CooperativeMatrixLayout<string name, bits<32> value> { multiclass CooperativeMatrixLayoutOperand<bits<32> value, list<Extension> reqExtensions, list<Capability> reqCapabilities> { def : CooperativeMatrixLayout<NAME, value>; - defm : SymbolicOperandWithRequirements<CooperativeMatrixLayoutOperand, value, NAME, 0, 0, reqExtensions, reqCapabilities>; + defm : SymbolicOperandWithRequirements<CooperativeMatrixLayoutOperand, value, + NAME, 0, 0, reqExtensions, + reqCapabilities, []>; } defm RowMajorKHR : CooperativeMatrixLayoutOperand<0x0, [SPV_KHR_cooperative_matrix], [CooperativeMatrixKHR]>; @@ -1747,7 +1851,9 @@ class CooperativeMatrixOperands<string name, bits<32> value> { multiclass CooperativeMatrixOperandsOperand<bits<32> value, list<Extension> reqExtensions, list<Capability> reqCapabilities> { def : CooperativeMatrixOperands<NAME, value>; - defm : SymbolicOperandWithRequirements<CooperativeMatrixOperandsOperand, value, NAME, 0, 0, reqExtensions, reqCapabilities>; + defm : SymbolicOperandWithRequirements<CooperativeMatrixOperandsOperand, + value, NAME, 0, 0, reqExtensions, + reqCapabilities, []>; } defm NoneKHR : CooperativeMatrixOperandsOperand<0x0, [SPV_KHR_cooperative_matrix], [CooperativeMatrixKHR]>; @@ -1780,7 +1886,9 @@ class SpecConstantOpOperands<string name, bits<32> value> { multiclass SpecConstantOpOperandsOperand<bits<32> value, list<Extension> reqExtensions, list<Capability> reqCapabilities> { def : SpecConstantOpOperands<NAME, value>; - defm : SymbolicOperandWithRequirements<SpecConstantOpOperandsOperand, value, NAME, 0, 0, reqExtensions, reqCapabilities>; + defm : SymbolicOperandWithRequirements<SpecConstantOpOperandsOperand, value, + NAME, 0, 0, reqExtensions, + reqCapabilities, []>; } // Conversion @@ -1868,7 +1976,9 @@ class MatrixMultiplyAccumulateOperands<string name, bits<32> value> { multiclass MatrixMultiplyAccumulateOperandsOperand<bits<32> value, list<Extension> reqExtensions> { def : MatrixMultiplyAccumulateOperands<NAME, value>; - defm : SymbolicOperandWithRequirements<MatrixMultiplyAccumulateOperandsOperand, value, NAME, 0, 0, reqExtensions, []>; + defm : SymbolicOperandWithRequirements< + MatrixMultiplyAccumulateOperandsOperand, value, NAME, 0, 0, + reqExtensions, [], []>; } defm None : MatrixMultiplyAccumulateOperandsOperand<0x0, [SPV_INTEL_subgroup_matrix_multiply_accumulate]>; diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/llvm/lib/Target/Sparc/SparcISelLowering.cpp index c0fc3a6..dd22132 100644 --- a/llvm/lib/Target/Sparc/SparcISelLowering.cpp +++ b/llvm/lib/Target/Sparc/SparcISelLowering.cpp @@ -1799,12 +1799,14 @@ SparcTargetLowering::SparcTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FCOS , MVT::f64, Expand); setOperationAction(ISD::FSINCOS, MVT::f64, Expand); setOperationAction(ISD::FREM , MVT::f64, Expand); - setOperationAction(ISD::FMA , MVT::f64, Expand); + setOperationAction(ISD::FMA, MVT::f64, + Subtarget->isUA2007() ? 
Legal : Expand); setOperationAction(ISD::FSIN , MVT::f32, Expand); setOperationAction(ISD::FCOS , MVT::f32, Expand); setOperationAction(ISD::FSINCOS, MVT::f32, Expand); setOperationAction(ISD::FREM , MVT::f32, Expand); - setOperationAction(ISD::FMA, MVT::f32, Expand); + setOperationAction(ISD::FMA, MVT::f32, + Subtarget->isUA2007() ? Legal : Expand); setOperationAction(ISD::ROTL , MVT::i32, Expand); setOperationAction(ISD::ROTR , MVT::i32, Expand); setOperationAction(ISD::BSWAP, MVT::i32, Expand); @@ -2278,21 +2280,15 @@ SDValue SparcTargetLowering::LowerF128_LibCallArg(SDValue Chain, EVT ArgVT = Arg.getValueType(); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); - ArgListEntry Entry; - Entry.Node = Arg; - Entry.Ty = ArgTy; - if (ArgTy->isFP128Ty()) { // Create a stack object and pass the pointer to the library function. int FI = MFI.CreateStackObject(16, Align(8), false); SDValue FIPtr = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); - Chain = DAG.getStore(Chain, DL, Entry.Node, FIPtr, MachinePointerInfo(), - Align(8)); - - Entry.Node = FIPtr; - Entry.Ty = PointerType::getUnqual(ArgTy->getContext()); + Chain = DAG.getStore(Chain, DL, Arg, FIPtr, MachinePointerInfo(), Align(8)); + Args.emplace_back(FIPtr, PointerType::getUnqual(ArgTy->getContext())); + } else { + Args.emplace_back(Arg, ArgTy); } - Args.push_back(Entry); return Chain; } @@ -2314,11 +2310,9 @@ SparcTargetLowering::LowerF128Op(SDValue Op, SelectionDAG &DAG, if (RetTy->isFP128Ty()) { // Create a Stack Object to receive the return value of type f128. - ArgListEntry Entry; int RetFI = MFI.CreateStackObject(16, Align(8), false); RetPtr = DAG.getFrameIndex(RetFI, PtrVT); - Entry.Node = RetPtr; - Entry.Ty = PointerType::getUnqual(RetTy->getContext()); + ArgListEntry Entry(RetPtr, PointerType::getUnqual(RetTy->getContext())); if (!Subtarget->is64Bit()) { Entry.IsSRet = true; Entry.IndirectType = RetTy; @@ -3550,6 +3544,11 @@ bool SparcTargetLowering::isCheapToSpeculateCttz(Type *Ty) const { return isCheapToSpeculateCtlz(Ty); } +bool SparcTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, + EVT VT) const { + return Subtarget->isUA2007() && !Subtarget->useSoftFloat(); +} + // Override to disable global variable loading on Linux. void SparcTargetLowering::insertSSPDeclarations(Module &M) const { if (!Subtarget->isTargetLinux()) diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.h b/llvm/lib/Target/Sparc/SparcISelLowering.h index 0d220f8..7fffb7c 100644 --- a/llvm/lib/Target/Sparc/SparcISelLowering.h +++ b/llvm/lib/Target/Sparc/SparcISelLowering.h @@ -28,6 +28,8 @@ namespace llvm { bool useSoftFloat() const override; + bool softPromoteHalfType() const override { return true; } + /// computeKnownBitsForTargetNode - Determine which of the bits specified /// in Mask are known to be either zero or one and return them in the /// KnownZero/KnownOne bitsets. @@ -177,6 +179,11 @@ namespace llvm { bool isCheapToSpeculateCttz(Type *Ty) const override; + bool enableAggressiveFMAFusion(EVT VT) const override { return true; }; + + bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, + EVT VT) const override; + bool shouldInsertFencesForAtomic(const Instruction *I) const override { // FIXME: We insert fences for each atomics and generate // sub-optimal code for PSO/TSO. 
(Approximately nobody uses any diff --git a/llvm/lib/Target/Sparc/SparcInstrUAOSA.td b/llvm/lib/Target/Sparc/SparcInstrUAOSA.td index 3a30e55..ffd4423 100644 --- a/llvm/lib/Target/Sparc/SparcInstrUAOSA.td +++ b/llvm/lib/Target/Sparc/SparcInstrUAOSA.td @@ -66,3 +66,15 @@ defm CXBCOND : F2_56<"cxb", 1>; def FPMADDX : FourOp<"fpmaddx", 0b110111, 0b0000, DFPRegs>; def FPMADDXHI : FourOp<"fpmaddxhi", 0b110111, 0b0100, DFPRegs>; } // Predicates = [HasOSA2011] + +// UA2007 instruction patterns. +let Predicates = [HasUA2007] in { +def : Pat<(f32 (any_fma f32:$rs1, f32:$rs2, f32:$add)), (FMADDS $rs1, $rs2, $add)>; +def : Pat<(f64 (any_fma f64:$rs1, f64:$rs2, f64:$add)), (FMADDD $rs1, $rs2, $add)>; +def : Pat<(f32 (any_fma f32:$rs1, f32:$rs2, (fneg f32:$sub))), (FMSUBS $rs1, $rs2, $sub)>; +def : Pat<(f64 (any_fma f64:$rs1, f64:$rs2, (fneg f64:$sub))), (FMSUBD $rs1, $rs2, $sub)>; +def : Pat<(f32 (fneg (any_fma f32:$rs1, f32:$rs2, f32:$add))), (FNMADDS $rs1, $rs2, $add)>; +def : Pat<(f64 (fneg (any_fma f64:$rs1, f64:$rs2, f64:$add))), (FNMADDD $rs1, $rs2, $add)>; +def : Pat<(f32 (fneg (any_fma f32:$rs1, f32:$rs2, (fneg f32:$sub)))), (FNMSUBS $rs1, $rs2, $sub)>; +def : Pat<(f64 (fneg (any_fma f64:$rs1, f64:$rs2, (fneg f64:$sub)))), (FNMSUBD $rs1, $rs2, $sub)>; +} // Predicates = [HasUA2007] diff --git a/llvm/lib/Target/SystemZ/SystemZCallingConv.h b/llvm/lib/Target/SystemZ/SystemZCallingConv.h index fbb98ff..f5ffbf5 100644 --- a/llvm/lib/Target/SystemZ/SystemZCallingConv.h +++ b/llvm/lib/Target/SystemZ/SystemZCallingConv.h @@ -29,50 +29,6 @@ namespace SystemZ { extern const MCPhysReg XPLINK64ArgFPRs[XPLINK64NumArgFPRs]; } // end namespace SystemZ -class SystemZCCState : public CCState { -private: - /// Records whether the value was widened from a short vector type. - SmallVector<bool, 4> ArgIsShortVector; - - // Check whether ArgVT is a short vector type. - bool IsShortVectorType(EVT ArgVT) { - return ArgVT.isVector() && ArgVT.getStoreSize() <= 8; - } - -public: - SystemZCCState(CallingConv::ID CC, bool isVarArg, MachineFunction &MF, - SmallVectorImpl<CCValAssign> &locs, LLVMContext &C) - : CCState(CC, isVarArg, MF, locs, C) {} - - void AnalyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Ins, - CCAssignFn Fn) { - // Record whether the call operand was a short vector. - ArgIsShortVector.clear(); - for (unsigned i = 0; i < Ins.size(); ++i) - ArgIsShortVector.push_back(IsShortVectorType(Ins[i].ArgVT)); - - CCState::AnalyzeFormalArguments(Ins, Fn); - } - - void AnalyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs, - CCAssignFn Fn) { - // Record whether the call operand was a short vector. - ArgIsShortVector.clear(); - for (unsigned i = 0; i < Outs.size(); ++i) - ArgIsShortVector.push_back(IsShortVectorType(Outs[i].ArgVT)); - - CCState::AnalyzeCallOperands(Outs, Fn); - } - - // This version of AnalyzeCallOperands in the base class is not usable - // since we must provide a means of accessing ISD::OutputArg::IsShortVector. - void AnalyzeCallOperands(const SmallVectorImpl<MVT> &Outs, - SmallVectorImpl<ISD::ArgFlagsTy> &Flags, - CCAssignFn Fn) = delete; - - bool IsShortVector(unsigned ValNo) { return ArgIsShortVector[ValNo]; } -}; - // Handle i128 argument types. These need to be passed by implicit // reference. 
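A note on the SPARC UA2007 FMA hunks above: making ISD::FMA legal, having isFMAFasterThanFMulAndFAdd return true, and adding the any_fma selection patterns together cover both explicit fma calls and contracted mul/add shapes. The following is a minimal C++ sketch of the source shapes those patterns correspond to (function names are illustrative, not from the patch; fusing the contracted form also requires fp-contract to be enabled):

#include <cmath>

// Explicit FMA: lowers to ISD::FMA, now Legal on UA2007, so it can select
// FMADDS instead of being expanded.
float explicit_fma(float a, float b, float c) { return std::fmaf(a, b, c); }

// Contracted form: the DAG combiner may fuse the multiply and add because
// isFMAFasterThanFMulAndFAdd now returns true (and enableAggressiveFMAFusion
// permits more aggressive fusion).
float contracted(float a, float b, float c) { return a * b + c; }

// Negated shapes matched by the remaining patterns:
float fmsub_shape(float a, float b, float c)  { return std::fmaf(a, b, -c); }  // FMSUBS
float fnmadd_shape(float a, float b, float c) { return -std::fmaf(a, b, c); }  // FNMADDS
float fnmsub_shape(float a, float b, float c) { return -std::fmaf(a, b, -c); } // FNMSUBS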
This could be as simple as the following .td line: // CCIfType<[i128], CCPassIndirect<i64>>, diff --git a/llvm/lib/Target/SystemZ/SystemZCallingConv.td b/llvm/lib/Target/SystemZ/SystemZCallingConv.td index 059f31f..2795de5 100644 --- a/llvm/lib/Target/SystemZ/SystemZCallingConv.td +++ b/llvm/lib/Target/SystemZ/SystemZCallingConv.td @@ -18,7 +18,7 @@ class CCIfSubtarget<string F, CCAction A> // Match if this specific argument was widened from a short vector type. class CCIfShortVector<CCAction A> - : CCIf<"static_cast<SystemZCCState *>(&State)->IsShortVector(ValNo)", A>; + : CCIf<"OrigTy->isVectorTy() && OrigTy->getPrimitiveSizeInBits() <= 64", A>; //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp index 5ee66e3..dcefff9 100644 --- a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp @@ -20,7 +20,6 @@ #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/Function.h" -#include "llvm/IR/Module.h" #include "llvm/Target/TargetMachine.h" using namespace llvm; diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp index fb0a47d..c73dc30 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -1941,7 +1941,7 @@ SDValue SystemZTargetLowering::LowerFormalArguments( // Assign locations to all of the incoming arguments. SmallVector<CCValAssign, 16> ArgLocs; - SystemZCCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); + CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); CCInfo.AnalyzeFormalArguments(Ins, CC_SystemZ); FuncInfo->setSizeOfFnParams(CCInfo.getStackSize()); @@ -2251,7 +2251,7 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI, // Analyze the operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ArgLocs; - SystemZCCState ArgCCInfo(CallConv, IsVarArg, MF, ArgLocs, Ctx); + CCState ArgCCInfo(CallConv, IsVarArg, MF, ArgLocs, Ctx); ArgCCInfo.AnalyzeCallOperands(Outs, CC_SystemZ); // We don't support GuaranteedTailCallOpt, only automatically-detected @@ -2460,10 +2460,9 @@ std::pair<SDValue, SDValue> SystemZTargetLowering::makeExternalCall( TargetLowering::ArgListTy Args; Args.reserve(Ops.size()); - TargetLowering::ArgListEntry Entry; for (SDValue Op : Ops) { - Entry.Node = Op; - Entry.Ty = Entry.Node.getValueType().getTypeForEVT(*DAG.getContext()); + TargetLowering::ArgListEntry Entry( + Op, Op.getValueType().getTypeForEVT(*DAG.getContext())); Entry.IsSExt = shouldSignExtendTypeInLibCall(Entry.Ty, IsSigned); Entry.IsZExt = !Entry.IsSExt; Args.push_back(Entry); diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h index 1866962..707887c 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h @@ -523,7 +523,7 @@ public: bool MathUsed) const override { // Form add and sub with overflow intrinsics regardless of any extra // users of the math result. 
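The SystemZ shouldFormOverflowOp hunk continued on the next line extends overflow-op formation to i128. As a hedged illustration (assuming __int128 support; not code from the patch), this is the add-then-compare idiom the hook lets the middle end collapse into a single overflow intrinsic, now for 128-bit values as well:

// The carry-out test below is the canonical unsigned-overflow check; with
// i128 accepted by shouldFormOverflowOp it can become
// llvm.uadd.with.overflow.i128 instead of staying as a separate add + compare.
bool add_overflows_u128(unsigned __int128 a, unsigned __int128 b,
                        unsigned __int128 *sum) {
  *sum = a + b;
  return *sum < a;
}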
- return VT == MVT::i32 || VT == MVT::i64; + return VT == MVT::i32 || VT == MVT::i64 || VT == MVT::i128; } bool shouldConsiderGEPOffsetSplit() const override { return true; } diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp index f32c9bd..2611c29 100644 --- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp +++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp @@ -436,20 +436,6 @@ bool SystemZTTIImpl::isLSRCostLess( C2.ScaleCost, C2.SetupCost); } -bool SystemZTTIImpl::areInlineCompatible(const Function *Caller, - const Function *Callee) const { - const TargetMachine &TM = getTLI()->getTargetMachine(); - - const FeatureBitset &CallerBits = - TM.getSubtargetImpl(*Caller)->getFeatureBits(); - const FeatureBitset &CalleeBits = - TM.getSubtargetImpl(*Callee)->getFeatureBits(); - - // Support only equal feature bitsets. Restriction should be relaxed in the - // future to allow inlining when callee's bits are subset of the caller's. - return CallerBits == CalleeBits; -} - unsigned SystemZTTIImpl::getNumberOfRegisters(unsigned ClassID) const { bool Vector = (ClassID == 1); if (!Vector) diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h index dc5736e..fc681de 100644 --- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h +++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h @@ -65,9 +65,6 @@ public: bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) const override; - bool areInlineCompatible(const Function *Caller, - const Function *Callee) const override; - /// @} /// \name Vector TTI Implementations diff --git a/llvm/lib/Target/TargetLoweringObjectFile.cpp b/llvm/lib/Target/TargetLoweringObjectFile.cpp index 9b03e85..28495e7 100644 --- a/llvm/lib/Target/TargetLoweringObjectFile.cpp +++ b/llvm/lib/Target/TargetLoweringObjectFile.cpp @@ -151,17 +151,17 @@ void TargetLoweringObjectFile::emitCGProfileMetadata(MCStreamer &Streamer, SmallVector<Module::ModuleFlagEntry, 8> ModuleFlags; M.getModuleFlagsMetadata(ModuleFlags); - MDNode *CFGProfile = nullptr; + MDNode *CGProfile = nullptr; for (const auto &MFE : ModuleFlags) { StringRef Key = MFE.Key->getString(); if (Key == "CG Profile") { - CFGProfile = cast<MDNode>(MFE.Val); + CGProfile = cast<MDNode>(MFE.Val); break; } } - if (!CFGProfile) + if (!CGProfile) return; auto GetSym = [this](const MDOperand &MDO) -> MCSymbol * { @@ -174,7 +174,7 @@ void TargetLoweringObjectFile::emitCGProfileMetadata(MCStreamer &Streamer, return TM->getSymbol(F); }; - for (const auto &Edge : CFGProfile->operands()) { + for (const auto &Edge : CGProfile->operands()) { MDNode *E = cast<MDNode>(Edge); const MCSymbol *From = GetSym(E->getOperand(0)); const MCSymbol *To = GetSym(E->getOperand(1)); diff --git a/llvm/lib/Target/VE/VEISelLowering.cpp b/llvm/lib/Target/VE/VEISelLowering.cpp index 9e8f400..2cfdc75 100644 --- a/llvm/lib/Target/VE/VEISelLowering.cpp +++ b/llvm/lib/Target/VE/VEISelLowering.cpp @@ -1649,14 +1649,11 @@ SDValue VETargetLowering::lowerDYNAMIC_STACKALLOC(SDValue Op, // Prepare arguments TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; - Entry.Node = Size; - Entry.Ty = Entry.Node.getValueType().getTypeForEVT(*DAG.getContext()); - Args.push_back(Entry); + Args.emplace_back(Size, Size.getValueType().getTypeForEVT(*DAG.getContext())); if (NeedsAlign) { - Entry.Node = DAG.getConstant(~(Alignment->value() - 1ULL), DL, VT); - Entry.Ty 
= Entry.Node.getValueType().getTypeForEVT(*DAG.getContext()); - Args.push_back(Entry); + SDValue Align = DAG.getConstant(~(Alignment->value() - 1ULL), DL, VT); + Args.emplace_back(Align, + Align.getValueType().getTypeForEVT(*DAG.getContext())); } Type *RetTy = Type::getVoidTy(*DAG.getContext()); diff --git a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp index 80df4ed..45bbf12 100644 --- a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp +++ b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp @@ -220,7 +220,6 @@ static MCSymbolWasm *getOrCreateFunctionTableSymbol(MCContext &Ctx, Sym = static_cast<MCSymbolWasm *>(Ctx.getOrCreateSymbol(Name)); Sym->setFunctionTable(Is64); // The default function table is synthesized by the linker. - Sym->setUndefined(); } return Sym; } diff --git a/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp b/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp index 2a398d4..fa6086c 100644 --- a/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp +++ b/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp @@ -26,7 +26,6 @@ #include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCSymbolWasm.h" #include "llvm/MC/TargetRegistry.h" -#include "llvm/Support/Casting.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Endian.h" #include "llvm/Support/LEB128.h" diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp index ec95e86..2666342 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp @@ -912,6 +912,8 @@ bool WebAssemblyFastISel::selectCall(const Instruction *I) { if (!IsVoid) updateValueMap(Call, ResultReg); + + diagnoseDontCall(*Call); return true; } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index f9eba4b..35d5c3e 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -1320,18 +1320,21 @@ WebAssemblyTargetLowering::LowerCall(CallLoweringInfo &CLI, // signature They are necessary to match callee and caller signature for // indirect call. 
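One hunk above adds diagnoseDontCall to WebAssemblyFastISel::selectCall, so "dontcall" diagnostics are emitted even when FastISel handles the call. A hedged user-level sketch of what triggers it (this assumes Clang's error/warning function attributes, which are lowered to the dontcall-error / dontcall-warn IR attributes that diagnoseDontCall inspects):

// Calling this function should produce a compile-time diagnostic; the added
// call ensures the check also fires on the FastISel (-O0) path.
__attribute__((error("this helper must not be called on WebAssembly")))
void forbidden_helper();

void caller(bool cond) {
  if (cond)
    forbidden_helper();
}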
if (CallConv == CallingConv::Swift) { + Type *PtrTy = PointerType::getUnqual(*DAG.getContext()); if (!HasSwiftSelfArg) { NumFixedArgs++; - ISD::OutputArg Arg; - Arg.Flags.setSwiftSelf(); + ISD::ArgFlagsTy Flags; + Flags.setSwiftSelf(); + ISD::OutputArg Arg(Flags, PtrVT, EVT(PtrVT), PtrTy, 0, 0); CLI.Outs.push_back(Arg); SDValue ArgVal = DAG.getUNDEF(PtrVT); CLI.OutVals.push_back(ArgVal); } if (!HasSwiftErrorArg) { NumFixedArgs++; - ISD::OutputArg Arg; - Arg.Flags.setSwiftError(); + ISD::ArgFlagsTy Flags; + Flags.setSwiftError(); + ISD::OutputArg Arg(Flags, PtrVT, EVT(PtrVT), PtrTy, 0, 0); CLI.Outs.push_back(Arg); SDValue ArgVal = DAG.getUNDEF(PtrVT); CLI.OutVals.push_back(ArgVal); @@ -3383,8 +3386,56 @@ static SDValue TryMatchTrue(SDNode *N, EVT VecVT, SelectionDAG &DAG) { return DAG.getZExtOrTrunc(Ret, DL, N->getValueType(0)); } +/// Try to convert a i128 comparison to a v16i8 comparison before type +/// legalization splits it up into chunks +static SDValue +combineVectorSizedSetCCEquality(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, + const WebAssemblySubtarget *Subtarget) { + + SDLoc DL(N); + SDValue X = N->getOperand(0); + SDValue Y = N->getOperand(1); + EVT VT = N->getValueType(0); + EVT OpVT = X.getValueType(); + + SelectionDAG &DAG = DCI.DAG; + if (DCI.DAG.getMachineFunction().getFunction().hasFnAttribute( + Attribute::NoImplicitFloat)) + return SDValue(); + + ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); + // We're looking for an oversized integer equality comparison with SIMD + if (!OpVT.isScalarInteger() || !OpVT.isByteSized() || OpVT != MVT::i128 || + !Subtarget->hasSIMD128() || !isIntEqualitySetCC(CC)) + return SDValue(); + + // Don't perform this combine if constructing the vector will be expensive. + auto IsVectorBitCastCheap = [](SDValue X) { + X = peekThroughBitcasts(X); + return isa<ConstantSDNode>(X) || X.getOpcode() == ISD::LOAD; + }; + + if (!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) + return SDValue(); + + SDValue VecX = DAG.getBitcast(MVT::v16i8, X); + SDValue VecY = DAG.getBitcast(MVT::v16i8, Y); + SDValue Cmp = DAG.getSetCC(DL, MVT::v16i8, VecX, VecY, CC); + + SDValue Intr = + DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32, + {DAG.getConstant(CC == ISD::SETEQ ? 
Intrinsic::wasm_alltrue + : Intrinsic::wasm_anytrue, + DL, MVT::i32), + Cmp}); + + return DAG.getSetCC(DL, VT, Intr, DAG.getConstant(0, DL, MVT::i32), + ISD::SETNE); +} + static SDValue performSETCCCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI) { + TargetLowering::DAGCombinerInfo &DCI, + const WebAssemblySubtarget *Subtarget) { if (!DCI.isBeforeLegalize()) return SDValue(); @@ -3392,6 +3443,9 @@ static SDValue performSETCCCombine(SDNode *N, if (!VT.isScalarInteger()) return SDValue(); + if (SDValue V = combineVectorSizedSetCCEquality(N, DCI, Subtarget)) + return V; + SDValue LHS = N->getOperand(0); if (LHS->getOpcode() != ISD::BITCAST) return SDValue(); @@ -3571,7 +3625,7 @@ WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N, case ISD::BITCAST: return performBitcastCombine(N, DCI); case ISD::SETCC: - return performSETCCCombine(N, DCI); + return performSETCCCombine(N, DCI, Subtarget); case ISD::VECTOR_SHUFFLE: return performVECTOR_SHUFFLECombine(N, DCI); case ISD::SIGN_EXTEND: diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp index 4548a75..45b0e7d 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp @@ -533,8 +533,8 @@ struct StaticLibcallNameMap { // different libcalls. RTLIB::RuntimeLibcallsInfo RTCI(TT); for (RTLIB::Libcall LC : RTLIB::libcalls()) { - const char *NameLibcall = RTCI.getLibcallName(LC); - if (NameLibcall != nullptr && + StringRef NameLibcall = RTCI.getLibcallName(LC); + if (!NameLibcall.empty() && getRuntimeLibcallSignatures().Table[LC] != unsupported) { assert(!Map.contains(NameLibcall) && "duplicate libcall names in name map"); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp index 52e7065..08fb758 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp @@ -147,7 +147,8 @@ WebAssemblyTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { Options.AllowOverlappingLoads = true; - // TODO: Teach WebAssembly backend about load v128. + if (ST->hasSIMD128()) + Options.LoadSizes.push_back(16); Options.LoadSizes.append({8, 4, 2, 1}); Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp index 42d1271..8904867 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp @@ -113,7 +113,6 @@ MCSymbolWasm *WebAssembly::getOrCreateFunctionTableSymbol( Sym = static_cast<MCSymbolWasm *>(Ctx.getOrCreateSymbol(Name)); Sym->setFunctionTable(is64); // The default function table is synthesized by the linker. - Sym->setUndefined(); } // MVP object files can't have symtab entries for tables. 
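The combineVectorSizedSetCCEquality and enableMemCmpExpansion hunks above lean on the same SIMD idiom: compare sixteen bytes in one v128 operation and reduce the result with all_true/any_true rather than splitting the i128 into scalar chunks. A hand-written C++ illustration of that idiom using the wasm_simd128.h intrinsics (an equivalent shape, not the code the backend generates):

#include <cstring>
#include <wasm_simd128.h>

// 16-byte equality as one vector compare plus a lane reduction.
bool equal16(const void *a, const void *b) {
  v128_t va = wasm_v128_load(a);
  v128_t vb = wasm_v128_load(b);
  return wasm_i8x16_all_true(wasm_i8x16_eq(va, vb));
}

// With 16 added to LoadSizes under SIMD128, a fixed-size memcmp like this can
// expand to the same pattern instead of a chain of scalar loads.
bool equal16_memcmp(const void *a, const void *b) {
  return std::memcmp(a, b, 16) == 0;
}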
if (!(Subtarget && Subtarget->hasCallIndirectOverlong())) diff --git a/llvm/lib/Target/X86/GISel/X86CallLowering.cpp b/llvm/lib/Target/X86/GISel/X86CallLowering.cpp index d9f4405..c0b9339 100644 --- a/llvm/lib/Target/X86/GISel/X86CallLowering.cpp +++ b/llvm/lib/Target/X86/GISel/X86CallLowering.cpp @@ -69,7 +69,7 @@ public: CCValAssign::LocInfo LocInfo, const CallLowering::ArgInfo &Info, ISD::ArgFlagsTy Flags, CCState &State) override { - bool Res = AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State); + bool Res = AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, Info.Ty, State); StackSize = State.getStackSize(); static const MCPhysReg XMMArgRegs[] = {X86::XMM0, X86::XMM1, X86::XMM2, diff --git a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp index 817e88d..e2a1bbf3 100644 --- a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp +++ b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp @@ -36,11 +36,31 @@ void X86InstrPostProcess::setMemBarriers(std::unique_ptr<Instruction> &Inst, } } +void X86InstrPostProcess::useStackEngine(std::unique_ptr<Instruction> &Inst, + const MCInst &MCI) { + // TODO(boomanaiden154): We currently do not handle PUSHF/POPF because we + // have not done the necessary benchmarking to see if they are also + // optimized by the stack engine. + // TODO: We currently just remove all RSP writes from stack operations. This + // is not fully correct because we do not model sync uops which will + // delay subsequent rsp using non-stack instructions. + if (X86::isPOP(MCI.getOpcode()) || X86::isPUSH(MCI.getOpcode())) { + auto *StackRegisterDef = + llvm::find_if(Inst->getDefs(), [](const WriteState &State) { + return State.getRegisterID() == X86::RSP; + }); + assert( + StackRegisterDef != Inst->getDefs().end() && + "Expected push instruction to implicitly use stack pointer register."); + Inst->getDefs().erase(StackRegisterDef); + } +} + void X86InstrPostProcess::postProcessInstruction( std::unique_ptr<Instruction> &Inst, const MCInst &MCI) { - // Currently, we only modify certain instructions' IsALoadBarrier and - // IsAStoreBarrier flags. + // Set IsALoadBarrier and IsAStoreBarrier flags. setMemBarriers(Inst, MCI); + useStackEngine(Inst, MCI); } } // namespace mca diff --git a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h index 4a83ba8..c5459e4 100644 --- a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h +++ b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h @@ -28,6 +28,11 @@ class X86InstrPostProcess : public InstrPostProcess { /// as load and store barriers. void setMemBarriers(std::unique_ptr<Instruction> &Inst, const MCInst &MCI); + /// Called within X86InstrPostPorcess to remove some rsp read operands + /// on stack instructions to better simulate the stack engine. We currently + /// do not model features of the stack engine like sync uops. 
+ void useStackEngine(std::unique_ptr<Instruction> &Inst, const MCInst &MCI); + public: X86InstrPostProcess(const MCSubtargetInfo &STI, const MCInstrInfo &MCII) : InstrPostProcess(STI, MCII) {} diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index 990b381..3d34ea3 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -1291,7 +1291,9 @@ def ProcessorFeatures { list<SubtargetFeature> ADLAdditionalTuning = [TuningPERMFalseDeps, TuningPreferMovmskOverVTest, TuningFastImmVectorShift]; - list<SubtargetFeature> ADLTuning = !listconcat(SKLTuning, ADLAdditionalTuning); + list<SubtargetFeature> ADLRemoveTuning = [TuningPOPCNTFalseDeps]; + list<SubtargetFeature> ADLTuning = + !listremove(!listconcat(SKLTuning, ADLAdditionalTuning), ADLRemoveTuning); list<SubtargetFeature> ADLFeatures = !listconcat(TRMFeatures, ADLAdditionalFeatures); diff --git a/llvm/lib/Target/X86/X86CallingConv.h b/llvm/lib/Target/X86/X86CallingConv.h index 191e0fa..8e37f34 100644 --- a/llvm/lib/Target/X86/X86CallingConv.h +++ b/llvm/lib/Target/X86/X86CallingConv.h @@ -22,10 +22,10 @@ namespace llvm { bool RetCC_X86(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &State); + Type *OrigTy, CCState &State); bool CC_X86(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State); + ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State); } // End llvm namespace diff --git a/llvm/lib/Target/X86/X86FastISel.cpp b/llvm/lib/Target/X86/X86FastISel.cpp index 067bd43..f007886 100644 --- a/llvm/lib/Target/X86/X86FastISel.cpp +++ b/llvm/lib/Target/X86/X86FastISel.cpp @@ -3323,6 +3323,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { return false; SmallVector<MVT, 16> OutVTs; + SmallVector<Type *, 16> ArgTys; SmallVector<Register, 16> ArgRegs; // If this is a constant i1/i8/i16 argument, promote to i32 to avoid an extra @@ -3369,6 +3370,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { ArgRegs.push_back(ResultReg); OutVTs.push_back(VT); + ArgTys.push_back(Val->getType()); } // Analyze operands of the call, assigning locations to each operand. @@ -3379,7 +3381,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { if (IsWin64) CCInfo.AllocateStack(32, Align(8)); - CCInfo.AnalyzeCallOperands(OutVTs, OutFlags, CC_X86); + CCInfo.AnalyzeCallOperands(OutVTs, OutFlags, ArgTys, CC_X86); // Get a count of how many bytes are to be pushed on the stack. unsigned NumBytes = CCInfo.getAlignedCallFrameSize(); diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp index 95ed590..cba7843 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -24,6 +24,7 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/CodeGen/WinEHFuncInfo.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/EHPersonalities.h" @@ -2678,7 +2679,7 @@ StackOffset X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, // object. // We need to factor in additional offsets applied during the prologue to the // frame, base, and stack pointer depending on which is used. 
- int Offset = MFI.getObjectOffset(FI) - getOffsetOfLocalArea(); + int64_t Offset = MFI.getObjectOffset(FI) - getOffsetOfLocalArea(); const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); unsigned CSSize = X86FI->getCalleeSavedFrameSize(); uint64_t StackSize = MFI.getStackSize(); @@ -4212,6 +4213,14 @@ void X86FrameLowering::processFunctionBeforeFrameFinalized( // emitPrologue if it gets called and emits CFI. MF.setHasWinCFI(false); + MachineFrameInfo &MFI = MF.getFrameInfo(); + // If the frame is big enough that we might need to scavenge a register to + // handle huge offsets, reserve a stack slot for that now. + if (!isInt<32>(MFI.estimateStackSize(MF))) { + int FI = MFI.CreateStackObject(SlotSize, Align(SlotSize), false); + RS->addScavengingFrameIndex(FI); + } + // If we are using Windows x64 CFI, ensure that the stack is always 8 byte // aligned. The format doesn't support misaligned stack adjustments. if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index f366094..8c3380b 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -2756,8 +2756,10 @@ X86TargetLowering::getPreferredVectorAction(MVT VT) const { !Subtarget.hasBWI()) return TypeSplitVector; + // Since v8f16 is legal, widen anything over v4f16. if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 && - !Subtarget.hasF16C() && VT.getVectorElementType() == MVT::f16) + VT.getVectorNumElements() <= 4 && !Subtarget.hasF16C() && + VT.getVectorElementType() == MVT::f16) return TypeSplitVector; if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 && @@ -15419,18 +15421,18 @@ static SDValue lowerShuffleAsLanePermuteAndPermute( return SDValue(); } - // Avoid returning the same shuffle operation. For example, - // t7: v16i16 = vector_shuffle<8,9,10,11,4,5,6,7,0,1,2,3,12,13,14,15> t5, - // undef:v16i16 - if (CrossLaneMask == Mask || InLaneMask == Mask) - return SDValue(); - // Simplify CrossLaneMask based on the actual demanded elements. if (V1.hasOneUse()) for (int i = 0; i != NumElts; ++i) if (!DemandedCrossLane[i]) CrossLaneMask[i] = SM_SentinelUndef; + // Avoid returning the same shuffle operation. For example, + // t7: v16i16 = vector_shuffle<8,9,10,11,4,5,6,7,0,1,2,3,12,13,14,15> t5, + // undef:v16i16 + if (CrossLaneMask == Mask || InLaneMask == Mask) + return SDValue(); + SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask); return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT), InLaneMask); @@ -22219,9 +22221,8 @@ SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { In = DAG.getBitcast(MVT::i16, In); TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; - Entry.Node = In; - Entry.Ty = EVT(MVT::i16).getTypeForEVT(*DAG.getContext()); + TargetLowering::ArgListEntry Entry( + In, EVT(MVT::i16).getTypeForEVT(*DAG.getContext())); Entry.IsSExt = false; Entry.IsZExt = true; Args.push_back(Entry); @@ -22318,9 +22319,8 @@ SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { Chain = IsStrict ? 
Op.getOperand(0) : DAG.getEntryNode(); TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; - Entry.Node = In; - Entry.Ty = EVT(SVT).getTypeForEVT(*DAG.getContext()); + TargetLowering::ArgListEntry Entry( + In, EVT(SVT).getTypeForEVT(*DAG.getContext())); Entry.IsSExt = false; Entry.IsZExt = true; Args.push_back(Entry); @@ -30049,7 +30049,6 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons SDValue InChain = DAG.getEntryNode(); TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) { EVT ArgVT = Op->getOperand(i).getValueType(); assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 && @@ -30058,13 +30057,9 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex(); MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI); - Entry.Node = StackPtr; InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MPI, Align(16)); - Entry.Ty = PointerType::get(*DAG.getContext(), 0); - Entry.IsSExt = false; - Entry.IsZExt = false; - Args.push_back(Entry); + Args.emplace_back(StackPtr, PointerType::get(*DAG.getContext(), 0)); } SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), @@ -33087,13 +33082,7 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget, Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; - - Entry.Node = Arg; - Entry.Ty = ArgTy; - Entry.IsSExt = false; - Entry.IsZExt = false; - Args.push_back(Entry); + Args.emplace_back(Arg, ArgTy); bool isF64 = ArgVT == MVT::f64; // Only optimize x86_64 for now. i386 is a bit messy. For f32, @@ -45163,6 +45152,9 @@ bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode( case X86ISD::PCMPEQ: case X86ISD::PCMPGT: return false; + // SSE signbit extraction. + case X86ISD::MOVMSK: + return false; case ISD::INTRINSIC_WO_CHAIN: switch (Op->getConstantOperandVal(0)) { case Intrinsic::x86_sse2_pmadd_wd: diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index 547b221..3dd79b3 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -1663,8 +1663,8 @@ namespace llvm { /// instructions/intrinsics. bool lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles, - ArrayRef<unsigned> Indices, - unsigned Factor) const override; + ArrayRef<unsigned> Indices, unsigned Factor, + const APInt &GapMask) const override; /// Lower interleaved store(s) into target specific /// instructions/intrinsics. diff --git a/llvm/lib/Target/X86/X86InterleavedAccess.cpp b/llvm/lib/Target/X86/X86InterleavedAccess.cpp index 636b072..632db7e 100644 --- a/llvm/lib/Target/X86/X86InterleavedAccess.cpp +++ b/llvm/lib/Target/X86/X86InterleavedAccess.cpp @@ -802,7 +802,7 @@ bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() { // Currently, lowering is supported for 4x64 bits with Factor = 4 on AVX. 
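For context on the lowerInterleavedLoad signature change that follows (and the "4x64 bits with Factor = 4" comment above): the new GapMask parameter describes which members of an interleave group are actually loaded, and the X86 path asserts a full group (GapMask.popcount() == Factor). A hedged C++ sketch of a full factor-4 group over 64-bit lanes, the shape this code optimizes on AVX:

#include <cstdint>

struct Quad {
  uint64_t a, b, c, d; // interleaved members, stride 4, 64-bit lanes
};

// When vectorized, each group becomes one wide load plus stride-4
// shufflevectors; all four members are used, so the gap mask is all ones.
void deinterleave(const Quad *in, uint64_t *a, uint64_t *b, uint64_t *c,
                  uint64_t *d, int n) {
  for (int i = 0; i < n; ++i) {
    a[i] = in[i].a;
    b[i] = in[i].b;
    c[i] = in[i].c;
    d[i] = in[i].d;
  }
}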
bool X86TargetLowering::lowerInterleavedLoad( Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles, - ArrayRef<unsigned> Indices, unsigned Factor) const { + ArrayRef<unsigned> Indices, unsigned Factor, const APInt &GapMask) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); assert(!Shuffles.empty() && "Empty shufflevector input"); @@ -812,7 +812,7 @@ bool X86TargetLowering::lowerInterleavedLoad( auto *LI = dyn_cast<LoadInst>(Load); if (!LI) return false; - assert(!Mask && "Unexpected mask on a load"); + assert(!Mask && GapMask.popcount() == Factor && "Unexpected mask on a load"); // Create an interleaved access group. IRBuilder<> Builder(LI); diff --git a/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp b/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp index cf055cf..090060e 100644 --- a/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp +++ b/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp @@ -491,7 +491,7 @@ X86LoadValueInjectionLoadHardeningPass::getGadgetGraph( NumGadgets += GadgetCount; // Traverse CFG to build the rest of the graph - SmallSet<MachineBasicBlock *, 8> BlocksVisited; + SmallPtrSet<MachineBasicBlock *, 8> BlocksVisited; std::function<void(MachineBasicBlock *, GraphIter, unsigned)> TraverseCFG = [&](MachineBasicBlock *MBB, GraphIter GI, unsigned ParentDepth) { unsigned LoopDepth = MLI.getLoopDepth(MBB); diff --git a/llvm/lib/Target/X86/X86PreTileConfig.cpp b/llvm/lib/Target/X86/X86PreTileConfig.cpp index 3b4e531..2a1c499 100644 --- a/llvm/lib/Target/X86/X86PreTileConfig.cpp +++ b/llvm/lib/Target/X86/X86PreTileConfig.cpp @@ -100,7 +100,7 @@ struct BBInfo { class X86PreTileConfig : public MachineFunctionPass { MachineRegisterInfo *MRI = nullptr; const MachineLoopInfo *MLI = nullptr; - SmallSet<MachineInstr *, 8> DefVisited; + SmallPtrSet<MachineInstr *, 8> DefVisited; DenseMap<MachineBasicBlock *, BBInfo> BBVisitedInfo; DenseMap<MachineBasicBlock *, SmallVector<MIRef, 8>> ShapeBBs; diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp index 83b11ee..595ad32 100644 --- a/llvm/lib/Target/X86/X86RegisterInfo.cpp +++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp @@ -21,8 +21,8 @@ #include "llvm/ADT/SmallSet.h" #include "llvm/CodeGen/LiveRegMatrix.h" #include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TileShapeInfo.h" @@ -907,7 +907,7 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, int FrameIndex = MI.getOperand(FIOperandNum).getIndex(); // Determine base register and offset. - int FIOffset; + int64_t FIOffset; Register BasePtr; if (MI.isReturn()) { assert((!hasStackRealignment(MF) || @@ -958,11 +958,41 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, } if (MI.getOperand(FIOperandNum+3).isImm()) { - // Offset is a 32-bit integer. 
- int Imm = (int)(MI.getOperand(FIOperandNum + 3).getImm()); - int Offset = FIOffset + Imm; - assert((!Is64Bit || isInt<32>((long long)FIOffset + Imm)) && - "Requesting 64-bit offset in 32-bit immediate!"); + const X86InstrInfo *TII = MF.getSubtarget<X86Subtarget>().getInstrInfo(); + const DebugLoc &DL = MI.getDebugLoc(); + int64_t Imm = MI.getOperand(FIOperandNum + 3).getImm(); + int64_t Offset = FIOffset + Imm; + bool FitsIn32Bits = isInt<32>(Offset); + // If the offset will not fit in a 32-bit displacement, then for 64-bit + // targets, scavenge a register to hold it. Otherwise... + if (Is64Bit && !FitsIn32Bits) { + assert(RS && "RegisterScavenger was NULL"); + + RS->enterBasicBlockEnd(MBB); + RS->backward(std::next(II)); + + Register ScratchReg = RS->scavengeRegisterBackwards( + X86::GR64RegClass, II, /*RestoreAfter=*/false, /*SPAdj=*/0, + /*AllowSpill=*/true); + assert(ScratchReg != 0 && "scratch reg was 0"); + RS->setRegUsed(ScratchReg); + + BuildMI(MBB, II, DL, TII->get(X86::MOV64ri), ScratchReg).addImm(Offset); + + MI.getOperand(FIOperandNum + 3).setImm(0); + MI.getOperand(FIOperandNum + 2).setReg(ScratchReg); + + return false; + } + + // ... for 32-bit targets, this is a bug! + if (!Is64Bit && !FitsIn32Bits) { + MI.emitGenericError("64-bit offset calculated but target is 32-bit"); + // Trap so that the instruction verification pass does not fail if run. + BuildMI(MBB, II, DL, TII->get(X86::TRAP)); + return false; + } + if (Offset != 0 || !tryOptimizeLEAtoMOV(II)) MI.getOperand(FIOperandNum + 3).ChangeToImmediate(Offset); } else { diff --git a/llvm/lib/Target/X86/X86RegisterInfo.h b/llvm/lib/Target/X86/X86RegisterInfo.h index 19b409a..2f4c55c 100644 --- a/llvm/lib/Target/X86/X86RegisterInfo.h +++ b/llvm/lib/Target/X86/X86RegisterInfo.h @@ -13,6 +13,7 @@ #ifndef LLVM_LIB_TARGET_X86_X86REGISTERINFO_H #define LLVM_LIB_TARGET_X86_X86REGISTERINFO_H +#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #define GET_REGINFO_HEADER @@ -180,6 +181,10 @@ public: constrainRegClassToNonRex2(const TargetRegisterClass *RC) const; bool isNonRex2RegClass(const TargetRegisterClass *RC) const; + + bool requiresRegisterScavenging(const MachineFunction &MF) const override { + return true; + } }; } // End llvm namespace diff --git a/llvm/lib/Target/X86/X86SchedSkylakeClient.td b/llvm/lib/Target/X86/X86SchedSkylakeClient.td index 8cd52e2..f15a7c7 100644 --- a/llvm/lib/Target/X86/X86SchedSkylakeClient.td +++ b/llvm/lib/Target/X86/X86SchedSkylakeClient.td @@ -70,6 +70,12 @@ def SKLPortAny : ProcResGroup<[SKLPort0, SKLPort1, SKLPort2, SKLPort3, SKLPort4, let BufferSize=60; } +// Skylake can retire up to four (potentially fused) uops per cycle. 
Set the +// limit to twice that given we do not model fused uops as only taking up one +// retirement slot. I could not find any documented sources on how many +// in-flight micro-ops can be tracked. +def SKXRCU : RetireControlUnit<0, 8>; + // Integer loads are 5 cycles, so ReadAfterLd registers needn't be available until 5 // cycles after the memory operand. def : ReadAdvance<ReadAfterLd, 5>; diff --git a/llvm/lib/Target/X86/X86ScheduleAtom.td b/llvm/lib/Target/X86/X86ScheduleAtom.td index c92bc97..133c1a4 100644 --- a/llvm/lib/Target/X86/X86ScheduleAtom.td +++ b/llvm/lib/Target/X86/X86ScheduleAtom.td @@ -562,14 +562,7 @@ def AtomWrite0_1_7_4 : SchedWriteRes<[AtomPort0,AtomPort1]> { let ReleaseAtCycles = [8,8]; let NumMicroOps = 4; } -def : InstRW<[AtomWrite0_1_7_4], (instregex "CVTSI642SSrr(_Int)?")>; - -def AtomWrite0_1_8_4 : SchedWriteRes<[AtomPort0,AtomPort1]> { - let Latency = 8; - let ReleaseAtCycles = [8,8]; - let NumMicroOps = 4; -} -def : InstRW<[AtomWrite0_1_7_4], (instregex "CVTSI642SSrm(_Int)?")>; +def : InstRW<[AtomWrite0_1_7_4], (instregex "CVTSI642SSr(r|m)(_Int)?")>; def AtomWrite0_1_9 : SchedWriteRes<[AtomPort0,AtomPort1]> { let Latency = 9; diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index 90791fc..62f9527 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -161,19 +161,26 @@ std::optional<unsigned> X86TTIImpl::getCacheAssociativity( llvm_unreachable("Unknown TargetTransformInfo::CacheLevel"); } +enum ClassIDEnum { GPRClass = 0, VectorClass = 1, ScalarFPClass = 2 }; + +unsigned X86TTIImpl::getRegisterClassForType(bool Vector, Type *Ty) const { + return Vector ? VectorClass + : Ty && Ty->isFloatingPointTy() ? ScalarFPClass + : GPRClass; +} + unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const { - bool Vector = (ClassID == 1); - if (Vector && !ST->hasSSE1()) + if (ClassID == VectorClass && !ST->hasSSE1()) return 0; - if (ST->is64Bit()) { - if (Vector && ST->hasAVX512()) - return 32; - if (!Vector && ST->hasEGPR()) - return 32; - return 16; - } - return 8; + if (!ST->is64Bit()) + return 8; + + if ((ClassID == GPRClass && ST->hasEGPR()) || + (ClassID != GPRClass && ST->hasAVX512())) + return 32; + + return 16; } bool X86TTIImpl::hasConditionalLoadStoreForType(Type *Ty, bool IsStore) const { @@ -5488,9 +5495,10 @@ InstructionCost X86TTIImpl::getPointersChainCost( return BaseT::getPointersChainCost(Ptrs, Base, Info, AccessTy, CostKind); } -InstructionCost X86TTIImpl::getAddressComputationCost(Type *Ty, - ScalarEvolution *SE, - const SCEV *Ptr) const { +InstructionCost +X86TTIImpl::getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, + const SCEV *Ptr, + TTI::TargetCostKind CostKind) const { // Address computations in vectorized code with non-consecutive addresses will // likely result in more instructions compared to scalar code where the // computation can more often be merged into the index mode. The resulting @@ -5504,7 +5512,7 @@ InstructionCost X86TTIImpl::getAddressComputationCost(Type *Ty, // Even in the case of (loop invariant) stride whose value is not known at // compile time, the address computation will not incur more than one extra // ADD instruction. - if (Ty->isVectorTy() && SE && !ST->hasAVX2()) { + if (PtrTy->isVectorTy() && SE && !ST->hasAVX2()) { // TODO: AVX2 is the current cut-off because we don't have correct // interleaving costs for prior ISA's. 
if (!BaseT::isStridedAccess(Ptr)) @@ -5513,7 +5521,7 @@ InstructionCost X86TTIImpl::getAddressComputationCost(Type *Ty, return 1; } - return BaseT::getAddressComputationCost(Ty, SE, Ptr); + return BaseT::getAddressComputationCost(PtrTy, SE, Ptr, CostKind); } InstructionCost @@ -6525,8 +6533,8 @@ bool X86TTIImpl::areInlineCompatible(const Function *Caller, for (const Instruction &I : instructions(Callee)) { if (const auto *CB = dyn_cast<CallBase>(&I)) { - // Having more target features is fine for inline ASM. - if (CB->isInlineAsm()) + // Having more target features is fine for inline ASM and intrinsics. + if (CB->isInlineAsm() || CB->getIntrinsicID() != Intrinsic::not_intrinsic) continue; SmallVector<Type *, 8> Types; @@ -6542,19 +6550,9 @@ bool X86TTIImpl::areInlineCompatible(const Function *Caller, if (all_of(Types, IsSimpleTy)) continue; - if (Function *NestedCallee = CB->getCalledFunction()) { - // Assume that intrinsics are always ABI compatible. - if (NestedCallee->isIntrinsic()) - continue; - - // Do a precise compatibility check. - if (!areTypesABICompatible(Caller, NestedCallee, Types)) - return false; - } else { - // We don't know the target features of the callee, - // assume it is incompatible. + // Do a precise compatibility check. + if (!areTypesABICompatible(Caller, Callee, Types)) return false; - } } } return true; diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h index bc06c47..133b366 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -132,6 +132,7 @@ public: /// @{ unsigned getNumberOfRegisters(unsigned ClassID) const override; + unsigned getRegisterClassForType(bool Vector, Type *Ty) const override; bool hasConditionalLoadStoreForType(Type *Ty, bool IsStore) const override; TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const override; @@ -194,8 +195,9 @@ public: getPointersChainCost(ArrayRef<const Value *> Ptrs, const Value *Base, const TTI::PointersChainInfo &Info, Type *AccessTy, TTI::TargetCostKind CostKind) const override; - InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, - const SCEV *Ptr) const override; + InstructionCost + getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr, + TTI::TargetCostKind CostKind) const override; std::optional<Instruction *> instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const override; diff --git a/llvm/lib/Target/X86/X86WinEHUnwindV2.cpp b/llvm/lib/Target/X86/X86WinEHUnwindV2.cpp index e9081a4..ea8b88f 100644 --- a/llvm/lib/Target/X86/X86WinEHUnwindV2.cpp +++ b/llvm/lib/Target/X86/X86WinEHUnwindV2.cpp @@ -190,6 +190,7 @@ bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) { State = FunctionState::FinishedEpilog; break; + case X86::LEA64r: case X86::MOV64rr: case X86::ADD64ri32: if (State == FunctionState::InEpilog) { @@ -201,51 +202,56 @@ bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) { "The epilog is deallocating a stack " "allocation, but the prolog did " "not allocate one"); - if (HasStackDealloc) + if (PoppedRegCount > 0) return rejectCurrentFunctionInternalError( MF, Mode, - "The epilog is deallocating the stack " - "allocation more than once"); - if (PoppedRegCount > 0) - llvm_unreachable( - "Should have raised an error: either popping before " - "deallocating or deallocating without an allocation"); + "The epilog is deallocating a stack allocation after popping " + "registers"); HasStackDealloc = true; 
} else if (State == FunctionState::FinishedEpilog) return rejectCurrentFunctionInternalError( - MF, Mode, "Unexpected mov or add instruction after the epilog"); + MF, Mode, + "Unexpected lea, mov or add instruction after the epilog"); break; case X86::POP64r: if (State == FunctionState::InEpilog) { - // After the stack pointer has been adjusted, the epilog must - // POP each register in reverse order of the PUSHes in the prolog. - PoppedRegCount++; - if (HasStackAlloc != HasStackDealloc) - return rejectCurrentFunctionInternalError( - MF, Mode, - "Cannot pop registers before the stack " - "allocation has been deallocated"); - if (PoppedRegCount > PushedRegs.size()) - return rejectCurrentFunctionInternalError( - MF, Mode, - "The epilog is popping more registers than the prolog pushed"); - if (PushedRegs[PushedRegs.size() - PoppedRegCount] != - MI.getOperand(0).getReg()) - return rejectCurrentFunctionInternalError( - MF, Mode, - "The epilog is popping a registers in " - "a different order than the " - "prolog pushed them"); - - // Unwind v2 records the size of the epilog not from where we place - // SEH_BeginEpilogue (as that contains the instruction to adjust the - // stack pointer) but from the first POP instruction (if there is - // one). - if (!UnwindV2StartLocation) { - assert(PoppedRegCount == 1); - UnwindV2StartLocation = &MI; + Register Reg = MI.getOperand(0).getReg(); + if (HasStackAlloc && (PoppedRegCount == 0) && + !llvm::is_contained(PushedRegs, Reg)) { + // If this is a pop that doesn't correspond to the set of pushed + // registers, then assume it was used to adjust the stack pointer. + HasStackDealloc = true; + } else { + // After the stack pointer has been adjusted, the epilog must + // POP each register in reverse order of the PUSHes in the prolog. + PoppedRegCount++; + if (HasStackAlloc != HasStackDealloc) + return rejectCurrentFunctionInternalError( + MF, Mode, + "Cannot pop registers before the stack " + "allocation has been deallocated"); + if (PoppedRegCount > PushedRegs.size()) + return rejectCurrentFunctionInternalError( + MF, Mode, + "The epilog is popping more registers than the prolog " + "pushed"); + if (PushedRegs[PushedRegs.size() - PoppedRegCount] != Reg.id()) + return rejectCurrentFunctionInternalError( + MF, Mode, + "The epilog is popping a registers in " + "a different order than the " + "prolog pushed them"); + + // Unwind v2 records the size of the epilog not from where we place + // SEH_BeginEpilogue (as that contains the instruction to adjust the + // stack pointer) but from the first POP instruction (if there is + // one). + if (!UnwindV2StartLocation) { + assert(PoppedRegCount == 1); + UnwindV2StartLocation = &MI; + } } } else if (State == FunctionState::FinishedEpilog) // Unexpected instruction after the epilog. diff --git a/llvm/lib/Target/XCore/XCoreISelLowering.cpp b/llvm/lib/Target/XCore/XCoreISelLowering.cpp index ef4cfcd..0a96ab2 100644 --- a/llvm/lib/Target/XCore/XCoreISelLowering.cpp +++ b/llvm/lib/Target/XCore/XCoreISelLowering.cpp @@ -429,11 +429,7 @@ SDValue XCoreTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { // Lower to a call to __misaligned_load(BasePtr). 
Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(Context); TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; - - Entry.Ty = IntPtrTy; - Entry.Node = BasePtr; - Args.push_back(Entry); + Args.emplace_back(BasePtr, IntPtrTy); TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(DL).setChain(Chain).setLibCallee( @@ -480,14 +476,8 @@ SDValue XCoreTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { // Lower to a call to __misaligned_store(BasePtr, Value). Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(Context); TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; - - Entry.Ty = IntPtrTy; - Entry.Node = BasePtr; - Args.push_back(Entry); - - Entry.Node = Value; - Args.push_back(Entry); + Args.emplace_back(BasePtr, IntPtrTy); + Args.emplace_back(Value, IntPtrTy); TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl).setChain(Chain).setCallee( diff --git a/llvm/lib/Target/XCore/XCoreSelectionDAGInfo.cpp b/llvm/lib/Target/XCore/XCoreSelectionDAGInfo.cpp index 1bd92a2..f61115e 100644 --- a/llvm/lib/Target/XCore/XCoreSelectionDAGInfo.cpp +++ b/llvm/lib/Target/XCore/XCoreSelectionDAGInfo.cpp @@ -33,11 +33,10 @@ SDValue XCoreSelectionDAGInfo::EmitTargetCodeForMemcpy( DAG.MaskedValueIsZero(Size, APInt(SizeBitWidth, 3))) { const TargetLowering &TLI = *DAG.getSubtarget().getTargetLowering(); TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; - Entry.Ty = DAG.getDataLayout().getIntPtrType(*DAG.getContext()); - Entry.Node = Dst; Args.push_back(Entry); - Entry.Node = Src; Args.push_back(Entry); - Entry.Node = Size; Args.push_back(Entry); + Type *ArgTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext()); + Args.emplace_back(Dst, ArgTy); + Args.emplace_back(Src, ArgTy); + Args.emplace_back(Size, ArgTy); const char *MemcpyAlign4Name = TLI.getLibcallName(RTLIB::MEMCPY_ALIGN_4); CallingConv::ID CC = TLI.getLibcallCallingConv(RTLIB::MEMCPY_ALIGN_4); diff --git a/llvm/lib/Target/Xtensa/Xtensa.td b/llvm/lib/Target/Xtensa/Xtensa.td index 2c4bacb..4ef885e1 100644 --- a/llvm/lib/Target/Xtensa/Xtensa.td +++ b/llvm/lib/Target/Xtensa/Xtensa.td @@ -23,10 +23,8 @@ include "XtensaFeatures.td" //===----------------------------------------------------------------------===// // Xtensa supported processors. 
//===----------------------------------------------------------------------===// -class Proc<string Name, list<SubtargetFeature> Features> - : Processor<Name, NoItineraries, Features>; -def : Proc<"generic", []>; +include "XtensaProcessors.td" //===----------------------------------------------------------------------===// // Register File Description diff --git a/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp b/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp index 6a07bd8..f136703 100644 --- a/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp +++ b/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp @@ -353,7 +353,8 @@ static const MCPhysReg IntRegs[] = {Xtensa::A2, Xtensa::A3, Xtensa::A4, static bool CC_Xtensa_Custom(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State) { + ISD::ArgFlagsTy ArgFlags, Type *OrigTy, + CCState &State) { if (ArgFlags.isByVal()) { Align ByValAlign = ArgFlags.getNonZeroByValAlign(); unsigned ByValSize = ArgFlags.getByValSize(); diff --git a/llvm/lib/Target/Xtensa/XtensaProcessors.td b/llvm/lib/Target/Xtensa/XtensaProcessors.td new file mode 100644 index 0000000..0faf07d --- /dev/null +++ b/llvm/lib/Target/Xtensa/XtensaProcessors.td @@ -0,0 +1,27 @@ +//===- XtensaProcessors.td - Xtensa Processors -------------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Xtensa supported processors. +//===----------------------------------------------------------------------===// +class Proc<string Name, list<SubtargetFeature> Features> + : Processor<Name, NoItineraries, Features>; + +def : Proc<"generic", []>; + +def : Proc<"esp32", [FeatureDensity, FeatureSingleFloat, FeatureLoop, FeatureMAC16, FeatureWindowed, FeatureBoolean, FeatureSEXT, + FeatureNSA, FeatureMul16, FeatureMul32, FeatureMul32High, FeatureDFPAccel, FeatureS32C1I, FeatureTHREADPTR, FeatureDiv32, + FeatureDebug, FeatureException, FeatureHighPriInterrupts, FeatureHighPriInterruptsLevel7, FeatureCoprocessor, + FeatureInterrupt, FeatureDataCache, FeatureRelocatableVector, FeatureTimers3, FeaturePRID, FeatureRegionProtection, FeatureMiscSR, + FeatureMINMAX, FeatureCLAMPS]>; + +def : Proc<"esp8266", [FeatureDensity, FeatureNSA, FeatureMul16, FeatureMul32, FeatureExtendedL32R, FeatureDebug, FeatureException, + FeatureHighPriInterrupts, FeatureHighPriInterruptsLevel3, FeatureInterrupt, FeatureRelocatableVector, FeatureTimers1, + FeatureRegionProtection, FeaturePRID]>; diff --git a/llvm/lib/TargetParser/CMakeLists.txt b/llvm/lib/TargetParser/CMakeLists.txt index 8f8b3a5..62e97bf 100644 --- a/llvm/lib/TargetParser/CMakeLists.txt +++ b/llvm/lib/TargetParser/CMakeLists.txt @@ -27,6 +27,7 @@ add_llvm_component_library(LLVMTargetParser TargetParser.cpp Triple.cpp X86TargetParser.cpp + XtensaTargetParser.cpp ADDITIONAL_HEADER_DIRS Unix diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp index 22192e1f..2482753 100644 --- a/llvm/lib/TargetParser/Host.cpp +++ b/llvm/lib/TargetParser/Host.cpp @@ -759,20 +759,20 @@ static StringRef getIntelProcessorTypeAndSubtype(unsigned Family, StringRef CPU; switch (Family) { - case 3: + case 0x3: CPU = "i386"; break; - case 4: + case 
0x4: CPU = "i486"; break; - case 5: + case 0x5: if (testFeature(X86::FEATURE_MMX)) { CPU = "pentium-mmx"; break; } CPU = "pentium"; break; - case 6: + case 0x6: switch (Model) { case 0x0f: // Intel Core 2 Duo processor, Intel Core 2 Duo mobile // processor, Intel Core 2 Quad processor, Intel Core 2 Quad @@ -1120,7 +1120,7 @@ static StringRef getIntelProcessorTypeAndSubtype(unsigned Family, break; } break; - case 15: { + case 0xf: { if (testFeature(X86::FEATURE_64BIT)) { CPU = "nocona"; break; @@ -1132,7 +1132,7 @@ static StringRef getIntelProcessorTypeAndSubtype(unsigned Family, CPU = "pentium4"; break; } - case 19: + case 0x13: switch (Model) { // Diamond Rapids: case 0x01: diff --git a/llvm/lib/TargetParser/XtensaTargetParser.cpp b/llvm/lib/TargetParser/XtensaTargetParser.cpp new file mode 100644 index 0000000..25725f2 --- /dev/null +++ b/llvm/lib/TargetParser/XtensaTargetParser.cpp @@ -0,0 +1,93 @@ +//==-- XtensaTargetParser - Parser for Xtensa features ------------*- C++ -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a target parser to recognise Xtensa hardware features +// +//===----------------------------------------------------------------------===// + +#include "llvm/TargetParser/XtensaTargetParser.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringSwitch.h" + +namespace llvm { + +namespace Xtensa { +struct CPUInfo { + StringLiteral Name; + CPUKind Kind; + uint64_t Features; +}; + +struct FeatureName { + uint64_t ID; + const char *NameCStr; + size_t NameLength; + + StringRef getName() const { return StringRef(NameCStr, NameLength); } +}; + +const FeatureName XtensaFeatureNames[] = { +#define XTENSA_FEATURE(ID, NAME) {ID, "+" NAME, sizeof(NAME)}, +#include "llvm/TargetParser/XtensaTargetParser.def" +}; + +constexpr CPUInfo XtensaCPUInfo[] = { +#define XTENSA_CPU(ENUM, NAME, FEATURES) {NAME, CK_##ENUM, FEATURES}, +#include "llvm/TargetParser/XtensaTargetParser.def" +}; + +StringRef getBaseName(StringRef CPU) { + return llvm::StringSwitch<StringRef>(CPU) +#define XTENSA_CPU_ALIAS(NAME, ANAME) .Case(ANAME, NAME) +#include "llvm/TargetParser/XtensaTargetParser.def" + .Default(CPU); +} + +StringRef getAliasName(StringRef CPU) { + return llvm::StringSwitch<StringRef>(CPU) +#define XTENSA_CPU_ALIAS(NAME, ANAME) .Case(NAME, ANAME) +#include "llvm/TargetParser/XtensaTargetParser.def" + .Default(CPU); +} + +CPUKind parseCPUKind(StringRef CPU) { + CPU = getBaseName(CPU); + return llvm::StringSwitch<CPUKind>(CPU) +#define XTENSA_CPU(ENUM, NAME, FEATURES) .Case(NAME, CK_##ENUM) +#include "llvm/TargetParser/XtensaTargetParser.def" + .Default(CK_INVALID); +} + +// Get all features for the CPU +void getCPUFeatures(StringRef CPU, std::vector<StringRef> &Features) { + CPU = getBaseName(CPU); + auto I = llvm::find_if(XtensaCPUInfo, + [&](const CPUInfo &CI) { return CI.Name == CPU; }); + assert(I != std::end(XtensaCPUInfo) && "CPU not found!"); + uint64_t Bits = I->Features; + + for (const auto &F : XtensaFeatureNames) { + if ((Bits & F.ID) == F.ID) + Features.push_back(F.getName()); + } +} + +// Find all valid CPUs +void fillValidCPUList(std::vector<StringRef> &Values) { + for (const auto &C : XtensaCPUInfo) { + if (C.Kind != CK_INVALID) { + Values.emplace_back(C.Name); + StringRef Name = getAliasName(C.Name); + 
if (Name != C.Name) + Values.emplace_back(Name); + } + } +} + +} // namespace Xtensa +} // namespace llvm diff --git a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp index 3320508..b775c43 100644 --- a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp @@ -1821,7 +1821,7 @@ static void sinkLifetimeStartMarkers(Function &F, coro::Shape &Shape, // only used outside the region. if (Valid && Lifetimes.size() != 0) { auto *NewLifetime = Lifetimes[0]->clone(); - NewLifetime->replaceUsesOfWith(NewLifetime->getOperand(1), AI); + NewLifetime->replaceUsesOfWith(NewLifetime->getOperand(0), AI); NewLifetime->insertBefore(DomBB->getTerminator()->getIterator()); // All the outsided lifetime.start markers are no longer necessary. diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp index ab906f9..180ac9c 100644 --- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp @@ -2252,6 +2252,10 @@ PreservedAnalyses CoroSplitPass::run(LazyCallGraph::SCC &C, UR.CWorklist.insert(CurrentSCC); for (Function *Clone : Clones) UR.CWorklist.insert(CG.lookupSCC(CG.get(*Clone))); + } else if (Shape.ABI == coro::ABI::Async) { + // Reprocess the function to inline the tail called return function of + // coro.async.end. + UR.CWorklist.insert(&C); } } diff --git a/llvm/lib/Transforms/Coroutines/SpillUtils.cpp b/llvm/lib/Transforms/Coroutines/SpillUtils.cpp index 4e71768..d5d60a3 100644 --- a/llvm/lib/Transforms/Coroutines/SpillUtils.cpp +++ b/llvm/lib/Transforms/Coroutines/SpillUtils.cpp @@ -264,11 +264,6 @@ struct AllocaUseVisitor : PtrUseVisitor<AllocaUseVisitor> { } void visitIntrinsicInst(IntrinsicInst &II) { - // When we found the lifetime markers refers to a - // subrange of the original alloca, ignore the lifetime - // markers to avoid misleading the analysis. - if (!IsOffsetKnown || !Offset.isZero()) - return Base::visitIntrinsicInst(II); switch (II.getIntrinsicID()) { default: return Base::visitIntrinsicInst(II); diff --git a/llvm/lib/Transforms/IPO/ExpandVariadics.cpp b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp index da60f52..042578d 100644 --- a/llvm/lib/Transforms/IPO/ExpandVariadics.cpp +++ b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp @@ -53,7 +53,6 @@ #include "llvm/Transforms/IPO/ExpandVariadics.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/IR/Constants.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" @@ -226,13 +225,6 @@ public: /*IsVarArgs=*/false); } - static ConstantInt *sizeOfAlloca(LLVMContext &Ctx, const DataLayout &DL, - AllocaInst *Alloced) { - std::optional<TypeSize> AllocaTypeSize = Alloced->getAllocationSize(DL); - uint64_t AsInt = AllocaTypeSize ? 
AllocaTypeSize->getFixedValue() : 0; - return ConstantInt::get(Type::getInt64Ty(Ctx), AsInt); - } - bool expansionApplicableToFunction(Module &M, Function *F) { if (F->isIntrinsic() || !F->isVarArg() || F->hasFnAttribute(Attribute::Naked)) @@ -577,8 +569,7 @@ ExpandVariadics::defineVariadicWrapper(Module &M, IRBuilder<> &Builder, AllocaInst *VaListInstance = Builder.CreateAlloca(VaListTy, nullptr, "va_start"); - Builder.CreateLifetimeStart(VaListInstance, - sizeOfAlloca(Ctx, DL, VaListInstance)); + Builder.CreateLifetimeStart(VaListInstance); Builder.CreateIntrinsic(Intrinsic::vastart, {DL.getAllocaPtrType(Ctx)}, {VaListInstance}); @@ -595,8 +586,7 @@ ExpandVariadics::defineVariadicWrapper(Module &M, IRBuilder<> &Builder, Builder.CreateIntrinsic(Intrinsic::vaend, {DL.getAllocaPtrType(Ctx)}, {VaListInstance}); - Builder.CreateLifetimeEnd(VaListInstance, - sizeOfAlloca(Ctx, DL, VaListInstance)); + Builder.CreateLifetimeEnd(VaListInstance); if (Result->getType()->isVoidTy()) Builder.CreateRetVoid(); @@ -746,7 +736,7 @@ bool ExpandVariadics::expandCall(Module &M, IRBuilder<> &Builder, CallBase *CB, // Initialize the fields in the struct Builder.SetInsertPoint(CB); - Builder.CreateLifetimeStart(Alloced, sizeOfAlloca(Ctx, DL, Alloced)); + Builder.CreateLifetimeStart(Alloced); Frame.initializeStructAlloca(DL, Builder, Alloced); const unsigned NumArgs = FuncType->getNumParams(); @@ -762,7 +752,7 @@ bool ExpandVariadics::expandCall(Module &M, IRBuilder<> &Builder, CallBase *CB, Builder.SetCurrentDebugLocation(CB->getStableDebugLoc()); VaList = Builder.CreateAlloca(VaListTy, nullptr, "va_argument"); Builder.SetInsertPoint(CB); - Builder.CreateLifetimeStart(VaList, sizeOfAlloca(Ctx, DL, VaList)); + Builder.CreateLifetimeStart(VaList); } Builder.SetInsertPoint(CB); Args.push_back(ABI->initializeVaList(M, Ctx, Builder, VaList, Alloced)); @@ -802,9 +792,9 @@ bool ExpandVariadics::expandCall(Module &M, IRBuilder<> &Builder, CallBase *CB, } if (VaList) - Builder.CreateLifetimeEnd(VaList, sizeOfAlloca(Ctx, DL, VaList)); + Builder.CreateLifetimeEnd(VaList); - Builder.CreateLifetimeEnd(Alloced, sizeOfAlloca(Ctx, DL, Alloced)); + Builder.CreateLifetimeEnd(Alloced); NewCB->setAttributes(PAL); NewCB->takeName(CB); diff --git a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp index 8262c8c..44394f6 100644 --- a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp +++ b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp @@ -273,7 +273,7 @@ MemoryEffects llvm::computeFunctionBodyMemoryAccess(Function &F, /// Deduce readonly/readnone/writeonly attributes for the SCC. template <typename AARGetterT> static void addMemoryAttrs(const SCCNodeSet &SCCNodes, AARGetterT &&AARGetter, - SmallSet<Function *, 8> &Changed) { + SmallPtrSet<Function *, 8> &Changed) { MemoryEffects ME = MemoryEffects::none(); MemoryEffects RecursiveArgME = MemoryEffects::none(); for (Function *F : SCCNodes) { @@ -1002,7 +1002,7 @@ determinePointerAccessAttrs(Argument *A, /// Deduce returned attributes for the SCC. static void addArgumentReturnedAttrs(const SCCNodeSet &SCCNodes, - SmallSet<Function *, 8> &Changed) { + SmallPtrSet<Function *, 8> &Changed) { // Check each function in turn, determining if an argument is always returned. for (Function *F : SCCNodes) { // We can infer and propagate function attributes only when we know that the @@ -1238,7 +1238,7 @@ static bool inferInitializes(Argument &A, Function &F) { /// Deduce nocapture attributes for the SCC. 
static void addArgumentAttrs(const SCCNodeSet &SCCNodes, - SmallSet<Function *, 8> &Changed, + SmallPtrSet<Function *, 8> &Changed, bool SkipInitializes) { ArgumentGraph AG; @@ -1510,7 +1510,7 @@ static bool isFunctionMallocLike(Function *F, const SCCNodeSet &SCCNodes) { /// Deduce noalias attributes for the SCC. static void addNoAliasAttrs(const SCCNodeSet &SCCNodes, - SmallSet<Function *, 8> &Changed) { + SmallPtrSet<Function *, 8> &Changed) { // Check each function in turn, determining which functions return noalias // pointers. for (Function *F : SCCNodes) { @@ -1623,7 +1623,7 @@ static bool isReturnNonNull(Function *F, const SCCNodeSet &SCCNodes, /// Deduce nonnull attributes for the SCC. static void addNonNullAttrs(const SCCNodeSet &SCCNodes, - SmallSet<Function *, 8> &Changed) { + SmallPtrSet<Function *, 8> &Changed) { // Speculative that all functions in the SCC return only nonnull // pointers. We may refute this as we analyze functions. bool SCCReturnsNonNull = true; @@ -1680,7 +1680,7 @@ static void addNonNullAttrs(const SCCNodeSet &SCCNodes, /// Deduce noundef attributes for the SCC. static void addNoUndefAttrs(const SCCNodeSet &SCCNodes, - SmallSet<Function *, 8> &Changed) { + SmallPtrSet<Function *, 8> &Changed) { // Check each function in turn, determining which functions return noundef // values. for (Function *F : SCCNodes) { @@ -1788,13 +1788,13 @@ public: InferenceDescriptors.push_back(AttrInference); } - void run(const SCCNodeSet &SCCNodes, SmallSet<Function *, 8> &Changed); + void run(const SCCNodeSet &SCCNodes, SmallPtrSet<Function *, 8> &Changed); }; /// Perform all the requested attribute inference actions according to the /// attribute predicates stored before. void AttributeInferer::run(const SCCNodeSet &SCCNodes, - SmallSet<Function *, 8> &Changed) { + SmallPtrSet<Function *, 8> &Changed) { SmallVector<InferenceDescriptor, 4> InferInSCC = InferenceDescriptors; // Go through all the functions in SCC and check corresponding attribute // assumptions for each of them. Attributes that are invalid for this SCC @@ -1969,7 +1969,7 @@ static bool InstrBreaksNoSync(Instruction &I, const SCCNodeSet &SCCNodes) { /// /// Returns true if any changes to function attributes were made. static void inferConvergent(const SCCNodeSet &SCCNodes, - SmallSet<Function *, 8> &Changed) { + SmallPtrSet<Function *, 8> &Changed) { AttributeInferer AI; // Request to remove the convergent attribute from all functions in the SCC @@ -2000,7 +2000,7 @@ static void inferConvergent(const SCCNodeSet &SCCNodes, /// /// Returns true if any changes to function attributes were made. static void inferAttrsFromFunctionBodies(const SCCNodeSet &SCCNodes, - SmallSet<Function *, 8> &Changed) { + SmallPtrSet<Function *, 8> &Changed) { AttributeInferer AI; if (!DisableNoUnwindInference) @@ -2069,7 +2069,7 @@ static void inferAttrsFromFunctionBodies(const SCCNodeSet &SCCNodes, } static void addNoRecurseAttrs(const SCCNodeSet &SCCNodes, - SmallSet<Function *, 8> &Changed) { + SmallPtrSet<Function *, 8> &Changed) { // Try and identify functions that do not recurse. // If the SCC contains multiple nodes we know for sure there is recursion. @@ -2105,7 +2105,7 @@ static void addNoRecurseAttrs(const SCCNodeSet &SCCNodes, // Set the noreturn function attribute if possible. 
static void addNoReturnAttrs(const SCCNodeSet &SCCNodes, - SmallSet<Function *, 8> &Changed) { + SmallPtrSet<Function *, 8> &Changed) { for (Function *F : SCCNodes) { if (!F || !F->hasExactDefinition() || F->hasFnAttribute(Attribute::Naked) || F->doesNotReturn()) @@ -2166,7 +2166,7 @@ static bool allPathsGoThroughCold(Function &F) { // Set the cold function attribute if possible. static void addColdAttrs(const SCCNodeSet &SCCNodes, - SmallSet<Function *, 8> &Changed) { + SmallPtrSet<Function *, 8> &Changed) { for (Function *F : SCCNodes) { if (!F || !F->hasExactDefinition() || F->hasFnAttribute(Attribute::Naked) || F->hasFnAttribute(Attribute::Cold) || F->hasFnAttribute(Attribute::Hot)) @@ -2213,7 +2213,7 @@ static bool functionWillReturn(const Function &F) { // Set the willreturn function attribute if possible. static void addWillReturn(const SCCNodeSet &SCCNodes, - SmallSet<Function *, 8> &Changed) { + SmallPtrSet<Function *, 8> &Changed) { for (Function *F : SCCNodes) { if (!F || F->willReturn() || !functionWillReturn(*F)) continue; @@ -2239,7 +2239,7 @@ static SCCNodesResult createSCCNodeSet(ArrayRef<Function *> Functions) { } template <typename AARGetterT> -static SmallSet<Function *, 8> +static SmallPtrSet<Function *, 8> deriveAttrsInPostOrder(ArrayRef<Function *> Functions, AARGetterT &&AARGetter, bool ArgAttrsOnly) { SCCNodesResult Nodes = createSCCNodeSet(Functions); @@ -2248,7 +2248,7 @@ deriveAttrsInPostOrder(ArrayRef<Function *> Functions, AARGetterT &&AARGetter, if (Nodes.SCCNodes.empty()) return {}; - SmallSet<Function *, 8> Changed; + SmallPtrSet<Function *, 8> Changed; if (ArgAttrsOnly) { // ArgAttrsOnly means to only infer attributes that may aid optimizations // on the *current* function. "initializes" attribute is to aid diff --git a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp index 45fa9d5..9196a01 100644 --- a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp +++ b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp @@ -16,7 +16,6 @@ #include "llvm/Analysis/ValueLattice.h" #include "llvm/Analysis/ValueLatticeUtils.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/IR/IntrinsicInst.h" #include "llvm/Transforms/Scalar/SCCP.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/SCCPSolver.h" @@ -400,12 +399,6 @@ Constant *InstCostVisitor::visitFreezeInst(FreezeInst &I) { Constant *InstCostVisitor::visitCallBase(CallBase &I) { assert(LastVisited != KnownConstants.end() && "Invalid iterator!"); - // Look through calls to ssa_copy intrinsics. - if (auto *II = dyn_cast<IntrinsicInst>(&I); - II && II->getIntrinsicID() == Intrinsic::ssa_copy) { - return LastVisited->second; - } - Function *F = I.getCalledFunction(); if (!F || !canConstantFoldCallTo(&I, F)) return nullptr; @@ -611,17 +604,15 @@ void FunctionSpecializer::promoteConstantStackValues(Function *F) { } } -// ssa_copy intrinsics are introduced by the SCCP solver. These intrinsics -// interfere with the promoteConstantStackValues() optimization. +// The SCCP solver inserts bitcasts for PredicateInfo. These interfere with the +// promoteConstantStackValues() optimization. 
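As an illustrative sketch of the no-op bitcasts this comment refers to (hypothetical IR, names invented, not taken from the patch or its tests): the solver's PredicateInfo copies take the form
  %x.copy = bitcast i32 %x to i32    ; identity copy inserted for PredicateInfo
  %cmp    = icmp sgt i32 %x.copy, 0
and removeSSACopy below replaces every use of %x.copy with %x and erases the bitcast, so the stack-value promotion that follows sees the underlying value directly.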
static void removeSSACopy(Function &F) { for (BasicBlock &BB : F) { for (Instruction &Inst : llvm::make_early_inc_range(BB)) { - auto *II = dyn_cast<IntrinsicInst>(&Inst); - if (!II) - continue; - if (II->getIntrinsicID() != Intrinsic::ssa_copy) + auto *BC = dyn_cast<BitCastInst>(&Inst); + if (!BC || BC->getType() != BC->getOperand(0)->getType()) continue; - Inst.replaceAllUsesWith(II->getOperand(0)); + Inst.replaceAllUsesWith(BC->getOperand(0)); Inst.eraseFromParent(); } } diff --git a/llvm/lib/Transforms/IPO/GlobalDCE.cpp b/llvm/lib/Transforms/IPO/GlobalDCE.cpp index 45fb1f5..c576fbc 100644 --- a/llvm/lib/Transforms/IPO/GlobalDCE.cpp +++ b/llvm/lib/Transforms/IPO/GlobalDCE.cpp @@ -21,6 +21,8 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/Utils/CtorUtils.h" @@ -30,6 +32,35 @@ using namespace llvm; #define DEBUG_TYPE "globaldce" +namespace { +class GlobalDCELegacyPass : public ModulePass { +public: + static char ID; // Pass identification, replacement for typeid + GlobalDCELegacyPass() : ModulePass(ID) { + initializeGlobalDCELegacyPassPass(*PassRegistry::getPassRegistry()); + } + bool runOnModule(Module &M) override { + if (skipModule(M)) + return false; + // Note: GlobalDCEPass does not use any analyses, so we're safe to call the + // new-pm style pass with a default-initialized analysis manager here + ModuleAnalysisManager MAM; + auto PA = Impl.run(M, MAM); + return !PA.areAllPreserved(); + } + +private: + GlobalDCEPass Impl; +}; +} // namespace + +char GlobalDCELegacyPass::ID = 0; +INITIALIZE_PASS(GlobalDCELegacyPass, "globaldce", "Dead Global Elimination", + false, false) + +// Public interface to the GlobalDCEPass. +ModulePass *llvm::createGlobalDCEPass() { return new GlobalDCELegacyPass(); } + static cl::opt<bool> ClEnableVFE("enable-vfe", cl::Hidden, cl::init(true), cl::desc("Enable virtual function elimination")); diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp index bdda498..d7edd12 100644 --- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -1133,9 +1133,6 @@ static bool optimizeOnceStoredGlobal(GlobalVariable *GV, Value *StoredOnceVal, const DataLayout &DL, function_ref<TargetLibraryInfo &(Function &)> GetTLI) { - // Ignore no-op GEPs and bitcasts. - StoredOnceVal = StoredOnceVal->stripPointerCasts(); - // If we are dealing with a pointer global that is initialized to null and // only has one (non-null) value stored into it, then we can optimize any // users of the loaded value (often calls and loads) that would trap if the diff --git a/llvm/lib/Transforms/IPO/Inliner.cpp b/llvm/lib/Transforms/IPO/Inliner.cpp index 6554377..88f5ca0 100644 --- a/llvm/lib/Transforms/IPO/Inliner.cpp +++ b/llvm/lib/Transforms/IPO/Inliner.cpp @@ -459,6 +459,9 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, }), Calls.end()); + // Report inlining decision BEFORE deleting function contents, so we + // can still access e.g. the DebugLoc + Advice->recordInliningWithCalleeDeleted(); // Clear the body and queue the function itself for call graph // updating when we finish inlining. 
makeFunctionBodyUnreachable(Callee); @@ -470,9 +473,7 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, DeadFunctionsInComdats.push_back(&Callee); } } - if (CalleeWasDeleted) - Advice->recordInliningWithCalleeDeleted(); - else + if (!CalleeWasDeleted) Advice->recordInlining(); } diff --git a/llvm/lib/Transforms/IPO/ModuleInliner.cpp b/llvm/lib/Transforms/IPO/ModuleInliner.cpp index 844e275..1185e63 100644 --- a/llvm/lib/Transforms/IPO/ModuleInliner.cpp +++ b/llvm/lib/Transforms/IPO/ModuleInliner.cpp @@ -284,6 +284,10 @@ PreservedAnalyses ModuleInlinerPass::run(Module &M, Calls->erase_if([&](const std::pair<CallBase *, int> &Call) { return Call.first->getCaller() == &Callee; }); + + // Report inlining decision BEFORE deleting function contents, so we + // can still access e.g. the DebugLoc + Advice->recordInliningWithCalleeDeleted(); // Clear the body and queue the function itself for deletion when we // finish inlining. // Note that after this point, it is an error to do anything other @@ -295,9 +299,7 @@ PreservedAnalyses ModuleInlinerPass::run(Module &M, CalleeWasDeleted = true; } } - if (CalleeWasDeleted) - Advice->recordInliningWithCalleeDeleted(); - else + if (!CalleeWasDeleted) Advice->recordInlining(); } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index d7971e8..6e46898 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -3740,6 +3740,82 @@ static Instruction *foldIntegerPackFromVector(Instruction &I, return CastInst::Create(Instruction::BitCast, MaskedVec, I.getType()); } +/// Match \p V as "lshr -> mask -> zext -> shl". +/// +/// \p Int is the underlying integer being extracted from. +/// \p Mask is a bitmask identifying which bits of the integer are being +/// extracted. \p Offset identifies which bit of the result \p V corresponds to +/// the least significant bit of \p Int +static bool matchZExtedSubInteger(Value *V, Value *&Int, APInt &Mask, + uint64_t &Offset, bool &IsShlNUW, + bool &IsShlNSW) { + Value *ShlOp0; + uint64_t ShlAmt = 0; + if (!match(V, m_OneUse(m_Shl(m_Value(ShlOp0), m_ConstantInt(ShlAmt))))) + return false; + + IsShlNUW = cast<BinaryOperator>(V)->hasNoUnsignedWrap(); + IsShlNSW = cast<BinaryOperator>(V)->hasNoSignedWrap(); + + Value *ZExtOp0; + if (!match(ShlOp0, m_OneUse(m_ZExt(m_Value(ZExtOp0))))) + return false; + + Value *MaskedOp0; + const APInt *ShiftedMaskConst = nullptr; + if (!match(ZExtOp0, m_CombineOr(m_OneUse(m_And(m_Value(MaskedOp0), + m_APInt(ShiftedMaskConst))), + m_Value(MaskedOp0)))) + return false; + + uint64_t LShrAmt = 0; + if (!match(MaskedOp0, + m_CombineOr(m_OneUse(m_LShr(m_Value(Int), m_ConstantInt(LShrAmt))), + m_Value(Int)))) + return false; + + if (LShrAmt > ShlAmt) + return false; + Offset = ShlAmt - LShrAmt; + + Mask = ShiftedMaskConst ? ShiftedMaskConst->shl(LShrAmt) + : APInt::getBitsSetFrom( + Int->getType()->getScalarSizeInBits(), LShrAmt); + + return true; +} + +/// Try to fold the join of two scalar integers whose bits are unpacked and +/// zexted from the same source integer. 
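An illustrative sketch of the fold described above (hypothetical IR; the masks, shift amounts and names are invented, not taken from the patch or its tests). Two slices extracted from the same source with equal offsets,
  %lo      = lshr i32 %x, 8
  %lo.mask = and i32 %lo, 255
  %lo.zext = zext i32 %lo.mask to i64
  %lo.shl  = shl nuw nsw i64 %lo.zext, 16    ; offset = 16 - 8 = 8
  %hi      = lshr i32 %x, 16
  %hi.mask = and i32 %hi, 255
  %hi.zext = zext i32 %hi.mask to i64
  %hi.shl  = shl nuw nsw i64 %hi.zext, 24    ; offset = 24 - 16 = 8
  %r       = or disjoint i64 %lo.shl, %hi.shl
can be repacked by the function that follows into a single mask, zext and shift of the source:
  %x.mask  = and i32 %x, 16776960            ; 0x00FFFF00: the two masks shifted into place and merged
  %x.zext  = zext i32 %x.mask to i64
  %r       = shl nuw nsw i64 %x.zext, 8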
+static Value *foldIntegerRepackThroughZExt(Value *Lhs, Value *Rhs, + InstCombiner::BuilderTy &Builder) { + + Value *LhsInt, *RhsInt; + APInt LhsMask, RhsMask; + uint64_t LhsOffset, RhsOffset; + bool IsLhsShlNUW, IsLhsShlNSW, IsRhsShlNUW, IsRhsShlNSW; + if (!matchZExtedSubInteger(Lhs, LhsInt, LhsMask, LhsOffset, IsLhsShlNUW, + IsLhsShlNSW)) + return nullptr; + if (!matchZExtedSubInteger(Rhs, RhsInt, RhsMask, RhsOffset, IsRhsShlNUW, + IsRhsShlNSW)) + return nullptr; + if (LhsInt != RhsInt || LhsOffset != RhsOffset) + return nullptr; + + APInt Mask = LhsMask | RhsMask; + + Type *DestTy = Lhs->getType(); + Value *Res = Builder.CreateShl( + Builder.CreateZExt( + Builder.CreateAnd(LhsInt, Mask, LhsInt->getName() + ".mask"), DestTy, + LhsInt->getName() + ".zext"), + ConstantInt::get(DestTy, LhsOffset), "", IsLhsShlNUW && IsRhsShlNUW, + IsLhsShlNSW && IsRhsShlNSW); + Res->takeName(Lhs); + return Res; +} + // A decomposition of ((X & Mask) * Factor). The NUW / NSW bools // track these properities for preservation. Note that we can decompose // equivalent select form of this expression (e.g. (!(X & Mask) ? 0 : Mask * @@ -3841,6 +3917,8 @@ static Value *foldBitmaskMul(Value *Op0, Value *Op1, Value *InstCombinerImpl::foldDisjointOr(Value *LHS, Value *RHS) { if (Value *Res = foldBitmaskMul(LHS, RHS, Builder)) return Res; + if (Value *Res = foldIntegerRepackThroughZExt(LHS, RHS, Builder)) + return Res; return nullptr; } @@ -3973,7 +4051,7 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) { /*NSW=*/true, /*NUW=*/true)) return R; - if (Value *Res = foldBitmaskMul(I.getOperand(0), I.getOperand(1), Builder)) + if (Value *Res = foldDisjointOr(I.getOperand(0), I.getOperand(1))) return replaceInstUsesWith(I, Res); if (Value *Res = reassociateDisjointOr(I.getOperand(0), I.getOperand(1))) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 47e017e..2433534 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -267,12 +267,10 @@ Instruction *InstCombinerImpl::SimplifyAnyMemSet(AnyMemSetInst *MI) { MI->getContext(), APInt::getSplat(Len * 8, FillC->getValue())); StoreInst *S = Builder.CreateStore(FillVal, Dest, MI->isVolatile()); S->copyMetadata(*MI, LLVMContext::MD_DIAssignID); - auto replaceOpForAssignmentMarkers = [FillC, FillVal](auto *DbgAssign) { + for (DbgVariableRecord *DbgAssign : at::getDVRAssignmentMarkers(S)) { if (llvm::is_contained(DbgAssign->location_ops(), FillC)) DbgAssign->replaceVariableLocationOp(FillC, FillVal); - }; - for_each(at::getAssignmentMarkers(S), replaceOpForAssignmentMarkers); - for_each(at::getDVRAssignmentMarkers(S), replaceOpForAssignmentMarkers); + } S->setAlignment(Alignment); if (MI->isAtomic()) @@ -1532,6 +1530,51 @@ static Instruction *foldBitOrderCrossLogicOp(Value *V, return nullptr; } +/// Helper to match idempotent binary intrinsics, namely, intrinsics where +/// `f(f(x, y), y) == f(x, y)` holds. 
+static bool isIdempotentBinaryIntrinsic(Intrinsic::ID IID) { + switch (IID) { + case Intrinsic::smax: + case Intrinsic::smin: + case Intrinsic::umax: + case Intrinsic::umin: + case Intrinsic::maximum: + case Intrinsic::minimum: + case Intrinsic::maximumnum: + case Intrinsic::minimumnum: + case Intrinsic::maxnum: + case Intrinsic::minnum: + return true; + default: + return false; + } +} + +/// Attempt to simplify value-accumulating recurrences of kind: +/// %umax.acc = phi i8 [ %umax, %backedge ], [ %a, %entry ] +/// %umax = call i8 @llvm.umax.i8(i8 %umax.acc, i8 %b) +/// And let the idempotent binary intrinsic be hoisted, when the operands are +/// known to be loop-invariant. +static Value *foldIdempotentBinaryIntrinsicRecurrence(InstCombinerImpl &IC, + IntrinsicInst *II) { + PHINode *PN; + Value *Init, *OtherOp; + + // A binary intrinsic recurrence with loop-invariant operands is equivalent to + // `call @llvm.binary.intrinsic(Init, OtherOp)`. + auto IID = II->getIntrinsicID(); + if (!isIdempotentBinaryIntrinsic(IID) || + !matchSimpleBinaryIntrinsicRecurrence(II, PN, Init, OtherOp) || + !IC.getDominatorTree().dominates(OtherOp, PN)) + return nullptr; + + auto *InvariantBinaryInst = + IC.Builder.CreateBinaryIntrinsic(IID, Init, OtherOp); + if (isa<FPMathOperator>(InvariantBinaryInst)) + cast<Instruction>(InvariantBinaryInst)->copyFastMathFlags(II); + return InvariantBinaryInst; +} + static Value *simplifyReductionOperand(Value *Arg, bool CanReorderLanes) { if (!CanReorderLanes) return nullptr; @@ -3912,6 +3955,9 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { if (Value *Reverse = foldReversedIntrinsicOperands(II)) return replaceInstUsesWith(*II, Reverse); + if (Value *Res = foldIdempotentBinaryIntrinsicRecurrence(*this, II)) + return replaceInstUsesWith(*II, Res); + // Some intrinsics (like experimental_gc_statepoint) can be used in invoke // context, so it is handled in visitCallBase and we should trigger it. return visitCallBase(*II); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp index a43a6ee..801ac00 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -1131,11 +1131,10 @@ static bool canEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear, case Instruction::Shl: { // We can promote shl(x, cst) if we can promote x. Since shl overwrites the // upper bits we can reduce BitsToClear by the shift amount. - const APInt *Amt; - if (match(I->getOperand(1), m_APInt(Amt))) { + uint64_t ShiftAmt; + if (match(I->getOperand(1), m_ConstantInt(ShiftAmt))) { if (!canEvaluateZExtd(I->getOperand(0), Ty, BitsToClear, IC, CxtI)) return false; - uint64_t ShiftAmt = Amt->getZExtValue(); BitsToClear = ShiftAmt < BitsToClear ? BitsToClear - ShiftAmt : 0; return true; } @@ -1144,11 +1143,11 @@ static bool canEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear, case Instruction::LShr: { // We can promote lshr(x, cst) if we can promote x. This requires the // ultimate 'and' to clear out the high zero bits we're clearing out though. 
- const APInt *Amt; - if (match(I->getOperand(1), m_APInt(Amt))) { + uint64_t ShiftAmt; + if (match(I->getOperand(1), m_ConstantInt(ShiftAmt))) { if (!canEvaluateZExtd(I->getOperand(0), Ty, BitsToClear, IC, CxtI)) return false; - BitsToClear += Amt->getZExtValue(); + BitsToClear += ShiftAmt; if (BitsToClear > V->getType()->getScalarSizeInBits()) BitsToClear = V->getType()->getScalarSizeInBits(); return true; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index cf94d28..2386e7a 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -1320,6 +1320,35 @@ Instruction *InstCombinerImpl::foldICmpWithZero(ICmpInst &Cmp) { return nullptr; } +/// Fold icmp eq (num + mask) & ~mask, num +/// to +/// icmp eq (and num, mask), 0 +/// Where mask is a low bit mask. +Instruction *InstCombinerImpl::foldIsMultipleOfAPowerOfTwo(ICmpInst &Cmp) { + Value *Num; + CmpPredicate Pred; + const APInt *Mask, *Neg; + + if (!match(&Cmp, + m_c_ICmp(Pred, m_Value(Num), + m_OneUse(m_c_And(m_OneUse(m_c_Add(m_Deferred(Num), + m_LowBitMask(Mask))), + m_APInt(Neg)))))) + return nullptr; + + if (*Neg != ~*Mask) + return nullptr; + + if (!ICmpInst::isEquality(Pred)) + return nullptr; + + // Create new icmp eq (num & mask), 0 + auto *NewAnd = Builder.CreateAnd(Num, *Mask); + auto *Zero = Constant::getNullValue(Num->getType()); + + return new ICmpInst(Pred, NewAnd, Zero); +} + /// Fold icmp Pred X, C. /// TODO: This code structure does not make sense. The saturating add fold /// should be moved to some other helper and extended as noted below (it is also @@ -1521,11 +1550,11 @@ Instruction *InstCombinerImpl::foldICmpTruncConstant(ICmpInst &Cmp, // trunc iN (ShOp >> ShAmtC) to i[N - ShAmtC] < 0 --> ShOp < 0 // trunc iN (ShOp >> ShAmtC) to i[N - ShAmtC] > -1 --> ShOp > -1 Value *ShOp; - const APInt *ShAmtC; + uint64_t ShAmt; bool TrueIfSigned; if (isSignBitCheck(Pred, C, TrueIfSigned) && - match(X, m_Shr(m_Value(ShOp), m_APInt(ShAmtC))) && - DstBits == SrcBits - ShAmtC->getZExtValue()) { + match(X, m_Shr(m_Value(ShOp), m_ConstantInt(ShAmt))) && + DstBits == SrcBits - ShAmt) { return TrueIfSigned ? new ICmpInst(ICmpInst::ICMP_SLT, ShOp, ConstantInt::getNullValue(SrcTy)) : new ICmpInst(ICmpInst::ICMP_SGT, ShOp, @@ -7644,6 +7673,9 @@ Instruction *InstCombinerImpl::visitICmpInst(ICmpInst &I) { if (Instruction *Res = foldICmpUsingKnownBits(I)) return Res; + if (Instruction *Res = foldIsMultipleOfAPowerOfTwo(I)) + return Res; + // Test if the ICmpInst instruction is used exclusively by a select as // part of a minimum or maximum operation. If so, refrain from doing // any other folding. 
This helps out other analyses which understand diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h index c67e27e..2340028 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -721,6 +721,7 @@ public: Instruction *foldICmpUsingKnownBits(ICmpInst &Cmp); Instruction *foldICmpWithDominatingICmp(ICmpInst &Cmp); Instruction *foldICmpWithConstant(ICmpInst &Cmp); + Instruction *foldIsMultipleOfAPowerOfTwo(ICmpInst &Cmp); Instruction *foldICmpUsingBoolRange(ICmpInst &I); Instruction *foldICmpInstWithConstant(ICmpInst &Cmp); Instruction *foldICmpInstWithConstantNotInt(ICmpInst &Cmp); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index 0be1034..4b10586 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -737,6 +737,8 @@ static Instruction *unpackLoadToAggregate(InstCombinerImpl &IC, LoadInst &LI) { LoadInst *NewLoad = IC.combineLoadToNewType(LI, ST->getTypeAtIndex(0U), ".unpack"); NewLoad->setAAMetadata(LI.getAAMetadata()); + // Copy invariant metadata from parent load. + NewLoad->copyMetadata(LI, LLVMContext::MD_invariant_load); return IC.replaceInstUsesWith(LI, IC.Builder.CreateInsertValue( PoisonValue::get(T), NewLoad, 0, Name)); } @@ -764,6 +766,8 @@ static Instruction *unpackLoadToAggregate(InstCombinerImpl &IC, LoadInst &LI) { Name + ".unpack"); // Propagate AA metadata. It'll still be valid on the narrowed load. L->setAAMetadata(LI.getAAMetadata()); + // Copy invariant metadata from parent load. + L->copyMetadata(LI, LLVMContext::MD_invariant_load); V = IC.Builder.CreateInsertValue(V, L, i); } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp index fe0f308..b17cf17 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -3042,7 +3042,7 @@ Instruction *InstCombinerImpl::visitShuffleVectorInst(ShuffleVectorInst &SVI) { Value *V = LHS; unsigned MaskElems = Mask.size(); auto *SrcTy = cast<FixedVectorType>(V->getType()); - unsigned VecBitWidth = SrcTy->getPrimitiveSizeInBits().getFixedValue(); + unsigned VecBitWidth = DL.getTypeSizeInBits(SrcTy); unsigned SrcElemBitWidth = DL.getTypeSizeInBits(SrcTy->getElementType()); assert(SrcElemBitWidth && "vector elements must have a bitwidth"); unsigned SrcNumElems = SrcTy->getNumElements(); diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp index 8da65c5..50258af 100644 --- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -1211,23 +1211,19 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> { return; if (!II.isLifetimeStartOrEnd()) return; - // Found lifetime intrinsic, add ASan instrumentation if necessary. - auto *Size = cast<ConstantInt>(II.getArgOperand(0)); - // If size argument is undefined, don't do anything. - if (Size->isMinusOne()) return; - // Check that size doesn't saturate uint64_t and can - // be stored in IntptrTy. 
- const uint64_t SizeValue = Size->getValue().getLimitedValue(); - if (SizeValue == ~0ULL || - !ConstantInt::isValueValidForType(IntptrTy, SizeValue)) - return; // Find alloca instruction that corresponds to llvm.lifetime argument. - AllocaInst *AI = dyn_cast<AllocaInst>(II.getArgOperand(1)); + AllocaInst *AI = dyn_cast<AllocaInst>(II.getArgOperand(0)); // We're interested only in allocas we can handle. if (!AI || !ASan.isInterestingAlloca(*AI)) return; + + std::optional<TypeSize> Size = AI->getAllocationSize(AI->getDataLayout()); + // Check that size is known and can be stored in IntptrTy. + if (!Size || !ConstantInt::isValueValidForType(IntptrTy, *Size)) + return; + bool DoPoison = (ID == Intrinsic::lifetime_end); - AllocaPoisonCall APC = {&II, AI, SizeValue, DoPoison}; + AllocaPoisonCall APC = {&II, AI, *Size, DoPoison}; if (AI->isStaticAlloca()) StaticAllocaPoisonCallVec.push_back(APC); else if (ClInstrumentDynamicAllocas) diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp index bcb90d6..fc34d14 100644 --- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp @@ -1469,22 +1469,6 @@ void HWAddressSanitizer::instrumentStack(memtag::StackInfo &SInfo, size_t Size = memtag::getAllocaSizeInBytes(*AI); size_t AlignedSize = alignTo(Size, Mapping.getObjectAlignment()); - auto HandleLifetime = [&](IntrinsicInst *II) { - // Set the lifetime intrinsic to cover the whole alloca. This reduces the - // set of assumptions we need to make about the lifetime. Without this we - // would need to ensure that we can track the lifetime pointer to a - // constant offset from the alloca, and would still need to change the - // size to include the extra alignment we use for the untagging to make - // the size consistent. - // - // The check for standard lifetime below makes sure that we have exactly - // one set of start / end in any execution (i.e. the ends are not - // reachable from each other), so this will not cause any problems. - II->setArgOperand(0, ConstantInt::get(Int64Ty, AlignedSize)); - }; - llvm::for_each(Info.LifetimeStart, HandleLifetime); - llvm::for_each(Info.LifetimeEnd, HandleLifetime); - AI->replaceUsesWithIf(Replacement, [AILong](const Use &U) { auto *User = U.getUser(); return User != AILong && !isa<LifetimeIntrinsic>(User); diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 7d3c940..948e2c6 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -2690,6 +2690,54 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { SC.Done(&I); } + // Perform a bitwise OR on the horizontal pairs (or other specified grouping) + // of elements. + // + // For example, suppose we have: + // VectorA: <a1, a2, a3, a4, a5, a6> + // VectorB: <b1, b2, b3, b4, b5, b6> + // ReductionFactor: 3. + // The output would be: + // <a1|a2|a3, a4|a5|a6, b1|b2|b3, b4|b5|b6> + // + // This is convenient for instrumenting horizontal add/sub. + // For bitwise OR on "vertical" pairs, see maybeHandleSimpleNomemIntrinsic(). 
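Before the horizontalReduce definition that follows, here is a small scalar model of the shuffle-mask construction the comment above describes, using its <a1..a6>/<b1..b6>, ReductionFactor = 3 example. Plain C++ with an int vector standing in for the concatenated shadow vectors; ConcatAB and the element values are invented for the illustration:

#include <cassert>
#include <vector>

int main() {
  // Concatenation of VectorA = <a1..a6> and VectorB = <b1..b6>, with each
  // element a distinct power of two so the ORed groups are easy to eyeball.
  std::vector<unsigned> ConcatAB = {1,  2,   4,   8,   16,   32,    // a1..a6
                                    64, 128, 256, 512, 1024, 2048}; // b1..b6
  const unsigned ReductionFactor = 3;
  const unsigned TotalNumElems = static_cast<unsigned>(ConcatAB.size());

  std::vector<unsigned> Out(TotalNumElems / ReductionFactor, 0);
  for (unsigned I = 0; I < ReductionFactor; ++I) {
    // Pass I selects lanes {I, I + RF, I + 2*RF, ...}, exactly like the Mask
    // built in the function below, and ORs them into the result.
    for (unsigned X = 0, Lane = 0; X < TotalNumElems;
         X += ReductionFactor, ++Lane)
      Out[Lane] |= ConcatAB[X + I];
  }

  // Out is <a1|a2|a3, a4|a5|a6, b1|b2|b3, b4|b5|b6>.
  assert(Out[0] == (1u | 2u | 4u));
  assert(Out[3] == (512u | 1024u | 2048u));
  return 0;
}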
+ Value *horizontalReduce(IntrinsicInst &I, unsigned ReductionFactor, + Value *VectorA, Value *VectorB) { + assert(isa<FixedVectorType>(VectorA->getType())); + unsigned TotalNumElems = + cast<FixedVectorType>(VectorA->getType())->getNumElements(); + + if (VectorB) { + assert(VectorA->getType() == VectorB->getType()); + TotalNumElems = TotalNumElems * 2; + } + + assert(TotalNumElems % ReductionFactor == 0); + + Value *Or = nullptr; + + IRBuilder<> IRB(&I); + for (unsigned i = 0; i < ReductionFactor; i++) { + SmallVector<int, 16> Mask; + for (unsigned X = 0; X < TotalNumElems; X += ReductionFactor) + Mask.push_back(X + i); + + Value *Masked; + if (VectorB) + Masked = IRB.CreateShuffleVector(VectorA, VectorB, Mask); + else + Masked = IRB.CreateShuffleVector(VectorA, Mask); + + if (Or) + Or = IRB.CreateOr(Or, Masked); + else + Or = Masked; + } + + return Or; + } + /// Propagate shadow for 1- or 2-vector intrinsics that combine adjacent /// fields. /// @@ -2701,7 +2749,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { assert(I.getType()->isVectorTy()); assert(I.getArgOperand(0)->getType()->isVectorTy()); - FixedVectorType *ParamType = + [[maybe_unused]] FixedVectorType *ParamType = cast<FixedVectorType>(I.getArgOperand(0)->getType()); assert((I.arg_size() != 2) || (ParamType == cast<FixedVectorType>(I.getArgOperand(1)->getType()))); @@ -2711,31 +2759,16 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { 2 * ReturnType->getNumElements()); IRBuilder<> IRB(&I); - unsigned Width = ParamType->getNumElements() * I.arg_size(); // Horizontal OR of shadow - SmallVector<int, 8> EvenMask; - SmallVector<int, 8> OddMask; - for (unsigned X = 0; X < Width; X += 2) { - EvenMask.push_back(X); - OddMask.push_back(X + 1); - } - Value *FirstArgShadow = getShadow(&I, 0); - Value *EvenShadow; - Value *OddShadow; - if (I.arg_size() == 2) { - Value *SecondArgShadow = getShadow(&I, 1); - EvenShadow = - IRB.CreateShuffleVector(FirstArgShadow, SecondArgShadow, EvenMask); - OddShadow = - IRB.CreateShuffleVector(FirstArgShadow, SecondArgShadow, OddMask); - } else { - EvenShadow = IRB.CreateShuffleVector(FirstArgShadow, EvenMask); - OddShadow = IRB.CreateShuffleVector(FirstArgShadow, OddMask); - } + Value *SecondArgShadow = nullptr; + if (I.arg_size() == 2) + SecondArgShadow = getShadow(&I, 1); + + Value *OrShadow = horizontalReduce(I, /*ReductionFactor=*/2, FirstArgShadow, + SecondArgShadow); - Value *OrShadow = IRB.CreateOr(EvenShadow, OddShadow); OrShadow = CreateShadowCast(IRB, OrShadow, getShadowTy(&I)); setShadow(&I, OrShadow); @@ -2768,23 +2801,14 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { IRBuilder<> IRB(&I); - unsigned TotalNumElems = ParamType->getNumElements() * I.arg_size(); FixedVectorType *ReinterpretShadowTy = nullptr; assert(isAligned(Align(ReinterpretElemWidth), ParamType->getPrimitiveSizeInBits())); ReinterpretShadowTy = FixedVectorType::get( IRB.getIntNTy(ReinterpretElemWidth), ParamType->getPrimitiveSizeInBits() / ReinterpretElemWidth); - TotalNumElems = ReinterpretShadowTy->getNumElements() * I.arg_size(); // Horizontal OR of shadow - SmallVector<int, 8> EvenMask; - SmallVector<int, 8> OddMask; - for (unsigned X = 0; X < TotalNumElems - 1; X += 2) { - EvenMask.push_back(X); - OddMask.push_back(X + 1); - } - Value *FirstArgShadow = getShadow(&I, 0); FirstArgShadow = IRB.CreateBitCast(FirstArgShadow, ReinterpretShadowTy); @@ -2796,22 +2820,15 @@ struct MemorySanitizerVisitor : public 
InstVisitor<MemorySanitizerVisitor> { Align(2), cast<FixedVectorType>(FirstArgShadow->getType())->getNumElements())); - Value *EvenShadow; - Value *OddShadow; + Value *SecondArgShadow = nullptr; if (I.arg_size() == 2) { - Value *SecondArgShadow = getShadow(&I, 1); + SecondArgShadow = getShadow(&I, 1); SecondArgShadow = IRB.CreateBitCast(SecondArgShadow, ReinterpretShadowTy); - - EvenShadow = - IRB.CreateShuffleVector(FirstArgShadow, SecondArgShadow, EvenMask); - OddShadow = - IRB.CreateShuffleVector(FirstArgShadow, SecondArgShadow, OddMask); - } else { - EvenShadow = IRB.CreateShuffleVector(FirstArgShadow, EvenMask); - OddShadow = IRB.CreateShuffleVector(FirstArgShadow, OddMask); } - Value *OrShadow = IRB.CreateOr(EvenShadow, OddShadow); + Value *OrShadow = horizontalReduce(I, /*ReductionFactor=*/2, FirstArgShadow, + SecondArgShadow); + OrShadow = CreateShadowCast(IRB, OrShadow, getShadowTy(&I)); setShadow(&I, OrShadow); @@ -3219,7 +3236,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { /// Caller guarantees that this intrinsic does not access memory. /// /// TODO: "horizontal"/"pairwise" intrinsics are often incorrectly matched by - /// by this handler. + /// by this handler. See horizontalReduce(). + /// + /// TODO: permutation intrinsics are also often incorrectly matched. [[maybe_unused]] bool maybeHandleSimpleNomemIntrinsic(IntrinsicInst &I, unsigned int trailingFlags) { @@ -3301,7 +3320,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { void handleLifetimeStart(IntrinsicInst &I) { if (!PoisonStack) return; - AllocaInst *AI = dyn_cast<AllocaInst>(I.getArgOperand(1)); + AllocaInst *AI = dyn_cast<AllocaInst>(I.getArgOperand(0)); if (AI) LifetimeStartList.push_back(std::make_pair(&I, AI)); } @@ -3624,9 +3643,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { setOriginForNaryOp(I); } - // Get an MMX-sized vector type. - Type *getMMXVectorTy(unsigned EltSizeInBits) { - const unsigned X86_MMXSizeInBits = 64; + // Get an MMX-sized (64-bit) vector type, or optionally, other sized + // vectors. + Type *getMMXVectorTy(unsigned EltSizeInBits, + unsigned X86_MMXSizeInBits = 64) { assert(EltSizeInBits != 0 && (X86_MMXSizeInBits % EltSizeInBits) == 0 && "Illegal MMX vector element size"); return FixedVectorType::get(IntegerType::get(*MS.C, EltSizeInBits), @@ -3826,20 +3846,133 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { setOriginForNaryOp(I); } - // Instrument multiply-add intrinsic. - void handleVectorPmaddIntrinsic(IntrinsicInst &I, - unsigned MMXEltSizeInBits = 0) { - Type *ResTy = - MMXEltSizeInBits ? getMMXVectorTy(MMXEltSizeInBits * 2) : I.getType(); + // Instrument multiply-add(-accumulate)? intrinsics. + // + // e.g., Two operands: + // <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a, <8 x i16> %b) + // + // Two operands which require an EltSizeInBits override: + // <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> %a, <1 x i64> %b) + // + // Three operands: + // <4 x i32> @llvm.x86.avx512.vpdpbusd.128 + // (<4 x i32> %s, <4 x i32> %a, <4 x i32> %b) + // (this is equivalent to multiply-add on %a and %b, followed by + // adding/"accumulating" %s. "Accumulation" stores the result in one + // of the source registers, but this accumulate vs. add distinction + // is lost when dealing with LLVM intrinsics.) 
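The doc comment above lists the intrinsic shapes this handler models; as a reference point for the shadow logic that follows, here is a scalar sketch of what the plain two-operand form (pmadd.wd) computes: adjacent signed 16-bit products summed pairwise into 32-bit lanes, i.e. ReductionFactor = 2. This only illustrates the instruction's documented semantics, it is not MSan code, and the three-operand VNNI forms additionally add the accumulator operand into each output lane:

#include <cassert>
#include <cstdint>

// Scalar model of pmadd.wd on one 128-bit register:
// Out[i] = A[2i]*B[2i] + A[2i+1]*B[2i+1], with i16 inputs and i32 outputs.
static void PmaddWd(const int16_t A[8], const int16_t B[8], int32_t Out[4]) {
  for (int I = 0; I < 4; ++I)
    Out[I] =
        int32_t(A[2 * I]) * B[2 * I] + int32_t(A[2 * I + 1]) * B[2 * I + 1];
}

int main() {
  const int16_t A[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  const int16_t B[8] = {10, 10, 10, 10, -1, -1, -1, -1};
  int32_t Out[4];
  PmaddWd(A, B, Out);
  assert(Out[0] == 30);  // 1*10 + 2*10
  assert(Out[3] == -15); // 7*-1 + 8*-1
  return 0;
}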
+ void handleVectorPmaddIntrinsic(IntrinsicInst &I, unsigned ReductionFactor, + unsigned EltSizeInBits = 0) { IRBuilder<> IRB(&I); - auto *Shadow0 = getShadow(&I, 0); - auto *Shadow1 = getShadow(&I, 1); - Value *S = IRB.CreateOr(Shadow0, Shadow1); - S = IRB.CreateBitCast(S, ResTy); - S = IRB.CreateSExt(IRB.CreateICmpNE(S, Constant::getNullValue(ResTy)), - ResTy); - S = IRB.CreateBitCast(S, getShadowTy(&I)); - setShadow(&I, S); + + [[maybe_unused]] FixedVectorType *ReturnType = + cast<FixedVectorType>(I.getType()); + assert(isa<FixedVectorType>(ReturnType)); + + // Vectors A and B, and shadows + Value *Va = nullptr; + Value *Vb = nullptr; + Value *Sa = nullptr; + Value *Sb = nullptr; + + assert(I.arg_size() == 2 || I.arg_size() == 3); + if (I.arg_size() == 2) { + Va = I.getOperand(0); + Vb = I.getOperand(1); + + Sa = getShadow(&I, 0); + Sb = getShadow(&I, 1); + } else if (I.arg_size() == 3) { + // Operand 0 is the accumulator. We will deal with that below. + Va = I.getOperand(1); + Vb = I.getOperand(2); + + Sa = getShadow(&I, 1); + Sb = getShadow(&I, 2); + } + + FixedVectorType *ParamType = cast<FixedVectorType>(Va->getType()); + assert(ParamType == Vb->getType()); + + assert(ParamType->getPrimitiveSizeInBits() == + ReturnType->getPrimitiveSizeInBits()); + + if (I.arg_size() == 3) { + assert(ParamType == ReturnType); + assert(ParamType == I.getArgOperand(0)->getType()); + } + + FixedVectorType *ImplicitReturnType = ReturnType; + // Step 1: instrument multiplication of corresponding vector elements + if (EltSizeInBits) { + ImplicitReturnType = cast<FixedVectorType>(getMMXVectorTy( + EltSizeInBits * 2, ParamType->getPrimitiveSizeInBits())); + ParamType = cast<FixedVectorType>( + getMMXVectorTy(EltSizeInBits, ParamType->getPrimitiveSizeInBits())); + + Va = IRB.CreateBitCast(Va, ParamType); + Vb = IRB.CreateBitCast(Vb, ParamType); + + Sa = IRB.CreateBitCast(Sa, getShadowTy(ParamType)); + Sb = IRB.CreateBitCast(Sb, getShadowTy(ParamType)); + } else { + assert(ParamType->getNumElements() == + ReturnType->getNumElements() * ReductionFactor); + } + + // Multiplying an *initialized* zero by an uninitialized element results in + // an initialized zero element. + // + // This is analogous to bitwise AND, where "AND" of 0 and a poisoned value + // results in an unpoisoned value. We can therefore adapt the visitAnd() + // instrumentation: + // OutShadow = (SaNonZero & SbNonZero) + // | (VaNonZero & SbNonZero) + // | (SaNonZero & VbNonZero) + // where non-zero is checked on a per-element basis (not per bit). + Value *SZero = Constant::getNullValue(Va->getType()); + Value *VZero = Constant::getNullValue(Sa->getType()); + Value *SaNonZero = IRB.CreateICmpNE(Sa, SZero); + Value *SbNonZero = IRB.CreateICmpNE(Sb, SZero); + Value *VaNonZero = IRB.CreateICmpNE(Va, VZero); + Value *VbNonZero = IRB.CreateICmpNE(Vb, VZero); + + Value *SaAndSbNonZero = IRB.CreateAnd(SaNonZero, SbNonZero); + Value *VaAndSbNonZero = IRB.CreateAnd(VaNonZero, SbNonZero); + Value *SaAndVbNonZero = IRB.CreateAnd(SaNonZero, VbNonZero); + + // Each element of the vector is represented by a single bit (poisoned or + // not) e.g., <8 x i1>. + Value *And = IRB.CreateOr({SaAndSbNonZero, VaAndSbNonZero, SaAndVbNonZero}); + + // Extend <8 x i1> to <8 x i16>. + // (The real pmadd intrinsic would have computed intermediate values of + // <8 x i32>, but that is irrelevant for our shadow purposes because we + // consider each element to be either fully initialized or fully + // uninitialized.) 
+ And = IRB.CreateSExt(And, Sa->getType()); + + // Step 2: instrument horizontal add + // We don't need bit-precise horizontalReduce because we only want to check + // if each pair of elements is fully zero. + // Cast to <4 x i32>. + Value *Horizontal = IRB.CreateBitCast(And, ImplicitReturnType); + + // Compute <4 x i1>, then extend back to <4 x i32>. + Value *OutShadow = IRB.CreateSExt( + IRB.CreateICmpNE(Horizontal, + Constant::getNullValue(Horizontal->getType())), + ImplicitReturnType); + + // Cast it back to the required fake return type (<1 x i64>). + if (EltSizeInBits) + OutShadow = CreateShadowCast(IRB, OutShadow, getShadowTy(&I)); + + // Step 3 (if applicable): instrument accumulator + if (I.arg_size() == 3) + OutShadow = IRB.CreateOr(OutShadow, getShadow(&I, 0)); + + setShadow(&I, OutShadow); setOriginForNaryOp(I); } @@ -5374,21 +5507,185 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { handleVectorSadIntrinsic(I); break; + // Multiply and Add Packed Words + // < 4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>) + // < 8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>) + // <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16>, <32 x i16>) + // + // Multiply and Add Packed Signed and Unsigned Bytes + // < 8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8>, <16 x i8>) + // <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8>, <32 x i8>) + // <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8>, <64 x i8>) + // + // These intrinsics are auto-upgraded into non-masked forms: + // < 4 x i32> @llvm.x86.avx512.mask.pmaddw.d.128 + // (<8 x i16>, <8 x i16>, <4 x i32>, i8) + // < 8 x i32> @llvm.x86.avx512.mask.pmaddw.d.256 + // (<16 x i16>, <16 x i16>, <8 x i32>, i8) + // <16 x i32> @llvm.x86.avx512.mask.pmaddw.d.512 + // (<32 x i16>, <32 x i16>, <16 x i32>, i16) + // < 8 x i16> @llvm.x86.avx512.mask.pmaddubs.w.128 + // (<16 x i8>, <16 x i8>, <8 x i16>, i8) + // <16 x i16> @llvm.x86.avx512.mask.pmaddubs.w.256 + // (<32 x i8>, <32 x i8>, <16 x i16>, i16) + // <32 x i16> @llvm.x86.avx512.mask.pmaddubs.w.512 + // (<64 x i8>, <64 x i8>, <32 x i16>, i32) case Intrinsic::x86_sse2_pmadd_wd: case Intrinsic::x86_avx2_pmadd_wd: + case Intrinsic::x86_avx512_pmaddw_d_512: case Intrinsic::x86_ssse3_pmadd_ub_sw_128: case Intrinsic::x86_avx2_pmadd_ub_sw: - handleVectorPmaddIntrinsic(I); + case Intrinsic::x86_avx512_pmaddubs_w_512: + handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2); break; + // <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64>, <1 x i64>) case Intrinsic::x86_ssse3_pmadd_ub_sw: - handleVectorPmaddIntrinsic(I, 8); + handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*EltSize=*/8); break; + // <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64>, <1 x i64>) case Intrinsic::x86_mmx_pmadd_wd: - handleVectorPmaddIntrinsic(I, 16); + handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*EltSize=*/16); break; + // AVX Vector Neural Network Instructions: bytes + // + // Multiply and Add Packed Signed and Unsigned Bytes + // < 4 x i32> @llvm.x86.avx512.vpdpbusd.128 + // (< 4 x i32>, < 4 x i32>, < 4 x i32>) + // < 8 x i32> @llvm.x86.avx512.vpdpbusd.256 + // (< 8 x i32>, < 8 x i32>, < 8 x i32>) + // <16 x i32> @llvm.x86.avx512.vpdpbusd.512 + // (<16 x i32>, <16 x i32>, <16 x i32>) + // + // Multiply and Add Unsigned and Signed Bytes With Saturation + // < 4 x i32> @llvm.x86.avx512.vpdpbusds.128 + // (< 4 x i32>, < 4 x i32>, < 4 x i32>) + // < 8 x i32> @llvm.x86.avx512.vpdpbusds.256 + // (< 8 x i32>, < 8 x i32>, < 8 x i32>) + // <16 x i32> @llvm.x86.avx512.vpdpbusds.512 + 
// (<16 x i32>, <16 x i32>, <16 x i32>) + // + // < 4 x i32> @llvm.x86.avx2.vpdpbssd.128 + // (< 4 x i32>, < 4 x i32>, < 4 x i32>) + // < 8 x i32> @llvm.x86.avx2.vpdpbssd.256 + // (< 8 x i32>, < 8 x i32>, < 8 x i32>) + // + // < 4 x i32> @llvm.x86.avx2.vpdpbssds.128 + // (< 4 x i32>, < 4 x i32>, < 4 x i32>) + // < 8 x i32> @llvm.x86.avx2.vpdpbssds.256 + // (< 8 x i32>, < 8 x i32>, < 8 x i32>) + // + // <16 x i32> @llvm.x86.avx10.vpdpbssd.512 + // (<16 x i32>, <16 x i32>, <16 x i32>) + // <16 x i32> @llvm.x86.avx10.vpdpbssds.512 + // (<16 x i32>, <16 x i32>, <16 x i32>) + // + // These intrinsics are auto-upgraded into non-masked forms: + // <4 x i32> @llvm.x86.avx512.mask.vpdpbusd.128 + // (<4 x i32>, <4 x i32>, <4 x i32>, i8) + // <4 x i32> @llvm.x86.avx512.maskz.vpdpbusd.128 + // (<4 x i32>, <4 x i32>, <4 x i32>, i8) + // <8 x i32> @llvm.x86.avx512.mask.vpdpbusd.256 + // (<8 x i32>, <8 x i32>, <8 x i32>, i8) + // <8 x i32> @llvm.x86.avx512.maskz.vpdpbusd.256 + // (<8 x i32>, <8 x i32>, <8 x i32>, i8) + // <16 x i32> @llvm.x86.avx512.mask.vpdpbusd.512 + // (<16 x i32>, <16 x i32>, <16 x i32>, i16) + // <16 x i32> @llvm.x86.avx512.maskz.vpdpbusd.512 + // (<16 x i32>, <16 x i32>, <16 x i32>, i16) + // + // <4 x i32> @llvm.x86.avx512.mask.vpdpbusds.128 + // (<4 x i32>, <4 x i32>, <4 x i32>, i8) + // <4 x i32> @llvm.x86.avx512.maskz.vpdpbusds.128 + // (<4 x i32>, <4 x i32>, <4 x i32>, i8) + // <8 x i32> @llvm.x86.avx512.mask.vpdpbusds.256 + // (<8 x i32>, <8 x i32>, <8 x i32>, i8) + // <8 x i32> @llvm.x86.avx512.maskz.vpdpbusds.256 + // (<8 x i32>, <8 x i32>, <8 x i32>, i8) + // <16 x i32> @llvm.x86.avx512.mask.vpdpbusds.512 + // (<16 x i32>, <16 x i32>, <16 x i32>, i16) + // <16 x i32> @llvm.x86.avx512.maskz.vpdpbusds.512 + // (<16 x i32>, <16 x i32>, <16 x i32>, i16) + case Intrinsic::x86_avx512_vpdpbusd_128: + case Intrinsic::x86_avx512_vpdpbusd_256: + case Intrinsic::x86_avx512_vpdpbusd_512: + case Intrinsic::x86_avx512_vpdpbusds_128: + case Intrinsic::x86_avx512_vpdpbusds_256: + case Intrinsic::x86_avx512_vpdpbusds_512: + case Intrinsic::x86_avx2_vpdpbssd_128: + case Intrinsic::x86_avx2_vpdpbssd_256: + case Intrinsic::x86_avx2_vpdpbssds_128: + case Intrinsic::x86_avx2_vpdpbssds_256: + case Intrinsic::x86_avx10_vpdpbssd_512: + case Intrinsic::x86_avx10_vpdpbssds_512: + handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/4, /*EltSize=*/8); + break; + + // AVX Vector Neural Network Instructions: words + // + // Multiply and Add Signed Word Integers + // < 4 x i32> @llvm.x86.avx512.vpdpwssd.128 + // (< 4 x i32>, < 4 x i32>, < 4 x i32>) + // < 8 x i32> @llvm.x86.avx512.vpdpwssd.256 + // (< 8 x i32>, < 8 x i32>, < 8 x i32>) + // <16 x i32> @llvm.x86.avx512.vpdpwssd.512 + // (<16 x i32>, <16 x i32>, <16 x i32>) + // + // Multiply and Add Signed Word Integers With Saturation + // < 4 x i32> @llvm.x86.avx512.vpdpwssds.128 + // (< 4 x i32>, < 4 x i32>, < 4 x i32>) + // < 8 x i32> @llvm.x86.avx512.vpdpwssds.256 + // (< 8 x i32>, < 8 x i32>, < 8 x i32>) + // <16 x i32> @llvm.x86.avx512.vpdpwssds.512 + // (<16 x i32>, <16 x i32>, <16 x i32>) + // + // These intrinsics are auto-upgraded into non-masked forms: + // <4 x i32> @llvm.x86.avx512.mask.vpdpwssd.128 + // (<4 x i32>, <4 x i32>, <4 x i32>, i8) + // <4 x i32> @llvm.x86.avx512.maskz.vpdpwssd.128 + // (<4 x i32>, <4 x i32>, <4 x i32>, i8) + // <8 x i32> @llvm.x86.avx512.mask.vpdpwssd.256 + // (<8 x i32>, <8 x i32>, <8 x i32>, i8) + // <8 x i32> @llvm.x86.avx512.maskz.vpdpwssd.256 + // (<8 x i32>, <8 x i32>, <8 x i32>, i8) + // <16 x i32> 
@llvm.x86.avx512.mask.vpdpwssd.512 + // (<16 x i32>, <16 x i32>, <16 x i32>, i16) + // <16 x i32> @llvm.x86.avx512.maskz.vpdpwssd.512 + // (<16 x i32>, <16 x i32>, <16 x i32>, i16) + // + // <4 x i32> @llvm.x86.avx512.mask.vpdpwssds.128 + // (<4 x i32>, <4 x i32>, <4 x i32>, i8) + // <4 x i32> @llvm.x86.avx512.maskz.vpdpwssds.128 + // (<4 x i32>, <4 x i32>, <4 x i32>, i8) + // <8 x i32> @llvm.x86.avx512.mask.vpdpwssds.256 + // (<8 x i32>, <8 x i32>, <8 x i32>, i8) + // <8 x i32> @llvm.x86.avx512.maskz.vpdpwssds.256 + // (<8 x i32>, <8 x i32>, <8 x i32>, i8) + // <16 x i32> @llvm.x86.avx512.mask.vpdpwssds.512 + // (<16 x i32>, <16 x i32>, <16 x i32>, i16) + // <16 x i32> @llvm.x86.avx512.maskz.vpdpwssds.512 + // (<16 x i32>, <16 x i32>, <16 x i32>, i16) + case Intrinsic::x86_avx512_vpdpwssd_128: + case Intrinsic::x86_avx512_vpdpwssd_256: + case Intrinsic::x86_avx512_vpdpwssd_512: + case Intrinsic::x86_avx512_vpdpwssds_128: + case Intrinsic::x86_avx512_vpdpwssds_256: + case Intrinsic::x86_avx512_vpdpwssds_512: + handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*EltSize=*/16); + break; + + // TODO: Dot Product of BF16 Pairs Accumulated Into Packed Single + // Precision + // <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128 + // (<4 x float>, <8 x bfloat>, <8 x bfloat>) + // <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256 + // (<8 x float>, <16 x bfloat>, <16 x bfloat>) + // <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512 + // (<16 x float>, <32 x bfloat>, <32 x bfloat>) + // handleVectorPmaddIntrinsic() currently only handles integer types. + case Intrinsic::x86_sse_cmp_ss: case Intrinsic::x86_sse2_cmp_sd: case Intrinsic::x86_sse_comieq_ss: @@ -5603,6 +5900,26 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { handleAVXVpermi2var(I); break; + // Packed Shuffle + // llvm.x86.sse.pshuf.w(<1 x i64>, i8) + // llvm.x86.ssse3.pshuf.b(<1 x i64>, <1 x i64>) + // llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>) + // llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>) + // llvm.x86.avx512.pshuf.b.512(<64 x i8>, <64 x i8>) + // + // The following intrinsics are auto-upgraded: + // llvm.x86.sse2.pshuf.d(<4 x i32>, i8) + // llvm.x86.sse2.gpshufh.w(<8 x i16>, i8) + // llvm.x86.sse2.pshufl.w(<8 x i16>, i8) + case Intrinsic::x86_avx2_pshuf_b: + case Intrinsic::x86_sse_pshuf_w: + case Intrinsic::x86_ssse3_pshuf_b_128: + case Intrinsic::x86_ssse3_pshuf_b: + case Intrinsic::x86_avx512_pshuf_b_512: + handleIntrinsicByApplyingToShadow(I, I.getIntrinsicID(), + /*trailingVerbatimArgs=*/1); + break; + case Intrinsic::x86_avx512_mask_cvtps2dq_512: { handleAVX512VectorConvertFPToInt(I); break; diff --git a/llvm/lib/Transforms/Instrumentation/PGOCtxProfFlattening.cpp b/llvm/lib/Transforms/Instrumentation/PGOCtxProfFlattening.cpp index 6128581..f5b6686 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOCtxProfFlattening.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOCtxProfFlattening.cpp @@ -58,7 +58,7 @@ void assignProfileData(Function &F, ArrayRef<uint64_t> RawCounters) { uint64_t TrueCount, FalseCount = 0; if (!PA.getSelectInstrProfile(*SI, TrueCount, FalseCount)) continue; - setProfMetadata(F.getParent(), SI, {TrueCount, FalseCount}, + setProfMetadata(SI, {TrueCount, FalseCount}, std::max(TrueCount, FalseCount)); } if (succ_size(&BB) < 2) @@ -67,7 +67,7 @@ void assignProfileData(Function &F, ArrayRef<uint64_t> RawCounters) { if (!PA.getOutgoingBranchWeights(BB, ProfileHolder, MaxCount)) continue; assert(MaxCount > 0); - setProfMetadata(F.getParent(), BB.getTerminator(), ProfileHolder, 
MaxCount); + setProfMetadata(BB.getTerminator(), ProfileHolder, MaxCount); } } diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index 6f06a26..d9e850e 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -1727,7 +1727,7 @@ void PGOUseFunc::setBranchWeights() { } if (MaxCount) - setProfMetadata(M, TI, EdgeCounts, MaxCount); + setProfMetadata(TI, EdgeCounts, MaxCount); else { // A zero MaxCount can come about when we have a BB with a positive // count, and whose successor blocks all have 0 count. This can happen @@ -1801,7 +1801,7 @@ void SelectInstVisitor::annotateOneSelectInst(SelectInst &SI) { SCounts[1] = (TotalCount > SCounts[0] ? TotalCount - SCounts[0] : 0); uint64_t MaxCount = std::max(SCounts[0], SCounts[1]); if (MaxCount) - setProfMetadata(F.getParent(), &SI, SCounts, MaxCount); + setProfMetadata(&SI, SCounts, MaxCount); } void SelectInstVisitor::visitSelectInst(SelectInst &SI) { @@ -2407,13 +2407,9 @@ static std::string getSimpleNodeName(const BasicBlock *Node) { return SimpleNodeName; } -void llvm::setProfMetadata(Module *M, Instruction *TI, - ArrayRef<uint64_t> EdgeCounts, uint64_t MaxCount) { - assert(MaxCount > 0 && "Bad max count"); - uint64_t Scale = calculateCountScale(MaxCount); - SmallVector<unsigned, 4> Weights; - for (const auto &ECI : EdgeCounts) - Weights.push_back(scaleBranchCount(ECI, Scale)); +void llvm::setProfMetadata(Instruction *TI, ArrayRef<uint64_t> EdgeCounts, + uint64_t MaxCount) { + auto Weights = downscaleWeights(EdgeCounts, MaxCount); LLVM_DEBUG(dbgs() << "Weight is: "; for (const auto &W : Weights) { @@ -2434,7 +2430,7 @@ void llvm::setProfMetadata(Module *M, Instruction *TI, uint64_t TotalCount = std::accumulate(EdgeCounts.begin(), EdgeCounts.end(), (uint64_t)0, [](uint64_t c1, uint64_t c2) { return c1 + c2; }); - Scale = calculateCountScale(WSum); + uint64_t Scale = calculateCountScale(WSum); BranchProbability BP(scaleBranchCount(Weights[0], Scale), scaleBranchCount(WSum, Scale)); std::string BranchProbStr; diff --git a/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp b/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp index ce1d9f1..343bec3 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp @@ -432,7 +432,7 @@ bool MemOPSizeOpt::perform(MemOp MO) { Updates.clear(); if (MaxCount) - setProfMetadata(Func.getParent(), SI, CaseCounts, MaxCount); + setProfMetadata(SI, CaseCounts, MaxCount); LLVM_DEBUG(dbgs() << *BB << "\n"); LLVM_DEBUG(dbgs() << *DefaultBB << "\n"); diff --git a/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp index 4edf25c..9471ae3 100644 --- a/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp @@ -818,12 +818,12 @@ bool TypeSanitizer::instrumentMemInst(Value *V, Instruction *ShadowBase, } } } else if (auto *II = dyn_cast<LifetimeIntrinsic>(I)) { - auto *AI = dyn_cast<AllocaInst>(II->getArgOperand(1)); + auto *AI = dyn_cast<AllocaInst>(II->getArgOperand(0)); if (!AI) return false; Size = GetAllocaSize(AI); - Dest = II->getArgOperand(1); + Dest = II->getArgOperand(0); } else if (auto *AI = dyn_cast<AllocaInst>(I)) { // We need to clear the types for new stack allocations (or else we might // read stale type information from a previous function execution). 
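The setProfMetadata updates above (PGOCtxProfFlattening, PGOInstrumentation, PGOMemOPSizeOpt) drop the unused Module parameter and replace the open-coded calculateCountScale/scaleBranchCount loop with a downscaleWeights helper. Judging by the removed loop, the point of the downscaling is to shrink 64-bit edge counts so the largest one fits in a 32-bit branch weight while preserving the ratios. A rough standalone sketch of that idea; the power-of-two scaling policy here is illustrative and not necessarily what the real helper does:

#include <cstdint>
#include <vector>

static std::vector<uint32_t>
downscaleWeightsSketch(const std::vector<uint64_t> &Counts, uint64_t MaxCount) {
  // Pick a divisor large enough that MaxCount fits in uint32_t, then apply it
  // to every count; relative weights survive up to rounding.
  uint64_t Scale = 1;
  while (MaxCount / Scale > UINT32_MAX)
    Scale *= 2;
  std::vector<uint32_t> Weights;
  Weights.reserve(Counts.size());
  for (uint64_t C : Counts)
    Weights.push_back(static_cast<uint32_t>(C / Scale));
  return Weights;
}

int main() {
  std::vector<uint64_t> Counts = {1ull << 40, 1ull << 39, 0};
  auto W = downscaleWeightsSketch(Counts, /*MaxCount=*/1ull << 40);
  // The 2:1:0 ratio of the raw counts is preserved.
  return (W[0] == 2 * W[1] && W[2] == 0) ? 0 : 1;
}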
diff --git a/llvm/lib/Transforms/Scalar/CMakeLists.txt b/llvm/lib/Transforms/Scalar/CMakeLists.txt index 84a5b02..765059d 100644 --- a/llvm/lib/Transforms/Scalar/CMakeLists.txt +++ b/llvm/lib/Transforms/Scalar/CMakeLists.txt @@ -95,6 +95,7 @@ add_llvm_component_library(LLVMScalarOpts Analysis Core InstCombine + ProfileData Support TransformUtils ) diff --git a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp index a7ba54f..ac59ae1 100644 --- a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp @@ -447,7 +447,7 @@ private: /// Also, collect select instructions to unfold. bool isCandidate(const SwitchInst *SI) { std::deque<std::pair<Value *, BasicBlock *>> Q; - SmallSet<Value *, 16> SeenValues; + SmallPtrSet<Value *, 16> SeenValues; SelectInsts.clear(); Value *SICond = SI->getCondition(); @@ -511,7 +511,7 @@ private: void addToQueue(Value *Val, BasicBlock *BB, std::deque<std::pair<Value *, BasicBlock *>> &Q, - SmallSet<Value *, 16> &SeenValues) { + SmallPtrSet<Value *, 16> &SeenValues) { if (SeenValues.insert(Val).second) Q.push_back({Val, BB}); } @@ -582,17 +582,15 @@ struct AllSwitchPaths { VisitedBlocks VB; // Get paths from the determinator BBs to SwitchPhiDefBB std::vector<ThreadingPath> PathsToPhiDef = - getPathsFromStateDefMap(StateDef, SwitchPhi, VB, MaxNumPaths); + getPathsFromStateDefMap(StateDef, SwitchPhi, VB); if (SwitchPhiDefBB == SwitchBlock) { TPaths = std::move(PathsToPhiDef); return; } - assert(MaxNumPaths >= PathsToPhiDef.size()); - auto PathsLimit = MaxNumPaths / PathsToPhiDef.size(); // Find and append paths from SwitchPhiDefBB to SwitchBlock. PathsType PathsToSwitchBB = - paths(SwitchPhiDefBB, SwitchBlock, VB, /* PathDepth = */ 1, PathsLimit); + paths(SwitchPhiDefBB, SwitchBlock, VB, /* PathDepth = */ 1); if (PathsToSwitchBB.empty()) return; @@ -613,16 +611,13 @@ private: typedef DenseMap<const BasicBlock *, const PHINode *> StateDefMap; std::vector<ThreadingPath> getPathsFromStateDefMap(StateDefMap &StateDef, PHINode *Phi, - VisitedBlocks &VB, - unsigned PathsLimit) { + VisitedBlocks &VB) { std::vector<ThreadingPath> Res; auto *PhiBB = Phi->getParent(); VB.insert(PhiBB); VisitedBlocks UniqueBlocks; for (auto *IncomingBB : Phi->blocks()) { - if (Res.size() >= PathsLimit) - break; if (!UniqueBlocks.insert(IncomingBB).second) continue; if (!SwitchOuterLoop->contains(IncomingBB)) @@ -658,9 +653,8 @@ private: // Direct predecessor, just add to the path. 
if (IncomingPhiDefBB == IncomingBB) { - assert(PathsLimit > Res.size()); - std::vector<ThreadingPath> PredPaths = getPathsFromStateDefMap( - StateDef, IncomingPhi, VB, PathsLimit - Res.size()); + std::vector<ThreadingPath> PredPaths = + getPathsFromStateDefMap(StateDef, IncomingPhi, VB); for (ThreadingPath &Path : PredPaths) { Path.push_back(PhiBB); Res.push_back(std::move(Path)); @@ -673,17 +667,13 @@ private: continue; PathsType IntermediatePaths; - assert(PathsLimit > Res.size()); - auto InterPathLimit = PathsLimit - Res.size(); - IntermediatePaths = paths(IncomingPhiDefBB, IncomingBB, VB, - /* PathDepth = */ 1, InterPathLimit); + IntermediatePaths = + paths(IncomingPhiDefBB, IncomingBB, VB, /* PathDepth = */ 1); if (IntermediatePaths.empty()) continue; - assert(InterPathLimit >= IntermediatePaths.size()); - auto PredPathLimit = InterPathLimit / IntermediatePaths.size(); std::vector<ThreadingPath> PredPaths = - getPathsFromStateDefMap(StateDef, IncomingPhi, VB, PredPathLimit); + getPathsFromStateDefMap(StateDef, IncomingPhi, VB); for (const ThreadingPath &Path : PredPaths) { for (const PathType &IPath : IntermediatePaths) { ThreadingPath NewPath(Path); @@ -698,7 +688,7 @@ private: } PathsType paths(BasicBlock *BB, BasicBlock *ToBB, VisitedBlocks &Visited, - unsigned PathDepth, unsigned PathsLimit) { + unsigned PathDepth) { PathsType Res; // Stop exploring paths after visiting MaxPathLength blocks @@ -723,10 +713,8 @@ private: // Some blocks have multiple edges to the same successor, and this set // is used to prevent a duplicate path from being generated - SmallSet<BasicBlock *, 4> Successors; + SmallPtrSet<BasicBlock *, 4> Successors; for (BasicBlock *Succ : successors(BB)) { - if (Res.size() >= PathsLimit) - break; if (!Successors.insert(Succ).second) continue; @@ -748,12 +736,14 @@ private: // coverage and compile time. if (LI->getLoopFor(Succ) != CurrLoop) continue; - assert(PathsLimit > Res.size()); - PathsType SuccPaths = - paths(Succ, ToBB, Visited, PathDepth + 1, PathsLimit - Res.size()); + + PathsType SuccPaths = paths(Succ, ToBB, Visited, PathDepth + 1); for (PathType &Path : SuccPaths) { Path.push_front(BB); Res.push_back(Path); + if (Res.size() >= MaxNumPaths) { + return Res; + } } } // This block could now be visited again from a different predecessor. Note @@ -772,7 +762,7 @@ private: SmallVector<PHINode *, 8> Stack; Stack.push_back(FirstDef); - SmallSet<Value *, 16> SeenValues; + SmallPtrSet<Value *, 16> SeenValues; while (!Stack.empty()) { PHINode *CurPhi = Stack.pop_back_val(); @@ -965,7 +955,7 @@ private: DuplicateBlockMap DuplicateMap; DefMap NewDefs; - SmallSet<BasicBlock *, 16> BlocksToClean; + SmallPtrSet<BasicBlock *, 16> BlocksToClean; BlocksToClean.insert_range(successors(SwitchBlock)); for (ThreadingPath &TPath : SwitchPaths->getThreadingPaths()) { @@ -994,7 +984,7 @@ private: /// the predecessors, and phis in the successor blocks. 
void createExitPath(DefMap &NewDefs, ThreadingPath &Path, DuplicateBlockMap &DuplicateMap, - SmallSet<BasicBlock *, 16> &BlocksToClean, + SmallPtrSet<BasicBlock *, 16> &BlocksToClean, DomTreeUpdater *DTU) { APInt NextState = Path.getExitValue(); const BasicBlock *Determinator = Path.getDeterminatorBB(); diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp index 9b87180..37004b9 100644 --- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -38,6 +38,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CaptureTracking.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/LoopInfo.h" @@ -69,6 +70,7 @@ #include "llvm/IR/PassManager.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -543,15 +545,8 @@ static void shortenAssignment(Instruction *Inst, Value *OriginalDest, }; // Insert an unlinked dbg.assign intrinsic for the dead fragment after each - // overlapping dbg.assign intrinsic. The loop invalidates the iterators - // returned by getAssignmentMarkers so save a copy of the markers to iterate - // over. - auto LinkedRange = at::getAssignmentMarkers(Inst); - SmallVector<DbgVariableRecord *> LinkedDVRAssigns = - at::getDVRAssignmentMarkers(Inst); - SmallVector<DbgAssignIntrinsic *> Linked(LinkedRange.begin(), - LinkedRange.end()); - auto InsertAssignForOverlap = [&](auto *Assign) { + // overlapping dbg.assign intrinsic. + for (DbgVariableRecord *Assign : at::getDVRAssignmentMarkers(Inst)) { std::optional<DIExpression::FragmentInfo> NewFragment; if (!at::calculateFragmentIntersect(DL, OriginalDest, DeadSliceOffsetInBits, DeadSliceSizeInBits, Assign, @@ -561,11 +556,11 @@ static void shortenAssignment(Instruction *Inst, Value *OriginalDest, // cautious and unlink the whole assignment from the store. Assign->setKillAddress(); Assign->setAssignId(GetDeadLink()); - return; + continue; } // No intersect. if (NewFragment->SizeInBits == 0) - return; + continue; // Fragments overlap: insert a new dbg.assign for this dead part. auto *NewAssign = static_cast<decltype(Assign)>(Assign->clone()); @@ -574,9 +569,7 @@ static void shortenAssignment(Instruction *Inst, Value *OriginalDest, if (NewFragment) SetDeadFragExpr(NewAssign, *NewFragment); NewAssign->setKillAddress(); - }; - for_each(Linked, InsertAssignForOverlap); - for_each(LinkedDVRAssigns, InsertAssignForOverlap); + } } /// Update the attributes given that a memory access is updated (the @@ -1363,7 +1356,7 @@ struct DSEState { if (auto *CB = dyn_cast<CallBase>(I)) { if (CB->getIntrinsicID() == Intrinsic::lifetime_end) return { - std::make_pair(MemoryLocation::getForArgument(CB, 1, &TLI), false)}; + std::make_pair(MemoryLocation::getForArgument(CB, 0, &TLI), false)}; if (Value *FreedOp = getFreedOperand(CB, &TLI)) return {std::make_pair(MemoryLocation::getAfter(FreedOp), true)}; } @@ -2666,3 +2659,79 @@ PreservedAnalyses DSEPass::run(Function &F, FunctionAnalysisManager &AM) { PA.preserve<LoopAnalysis>(); return PA; } + +namespace { + +/// A legacy pass for the legacy pass manager that wraps \c DSEPass. 
+class DSELegacyPass : public FunctionPass { +public: + static char ID; // Pass identification, replacement for typeid + + DSELegacyPass() : FunctionPass(ID) { + initializeDSELegacyPassPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override { + if (skipFunction(F)) + return false; + + AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults(); + DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + const TargetLibraryInfo &TLI = + getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); + MemorySSA &MSSA = getAnalysis<MemorySSAWrapperPass>().getMSSA(); + PostDominatorTree &PDT = + getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree(); + LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + + bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, TLI, LI); + +#ifdef LLVM_ENABLE_STATS + if (AreStatisticsEnabled()) + for (auto &I : instructions(F)) + NumRemainingStores += isa<StoreInst>(&I); +#endif + + return Changed; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired<AAResultsWrapperPass>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addRequired<PostDominatorTreeWrapperPass>(); + AU.addRequired<MemorySSAWrapperPass>(); + AU.addPreserved<PostDominatorTreeWrapperPass>(); + AU.addPreserved<MemorySSAWrapperPass>(); + AU.addRequired<LoopInfoWrapperPass>(); + AU.addPreserved<LoopInfoWrapperPass>(); + AU.addRequired<AssumptionCacheTracker>(); + } +}; + +} // end anonymous namespace + +char DSELegacyPass::ID = 0; + +INITIALIZE_PASS_BEGIN(DSELegacyPass, "dse", "Dead Store Elimination", false, + false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_END(DSELegacyPass, "dse", "Dead Store Elimination", false, + false) + +namespace llvm { +LLVM_ABI FunctionPass *createDeadStoreEliminationPass() { + return new DSELegacyPass(); +} +} // namespace llvm diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp index 7704e49..4baa3b3 100644 --- a/llvm/lib/Transforms/Scalar/GVN.cpp +++ b/llvm/lib/Transforms/Scalar/GVN.cpp @@ -978,7 +978,7 @@ static bool IsValueFullyAvailableInBlock( unsigned NumNewNewSpeculativelyAvailableBBs = 0; #ifndef NDEBUG - SmallSet<BasicBlock *, 32> NewSpeculativelyAvailableBBs; + SmallPtrSet<BasicBlock *, 32> NewSpeculativelyAvailableBBs; SmallVector<BasicBlock *, 32> AvailableBBs; #endif @@ -1222,7 +1222,7 @@ static bool liesBetween(const Instruction *From, Instruction *Between, const Instruction *To, const DominatorTree *DT) { if (From->getParent() == Between->getParent()) return DT->dominates(From, Between); - SmallSet<BasicBlock *, 1> Exclusion; + SmallPtrSet<BasicBlock *, 1> Exclusion; Exclusion.insert(Between->getParent()); return !isPotentiallyReachable(From, To, &Exclusion, DT); } diff --git a/llvm/lib/Transforms/Scalar/GuardWidening.cpp b/llvm/lib/Transforms/Scalar/GuardWidening.cpp index 3ba5b79..d99f1eb 100644 --- 
a/llvm/lib/Transforms/Scalar/GuardWidening.cpp +++ b/llvm/lib/Transforms/Scalar/GuardWidening.cpp @@ -642,9 +642,9 @@ Value *GuardWideningImpl::freezeAndPush(Value *Orig, return FI; } - SmallSet<Value *, 16> Visited; + SmallPtrSet<Value *, 16> Visited; SmallVector<Value *, 16> Worklist; - SmallSet<Instruction *, 16> DropPoisonFlags; + SmallPtrSet<Instruction *, 16> DropPoisonFlags; SmallVector<Value *, 16> NeedFreeze; DenseMap<Value *, FreezeInst *> CacheOfFreezes; diff --git a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp index 334c911..6720cb1 100644 --- a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -1613,7 +1613,7 @@ bool IndVarSimplify::optimizeLoopExits(Loop *L, SCEVExpander &Rewriter) { if (CurrMaxExit == MaxBECount) SkipLastIter = true; }; - SmallSet<const SCEV *, 8> DominatingExactExitCounts; + SmallPtrSet<const SCEV *, 8> DominatingExactExitCounts; for (BasicBlock *ExitingBB : ExitingBlocks) { const SCEV *ExactExitCount = SE->getExitCount(L, ExitingBB); const SCEV *MaxExitCount = SE->getExitCount( diff --git a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp index 85ee824..a097d33 100644 --- a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp +++ b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp @@ -434,7 +434,7 @@ bool InferAddressSpacesImpl::rewriteIntrinsicOperands(IntrinsicInst *II, NewV = NewV->stripPointerCasts(); Function *NewDecl = Intrinsic::getOrInsertDeclaration( M, II->getIntrinsicID(), {NewV->getType()}); - II->setArgOperand(1, NewV); + II->setArgOperand(0, NewV); II->setCalledFunction(NewDecl); return true; } @@ -491,7 +491,7 @@ void InferAddressSpacesImpl::collectRewritableIntrinsicOperands( } case Intrinsic::lifetime_start: case Intrinsic::lifetime_end: { - appendsFlatAddressExpressionToPostorderStack(II->getArgOperand(1), + appendsFlatAddressExpressionToPostorderStack(II->getArgOperand(0), PostorderStack, Visited); break; } diff --git a/llvm/lib/Transforms/Scalar/InferAlignment.cpp b/llvm/lib/Transforms/Scalar/InferAlignment.cpp index 0ddc231..e9bf59c 100644 --- a/llvm/lib/Transforms/Scalar/InferAlignment.cpp +++ b/llvm/lib/Transforms/Scalar/InferAlignment.cpp @@ -58,14 +58,55 @@ bool inferAlignment(Function &F, AssumptionCache &AC, DominatorTree &DT) { } // Compute alignment from known bits. + auto InferFromKnownBits = [&](Instruction &I, Value *PtrOp) { + KnownBits Known = computeKnownBits(PtrOp, DL, &AC, &I, &DT); + unsigned TrailZ = + std::min(Known.countMinTrailingZeros(), +Value::MaxAlignmentExponent); + return Align(1ull << std::min(Known.getBitWidth() - 1, TrailZ)); + }; + + // Propagate alignment between loads and stores that originate from the + // same base pointer. + DenseMap<Value *, Align> BestBasePointerAligns; + auto InferFromBasePointer = [&](Value *PtrOp, Align LoadStoreAlign) { + APInt OffsetFromBase(DL.getIndexTypeSizeInBits(PtrOp->getType()), 0); + PtrOp = PtrOp->stripAndAccumulateConstantOffsets(DL, OffsetFromBase, true); + // Derive the base pointer alignment from the load/store alignment + // and the offset from the base pointer. 
+ Align BasePointerAlign = + commonAlignment(LoadStoreAlign, OffsetFromBase.getLimitedValue()); + + auto [It, Inserted] = + BestBasePointerAligns.try_emplace(PtrOp, BasePointerAlign); + if (!Inserted) { + // If the stored base pointer alignment is better than the + // base pointer alignment we derived, we may be able to use it + // to improve the load/store alignment. If not, store the + // improved base pointer alignment for future iterations. + if (It->second > BasePointerAlign) { + Align BetterLoadStoreAlign = + commonAlignment(It->second, OffsetFromBase.getLimitedValue()); + return BetterLoadStoreAlign; + } + It->second = BasePointerAlign; + } + return LoadStoreAlign; + }; + for (BasicBlock &BB : F) { + // We need to reset the map for each block because alignment information + // can only be propagated from instruction A to B if A dominates B. + // This is because control flow (and exception throwing) could be dependent + // on the address (and its alignment) at runtime. Some sort of dominator + // tree approach could be better, but doing a simple forward pass through a + // single basic block is correct too. + BestBasePointerAligns.clear(); + for (Instruction &I : BB) { Changed |= tryToImproveAlign( DL, &I, [&](Value *PtrOp, Align OldAlign, Align PrefAlign) { - KnownBits Known = computeKnownBits(PtrOp, DL, &AC, &I, &DT); - unsigned TrailZ = std::min(Known.countMinTrailingZeros(), - +Value::MaxAlignmentExponent); - return Align(1ull << std::min(Known.getBitWidth() - 1, TrailZ)); + return std::max(InferFromKnownBits(I, PtrOp), + InferFromBasePointer(PtrOp, OldAlign)); }); } } diff --git a/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp b/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp index 7f99cd2..9d915d0 100644 --- a/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp +++ b/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp @@ -7,14 +7,23 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/JumpTableToSwitch.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/CtxProfAnalysis.h" #include "llvm/Analysis/DomTreeUpdater.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/PostDominators.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/ProfDataUtils.h" +#include "llvm/ProfileData/InstrProf.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Error.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include <limits> using namespace llvm; @@ -33,6 +42,8 @@ static cl::opt<unsigned> FunctionSizeThreshold( "or equal than this threshold."), cl::init(50)); +extern cl::opt<bool> ProfcheckDisableMetadataFixes; + #define DEBUG_TYPE "jump-table-to-switch" namespace { @@ -90,9 +101,11 @@ static std::optional<JumpTableTy> parseJumpTable(GetElementPtrInst *GEP, return JumpTable; } -static BasicBlock *expandToSwitch(CallBase *CB, const JumpTableTy &JT, - DomTreeUpdater &DTU, - OptimizationRemarkEmitter &ORE) { +static BasicBlock * +expandToSwitch(CallBase *CB, const JumpTableTy &JT, DomTreeUpdater &DTU, + OptimizationRemarkEmitter &ORE, + llvm::function_ref<GlobalValue::GUID(const Function &)> + GetGuidForFunction) { const bool IsVoid = CB->getType() == Type::getVoidTy(CB->getContext()); SmallVector<DominatorTree::UpdateType, 8> DTUpdates; @@ -115,7 +128,30 @@ static BasicBlock *expandToSwitch(CallBase *CB, const 
JumpTableTy &JT, IRBuilder<> BuilderTail(CB); PHINode *PHI = IsVoid ? nullptr : BuilderTail.CreatePHI(CB->getType(), JT.Funcs.size()); + const auto *ProfMD = CB->getMetadata(LLVMContext::MD_prof); + + SmallVector<uint64_t> BranchWeights; + DenseMap<GlobalValue::GUID, uint64_t> GuidToCounter; + const bool HadProfile = isValueProfileMD(ProfMD); + if (HadProfile) { + // The assumptions, coming in, are that the functions in JT.Funcs are + // defined in this module (from parseJumpTable). + assert(llvm::all_of( + JT.Funcs, [](const Function *F) { return F && !F->isDeclaration(); })); + BranchWeights.reserve(JT.Funcs.size() + 1); + // The first is the default target, which is the unreachable block created + // above. + BranchWeights.push_back(0U); + uint64_t TotalCount = 0; + auto Targets = getValueProfDataFromInst( + *CB, InstrProfValueKind::IPVK_IndirectCallTarget, + std::numeric_limits<uint32_t>::max(), TotalCount); + for (const auto &[G, C] : Targets) { + [[maybe_unused]] auto It = GuidToCounter.insert({G, C}); + assert(It.second); + } + } for (auto [Index, Func] : llvm::enumerate(JT.Funcs)) { BasicBlock *B = BasicBlock::Create(Func->getContext(), "call." + Twine(Index), &F, Tail); @@ -123,10 +159,19 @@ static BasicBlock *expandToSwitch(CallBase *CB, const JumpTableTy &JT, DTUpdates.push_back({DominatorTree::Insert, B, Tail}); CallBase *Call = cast<CallBase>(CB->clone()); + // The MD_prof metadata (VP kind), if it existed, can be dropped, it doesn't + // make sense on a direct call. Note that the values are used for the branch + // weights of the switch. + Call->setMetadata(LLVMContext::MD_prof, nullptr); Call->setCalledFunction(Func); Call->insertInto(B, B->end()); Switch->addCase( cast<ConstantInt>(ConstantInt::get(JT.Index->getType(), Index)), B); + GlobalValue::GUID FctID = GetGuidForFunction(*Func); + // It'd be OK to _not_ find target functions in GuidToCounter, e.g. suppose + // just some of the jump targets are taken (for the given profile). + BranchWeights.push_back(FctID == 0U ? 0U + : GuidToCounter.lookup_or(FctID, 0U)); BranchInst::Create(Tail, B); if (PHI) PHI->addIncoming(Call, B); @@ -136,6 +181,13 @@ static BasicBlock *expandToSwitch(CallBase *CB, const JumpTableTy &JT, return OptimizationRemark(DEBUG_TYPE, "ReplacedJumpTableWithSwitch", CB) << "expanded indirect call into switch"; }); + if (HadProfile && !ProfcheckDisableMetadataFixes) { + // At least one of the targets must've been taken. 
+ assert(llvm::any_of(BranchWeights, [](uint64_t V) { return V != 0; })); + setBranchWeights(*Switch, downscaleWeights(BranchWeights), + /*IsExpected=*/false); + } else + setExplicitlyUnknownBranchWeights(*Switch); if (PHI) CB->replaceAllUsesWith(PHI); CB->eraseFromParent(); @@ -150,6 +202,15 @@ PreservedAnalyses JumpTableToSwitchPass::run(Function &F, PostDominatorTree *PDT = AM.getCachedResult<PostDominatorTreeAnalysis>(F); DomTreeUpdater DTU(DT, PDT, DomTreeUpdater::UpdateStrategy::Lazy); bool Changed = false; + InstrProfSymtab Symtab; + if (auto E = Symtab.create(*F.getParent())) + F.getContext().emitError( + "Could not create indirect call table, likely corrupted IR" + + toString(std::move(E))); + DenseMap<const Function *, GlobalValue::GUID> FToGuid; + for (const auto &[G, FPtr] : Symtab.getIDToNameMap()) + FToGuid.insert({FPtr, G}); + for (BasicBlock &BB : make_early_inc_range(F)) { BasicBlock *CurrentBB = &BB; while (CurrentBB) { @@ -170,7 +231,12 @@ PreservedAnalyses JumpTableToSwitchPass::run(Function &F, std::optional<JumpTableTy> JumpTable = parseJumpTable(GEP, PtrTy); if (!JumpTable) continue; - SplittedOutTail = expandToSwitch(Call, *JumpTable, DTU, ORE); + SplittedOutTail = expandToSwitch( + Call, *JumpTable, DTU, ORE, [&](const Function &Fct) { + if (Fct.getMetadata(AssignGUIDPass::GUIDMetadataName)) + return AssignGUIDPass::getGUID(Fct); + return FToGuid.lookup_or(&Fct, 0U); + }); Changed = true; break; } diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp index c3f80f9..e157cc9 100644 --- a/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/llvm/lib/Transforms/Scalar/LICM.cpp @@ -169,6 +169,8 @@ cl::opt<unsigned> llvm::SetLicmMssaNoAccForPromotionCap( "number of accesses allowed to be present in a loop in order to " "enable memory promotion.")); +extern cl::opt<bool> ProfcheckDisableMetadataFixes; + static bool inSubLoop(BasicBlock *BB, Loop *CurLoop, LoopInfo *LI); static bool isNotUsedOrFoldableInLoop(const Instruction &I, const Loop *CurLoop, const LoopSafetyInfo *SafetyInfo, @@ -472,7 +474,7 @@ bool LoopInvariantCodeMotion::runOnLoop(Loop *L, AAResults *AA, LoopInfo *LI, if (Preheader) Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, AC, TLI, L, MSSAU, SE, &SafetyInfo, Flags, ORE, LoopNestMode, - LicmAllowSpeculation); + LicmAllowSpeculation, HasCoroSuspendInst); // Now that all loop invariants have been removed from the loop, promote any // memory references to scalars that we can. @@ -857,9 +859,18 @@ public: } // Now finally clone BI. - ReplaceInstWithInst( - HoistTarget->getTerminator(), - BranchInst::Create(HoistTrueDest, HoistFalseDest, BI->getCondition())); + auto *NewBI = + BranchInst::Create(HoistTrueDest, HoistFalseDest, BI->getCondition(), + HoistTarget->getTerminator()->getIterator()); + HoistTarget->getTerminator()->eraseFromParent(); + // md_prof should also come from the original branch - since the + // condition was hoisted, the branch probabilities shouldn't change. + if (!ProfcheckDisableMetadataFixes) + NewBI->copyMetadata(*BI, {LLVMContext::MD_prof}); + // FIXME: Issue #152767: debug info should also be the same as the + // original branch, **if** the user explicitly indicated that. 
+ NewBI->setDebugLoc(HoistTarget->getTerminator()->getDebugLoc()); + ++NumClonedBranches; assert(CurLoop->getLoopPreheader() && @@ -881,7 +892,7 @@ bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI, ICFLoopSafetyInfo *SafetyInfo, SinkAndHoistLICMFlags &Flags, OptimizationRemarkEmitter *ORE, bool LoopNestMode, - bool AllowSpeculation) { + bool AllowSpeculation, bool HasCoroSuspendInst) { // Verify inputs. assert(N != nullptr && AA != nullptr && LI != nullptr && DT != nullptr && CurLoop != nullptr && SafetyInfo != nullptr && @@ -914,11 +925,11 @@ bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI, // TODO: It may be safe to hoist if we are hoisting to a conditional block // and we have accurately duplicated the control flow from the loop header // to that block. - if (CurLoop->hasLoopInvariantOperands(&I) && + if (CurLoop->hasLoopInvariantOperands(&I, HasCoroSuspendInst) && canSinkOrHoistInst(I, AA, DT, CurLoop, MSSAU, true, Flags, ORE) && - isSafeToExecuteUnconditionally( - I, DT, TLI, CurLoop, SafetyInfo, ORE, - Preheader->getTerminator(), AC, AllowSpeculation)) { + isSafeToExecuteUnconditionally(I, DT, TLI, CurLoop, SafetyInfo, ORE, + Preheader->getTerminator(), AC, + AllowSpeculation)) { hoist(I, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), SafetyInfo, MSSAU, SE, ORE); HoistedInstructions.push_back(&I); @@ -964,7 +975,7 @@ bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI, SafetyInfo->doesNotWriteMemoryBefore(I, CurLoop); }; if ((IsInvariantStart(I) || isGuard(&I)) && - CurLoop->hasLoopInvariantOperands(&I) && + CurLoop->hasLoopInvariantOperands(&I, HasCoroSuspendInst) && MustExecuteWithoutWritesBefore(I)) { hoist(I, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), SafetyInfo, MSSAU, SE, ORE); @@ -1230,11 +1241,16 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT, if (Behavior.doesNotAccessMemory()) return true; if (Behavior.onlyReadsMemory()) { + // Might have stale MemoryDef for call that was later inferred to be + // read-only. + auto *MU = dyn_cast<MemoryUse>(MSSA->getMemoryAccess(CI)); + if (!MU) + return false; + // If we can prove there are no writes to the memory read by the call, we // can hoist or sink. return !pointerInvalidatedByLoop( - MSSA, cast<MemoryUse>(MSSA->getMemoryAccess(CI)), CurLoop, I, Flags, - /*InvariantGroup=*/false); + MSSA, MU, CurLoop, I, Flags, /*InvariantGroup=*/false); } if (Behavior.onlyWritesMemory()) { @@ -1688,8 +1704,12 @@ static void hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop, // The check on hasMetadataOtherThanDebugLoc is to prevent us from burning // time in isGuaranteedToExecute if we don't actually have anything to // drop. It is a compile time optimization, not required for correctness. - !SafetyInfo->isGuaranteedToExecute(I, DT, CurLoop)) - I.dropUBImplyingAttrsAndMetadata(); + !SafetyInfo->isGuaranteedToExecute(I, DT, CurLoop)) { + if (ProfcheckDisableMetadataFixes) + I.dropUBImplyingAttrsAndMetadata(); + else + I.dropUBImplyingAttrsAndMetadata({LLVMContext::MD_prof}); + } if (isa<PHINode>(I)) // Move the new node to the end of the phi list in the destination block. @@ -2856,7 +2876,7 @@ static bool hoistBOAssociation(Instruction &I, Loop &L, bool LVInRHS = L.isLoopInvariant(BO->getOperand(0)); auto *BO0 = dyn_cast<BinaryOperator>(BO->getOperand(LVInRHS)); if (!BO0 || BO0->getOpcode() != Opcode || !BO0->isAssociative() || - BO0->hasNUsesOrMore(3)) + BO0->hasNUsesOrMore(BO0->getType()->isIntegerTy() ? 
2 : 3)) return false; Value *LV = BO0->getOperand(0); diff --git a/llvm/lib/Transforms/Scalar/LoopDistribute.cpp b/llvm/lib/Transforms/Scalar/LoopDistribute.cpp index 0ac1a15..27d3004 100644 --- a/llvm/lib/Transforms/Scalar/LoopDistribute.cpp +++ b/llvm/lib/Transforms/Scalar/LoopDistribute.cpp @@ -502,8 +502,10 @@ public: SmallVector<int, 8> PtrToPartitions(N); for (unsigned I = 0; I < N; ++I) { Value *Ptr = RtPtrCheck->Pointers[I].PointerValue; - auto Instructions = - LAI.getInstructionsForAccess(Ptr, RtPtrCheck->Pointers[I].IsWritePtr); + auto Instructions = LAI.getInstructionsForAccess(Ptr, /* IsWrite */ true); + auto ReadInstructions = + LAI.getInstructionsForAccess(Ptr, /* IsWrite */ false); + Instructions.append(ReadInstructions.begin(), ReadInstructions.end()); int &Partition = PtrToPartitions[I]; // First set it to uninitialized. diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp index b3bffeb..5795c76 100644 --- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp +++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp @@ -263,6 +263,7 @@ static bool isUniformShape(Value *V) { case llvm::Instruction::FPExt: return true; case llvm::Instruction::AddrSpaceCast: + case CastInst::PtrToAddr: case CastInst::PtrToInt: case CastInst::IntToPtr: return false; @@ -1208,7 +1209,7 @@ public: // // For verification, we keep track of where we changed uses to poison in // PoisonedInsts and then check that we in fact remove them. - SmallSet<Instruction *, 16> PoisonedInsts; + SmallPtrSet<Instruction *, 16> PoisonedInsts; for (auto *Inst : reverse(ToRemove)) { for (Use &U : llvm::make_early_inc_range(Inst->uses())) { if (auto *Poisoned = dyn_cast<Instruction>(U.getUser())) @@ -2166,7 +2167,7 @@ public: // If the loads don't alias the lifetime.end, it won't interfere with // fusion. - MemoryLocation EndLoc = MemoryLocation::getForArgument(End, 1, nullptr); + MemoryLocation EndLoc = MemoryLocation::getForArgument(End, 0, nullptr); if (!EndLoc.Ptr) continue; if (AA->isNoAlias(Load0Loc, EndLoc) && AA->isNoAlias(Load1Loc, EndLoc)) diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 79721dc..e043d07 100644 --- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -915,7 +915,7 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad, // move the bitcast as well, which we don't handle. if (SkippedLifetimeStart) { auto *LifetimeArg = - dyn_cast<Instruction>(SkippedLifetimeStart->getOperand(1)); + dyn_cast<Instruction>(SkippedLifetimeStart->getOperand(0)); if (LifetimeArg && LifetimeArg->getParent() == C->getParent() && C->comesBefore(LifetimeArg)) return false; @@ -1010,7 +1010,7 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad, // Lifetime of srcAlloca ends at lifetime.end. 
if (auto *II = dyn_cast<IntrinsicInst>(&I)) { if (II->getIntrinsicID() == Intrinsic::lifetime_end && - II->getArgOperand(1) == srcAlloca) + II->getArgOperand(0) == srcAlloca) break; } @@ -1393,7 +1393,7 @@ static bool hasUndefContents(MemorySSA *MSSA, BatchAAResults &AA, Value *V, if (auto *II = dyn_cast_or_null<IntrinsicInst>(Def->getMemoryInst())) if (II->getIntrinsicID() == Intrinsic::lifetime_start) if (auto *Alloca = dyn_cast<AllocaInst>(getUnderlyingObject(V))) - return II->getArgOperand(1) == Alloca; + return II->getArgOperand(0) == Alloca; return false; } @@ -1530,7 +1530,7 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store, // to remove them. SmallVector<Instruction *, 4> LifetimeMarkers; - SmallSet<Instruction *, 4> AAMetadataInstrs; + SmallPtrSet<Instruction *, 4> AAMetadataInstrs; bool SrcNotDom = false; auto CaptureTrackingWithModRef = @@ -1540,7 +1540,7 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store, Worklist.push_back(AI); unsigned MaxUsesToExplore = getDefaultMaxUsesToExploreForCaptureTracking(); Worklist.reserve(MaxUsesToExplore); - SmallSet<const Use *, 20> Visited; + SmallPtrSet<const Use *, 20> Visited; while (!Worklist.empty()) { Instruction *I = Worklist.pop_back_val(); for (const Use &U : I->uses()) { diff --git a/llvm/lib/Transforms/Scalar/NewGVN.cpp b/llvm/lib/Transforms/Scalar/NewGVN.cpp index 1a52af1..9d4fb79 100644 --- a/llvm/lib/Transforms/Scalar/NewGVN.cpp +++ b/llvm/lib/Transforms/Scalar/NewGVN.cpp @@ -651,7 +651,7 @@ class NewGVN { BitVector TouchedInstructions; DenseMap<const BasicBlock *, std::pair<unsigned, unsigned>> BlockInstRange; - mutable DenseMap<const IntrinsicInst *, const Value *> PredicateSwapChoice; + mutable DenseMap<const BitCastInst *, const Value *> PredicateSwapChoice; #ifndef NDEBUG // Debugging for how many times each block and instruction got processed. @@ -819,7 +819,7 @@ private: BasicBlock *PHIBlock) const; const Expression *performSymbolicAggrValueEvaluation(Instruction *) const; ExprResult performSymbolicCmpEvaluation(Instruction *) const; - ExprResult performSymbolicPredicateInfoEvaluation(IntrinsicInst *) const; + ExprResult performSymbolicPredicateInfoEvaluation(BitCastInst *) const; // Congruence finding. bool someEquivalentDominates(const Instruction *, const Instruction *) const; @@ -841,7 +841,7 @@ private: unsigned int getRank(const Value *) const; bool shouldSwapOperands(const Value *, const Value *) const; bool shouldSwapOperandsForPredicate(const Value *, const Value *, - const IntrinsicInst *I) const; + const BitCastInst *I) const; // Reachability handling. void updateReachableEdge(BasicBlock *, BasicBlock *); @@ -1013,9 +1013,9 @@ void NewGVN::deleteExpression(const Expression *E) const { // If V is a predicateinfo copy, get the thing it is a copy of. 
static Value *getCopyOf(const Value *V) { - if (auto *II = dyn_cast<IntrinsicInst>(V)) - if (II->getIntrinsicID() == Intrinsic::ssa_copy) - return II->getOperand(0); + if (auto *BC = dyn_cast<BitCastInst>(V)) + if (BC->getType() == BC->getOperand(0)->getType()) + return BC->getOperand(0); return nullptr; } @@ -1535,7 +1535,7 @@ NewGVN::performSymbolicLoadCoercion(Type *LoadType, Value *LoadPtr, if (auto *II = dyn_cast<IntrinsicInst>(DepInst)) { if (II->getIntrinsicID() == Intrinsic::lifetime_start) { - auto *LifetimePtr = II->getOperand(1); + auto *LifetimePtr = II->getOperand(0); if (LoadPtr == lookupOperandLeader(LifetimePtr) || AA->isMustAlias(LoadPtr, LifetimePtr)) return createConstantExpression(UndefValue::get(LoadType)); @@ -1604,7 +1604,7 @@ const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I) const { } NewGVN::ExprResult -NewGVN::performSymbolicPredicateInfoEvaluation(IntrinsicInst *I) const { +NewGVN::performSymbolicPredicateInfoEvaluation(BitCastInst *I) const { auto *PI = PredInfo->getPredicateInfoFor(I); if (!PI) return ExprResult::none(); @@ -1647,13 +1647,8 @@ NewGVN::performSymbolicPredicateInfoEvaluation(IntrinsicInst *I) const { NewGVN::ExprResult NewGVN::performSymbolicCallEvaluation(Instruction *I) const { auto *CI = cast<CallInst>(I); if (auto *II = dyn_cast<IntrinsicInst>(I)) { - // Intrinsics with the returned attribute are copies of arguments. - if (auto *ReturnedValue = II->getReturnedArgOperand()) { - if (II->getIntrinsicID() == Intrinsic::ssa_copy) - if (auto Res = performSymbolicPredicateInfoEvaluation(II)) - return Res; + if (auto *ReturnedValue = II->getReturnedArgOperand()) return ExprResult::some(createVariableOrConstant(ReturnedValue)); - } } // FIXME: Currently the calls which may access the thread id may @@ -2032,6 +2027,12 @@ NewGVN::performSymbolicEvaluation(Instruction *I, E = performSymbolicLoadEvaluation(I); break; case Instruction::BitCast: + // Intrinsics with the returned attribute are copies of arguments. + if (I->getType() == I->getOperand(0)->getType()) + if (auto Res = + performSymbolicPredicateInfoEvaluation(cast<BitCastInst>(I))) + return Res; + [[fallthrough]]; case Instruction::AddrSpaceCast: case Instruction::Freeze: return createExpression(I); @@ -4075,8 +4076,7 @@ bool NewGVN::eliminateInstructions(Function &F) { if (DominatingLeader != Def) { // Even if the instruction is removed, we still need to update // flags/metadata due to downstreams users of the leader. - if (!match(DefI, m_Intrinsic<Intrinsic::ssa_copy>())) - patchReplacementInstruction(DefI, DominatingLeader); + patchReplacementInstruction(DefI, DominatingLeader); SmallVector<DbgVariableRecord *> DVRUsers; findDbgUsers(DefI, DVRUsers); @@ -4116,10 +4116,14 @@ bool NewGVN::eliminateInstructions(Function &F) { Value *DominatingLeader = EliminationStack.back(); - auto *II = dyn_cast<IntrinsicInst>(DominatingLeader); - bool isSSACopy = II && II->getIntrinsicID() == Intrinsic::ssa_copy; - if (isSSACopy) - DominatingLeader = II->getOperand(0); + Instruction *SSACopy = nullptr; + if (auto *BC = dyn_cast<BitCastInst>(DominatingLeader)) { + if (BC->getType() == BC->getOperand(0)->getType() && + PredInfo->getPredicateInfoFor(DominatingLeader)) { + SSACopy = BC; + DominatingLeader = BC->getOperand(0); + } + } // Don't replace our existing users with ourselves. 
if (U->get() == DominatingLeader) @@ -4145,12 +4149,12 @@ bool NewGVN::eliminateInstructions(Function &F) { ProbablyDead.erase(cast<Instruction>(DominatingLeader)); // For copy instructions, we use their operand as a leader, // which means we remove a user of the copy and it may become dead. - if (isSSACopy) { - auto It = UseCounts.find(II); + if (SSACopy) { + auto It = UseCounts.find(SSACopy); if (It != UseCounts.end()) { unsigned &IIUseCount = It->second; if (--IIUseCount == 0) - ProbablyDead.insert(II); + ProbablyDead.insert(SSACopy); } } ++LeaderUseCount; @@ -4251,7 +4255,7 @@ bool NewGVN::shouldSwapOperands(const Value *A, const Value *B) const { } bool NewGVN::shouldSwapOperandsForPredicate(const Value *A, const Value *B, - const IntrinsicInst *I) const { + const BitCastInst *I) const { if (shouldSwapOperands(A, B)) { PredicateSwapChoice[I] = B; return true; diff --git a/llvm/lib/Transforms/Scalar/Reassociate.cpp b/llvm/lib/Transforms/Scalar/Reassociate.cpp index 343da5b2..ba58b8e 100644 --- a/llvm/lib/Transforms/Scalar/Reassociate.cpp +++ b/llvm/lib/Transforms/Scalar/Reassociate.cpp @@ -878,7 +878,7 @@ static Value *NegateValue(Value *V, Instruction *BI, // only that it mostly looks like one. static bool isLoadCombineCandidate(Instruction *Or) { SmallVector<Instruction *, 8> Worklist; - SmallSet<Instruction *, 8> Visited; + SmallPtrSet<Instruction *, 8> Visited; auto Enqueue = [&](Value *V) { auto *I = dyn_cast<Instruction>(V); diff --git a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp index d9805d8..8b15445 100644 --- a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp +++ b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp @@ -2309,8 +2309,9 @@ chainToBasePointerCost(SmallVectorImpl<Instruction *> &Chain, } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Instr)) { // Cost of the address calculation - Type *ValTy = GEP->getSourceElementType(); - Cost += TTI.getAddressComputationCost(ValTy); + Cost += TTI.getAddressComputationCost( + GEP->getType(), nullptr, nullptr, + TargetTransformInfo::TCK_SizeAndLatency); // And cost of the GEP itself // TODO: Use TTI->getGEPCost here (it exists, but appears to be not diff --git a/llvm/lib/Transforms/Scalar/SCCP.cpp b/llvm/lib/Transforms/Scalar/SCCP.cpp index 8be2f78..feee794 100644 --- a/llvm/lib/Transforms/Scalar/SCCP.cpp +++ b/llvm/lib/Transforms/Scalar/SCCP.cpp @@ -20,6 +20,7 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/DomTreeUpdater.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/TargetLibraryInfo.h" @@ -31,6 +32,7 @@ #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" @@ -53,12 +55,15 @@ STATISTIC(NumInstReplaced, // runSCCP() - Run the Sparse Conditional Constant Propagation algorithm, // and return true if the function was modified. 
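Both the NewGVN hunks above and the SCCP hunks that follow assume the same new convention from PredicateInfo: the ssa_copy intrinsic markers are now materialized as no-op bitcasts (source and destination types identical) rather than llvm.ssa.copy calls. A hedged sketch of the recognition idiom these passes share; the helper name is illustrative and not taken from any of these files:

// Sketch: is this instruction a PredicateInfo copy under the new
// bitcast-based representation, and if so, what value does it stand for?
#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/Utils/PredicateInfo.h"

static llvm::Value *getPredicateCopyOperand(llvm::Instruction *I,
                                            llvm::PredicateInfo &PredInfo) {
  using namespace llvm;
  auto *BC = dyn_cast<BitCastInst>(I);
  if (!BC || BC->getType() != BC->getOperand(0)->getType())
    return nullptr;          // not a no-op bitcast
  if (!PredInfo.getPredicateInfoFor(BC))
    return nullptr;          // an ordinary bitcast, not a predicate copy
  return BC->getOperand(0);  // the value the copy stands for
}

The extra getPredicateInfoFor check matters because a same-type bitcast can in principle appear for other reasons; only the ones registered by PredicateInfo carry a constraint.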
static bool runSCCP(Function &F, const DataLayout &DL, - const TargetLibraryInfo *TLI, DomTreeUpdater &DTU) { + const TargetLibraryInfo *TLI, DominatorTree &DT, + AssumptionCache &AC) { LLVM_DEBUG(dbgs() << "SCCP on function '" << F.getName() << "'\n"); SCCPSolver Solver( DL, [TLI](Function &F) -> const TargetLibraryInfo & { return *TLI; }, F.getContext()); + Solver.addPredicateInfo(F, DT, AC); + // While we don't do any actual inter-procedural analysis, still track // return values so we can infer attributes. if (canTrackReturnsInterprocedurally(&F)) @@ -101,6 +106,7 @@ static bool runSCCP(Function &F, const DataLayout &DL, } // Remove unreachable blocks and non-feasible edges. + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); for (BasicBlock *DeadBB : BlocksToErase) NumInstRemoved += changeToUnreachable(&*DeadBB->getFirstNonPHIIt(), /*PreserveLCSSA=*/false, &DTU); @@ -113,6 +119,8 @@ static bool runSCCP(Function &F, const DataLayout &DL, if (!DeadBB->hasAddressTaken()) DTU.deleteBB(DeadBB); + Solver.removeSSACopies(F); + Solver.inferReturnAttributes(); return MadeChanges; @@ -121,9 +129,9 @@ static bool runSCCP(Function &F, const DataLayout &DL, PreservedAnalyses SCCPPass::run(Function &F, FunctionAnalysisManager &AM) { const DataLayout &DL = F.getDataLayout(); auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); - auto *DT = AM.getCachedResult<DominatorTreeAnalysis>(F); - DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); - if (!runSCCP(F, DL, &TLI, DTU)) + auto &DT = AM.getResult<DominatorTreeAnalysis>(F); + auto &AC = AM.getResult<AssumptionAnalysis>(F); + if (!runSCCP(F, DL, &TLI, DT, AC)) return PreservedAnalyses::all(); auto PA = PreservedAnalyses(); diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index 03d9f32..06a92bd 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -320,15 +320,6 @@ static DebugVariable getAggregateVariable(DbgVariableRecord *DVR) { DVR->getDebugLoc().getInlinedAt()); } -DbgVariableRecord *UnwrapDbgInstPtr(DbgInstPtr P, DbgVariableRecord *Unused) { - (void)Unused; - return static_cast<DbgVariableRecord *>(cast<DbgRecord *>(P)); -} -DbgAssignIntrinsic *UnwrapDbgInstPtr(DbgInstPtr P, DbgAssignIntrinsic *Unused) { - (void)Unused; - return static_cast<DbgAssignIntrinsic *>(cast<Instruction *>(P)); -} - /// Find linked dbg.assign and generate a new one with the correct /// FragmentInfo. Link Inst to the new dbg.assign. If Value is nullptr the /// value component is copied from the old dbg.assign to the new. @@ -348,10 +339,9 @@ static void migrateDebugInfo(AllocaInst *OldAlloca, bool IsSplit, uint64_t SliceSizeInBits, Instruction *OldInst, Instruction *Inst, Value *Dest, Value *Value, const DataLayout &DL) { - auto MarkerRange = at::getAssignmentMarkers(OldInst); auto DVRAssignMarkerRange = at::getDVRAssignmentMarkers(OldInst); // Nothing to do if OldInst has no linked dbg.assign intrinsics. - if (MarkerRange.empty() && DVRAssignMarkerRange.empty()) + if (DVRAssignMarkerRange.empty()) return; LLVM_DEBUG(dbgs() << " migrateDebugInfo\n"); @@ -435,11 +425,10 @@ static void migrateDebugInfo(AllocaInst *OldAlloca, bool IsSplit, } ::Value *NewValue = Value ? 
Value : DbgAssign->getValue(); - auto *NewAssign = UnwrapDbgInstPtr( + DbgVariableRecord *NewAssign = cast<DbgVariableRecord>(cast<DbgRecord *>( DIB.insertDbgAssign(Inst, NewValue, DbgAssign->getVariable(), Expr, Dest, DIExpression::get(Expr->getContext(), {}), - DbgAssign->getDebugLoc()), - DbgAssign); + DbgAssign->getDebugLoc()))); // If we've updated the value but the original dbg.assign has an arglist // then kill it now - we can't use the requested new value. @@ -1260,10 +1249,7 @@ private: return PI.setAborted(&II); if (II.isLifetimeStartOrEnd()) { - ConstantInt *Length = cast<ConstantInt>(II.getArgOperand(0)); - uint64_t Size = std::min(AllocSize - Offset.getLimitedValue(), - Length->getLimitedValue()); - insertUse(II, Offset, Size, true); + insertUse(II, Offset, AllocSize, true); return; } @@ -3235,8 +3221,7 @@ private: // In theory we should call migrateDebugInfo here. However, we do not // emit dbg.assign intrinsics for mem intrinsics storing through non- // constant geps, or storing a variable number of bytes. - assert(at::getAssignmentMarkers(&II).empty() && - at::getDVRAssignmentMarkers(&II).empty() && + assert(at::getDVRAssignmentMarkers(&II).empty() && "AT: Unexpected link to non-const GEP"); deleteIfTriviallyDead(OldPtr); return false; @@ -3385,13 +3370,11 @@ private: Value *AdjustedPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType()); if (IsDest) { // Update the address component of linked dbg.assigns. - auto UpdateAssignAddress = [&](auto *DbgAssign) { + for (DbgVariableRecord *DbgAssign : at::getDVRAssignmentMarkers(&II)) { if (llvm::is_contained(DbgAssign->location_ops(), II.getDest()) || DbgAssign->getAddress() == II.getDest()) DbgAssign->replaceVariableLocationOp(II.getDest(), AdjustedPtr); - }; - for_each(at::getAssignmentMarkers(&II), UpdateAssignAddress); - for_each(at::getDVRAssignmentMarkers(&II), UpdateAssignAddress); + } II.setDest(AdjustedPtr); II.setDestAlignment(SliceAlign); } else { @@ -3614,30 +3597,14 @@ private: return true; } - assert(II.getArgOperand(1) == OldPtr); - // Lifetime intrinsics are only promotable if they cover the whole alloca. - // Therefore, we drop lifetime intrinsics which don't cover the whole - // alloca. - // (In theory, intrinsics which partially cover an alloca could be - // promoted, but PromoteMemToReg doesn't handle that case.) - // FIXME: Check whether the alloca is promotable before dropping the - // lifetime intrinsics? - if (NewBeginOffset != NewAllocaBeginOffset || - NewEndOffset != NewAllocaEndOffset) - return true; - - ConstantInt *Size = - ConstantInt::get(cast<IntegerType>(II.getArgOperand(0)->getType()), - NewEndOffset - NewBeginOffset); - // Lifetime intrinsics always expect an i8* so directly get such a pointer - // for the new alloca slice. 
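This SROA hunk, like the MemCpyOpt, CodeExtractor, InlineFunction, Local.cpp and MemoryTaggingSupport hunks elsewhere in this diff, assumes the newer lifetime intrinsic form in which the explicit size operand is gone and the pointer is operand 0. A minimal sketch of locating a lifetime.end for a given alloca under that assumption; the helper name is illustrative:

// Sketch only: assumes the single-operand lifetime form, i.e.
//   call void @llvm.lifetime.end.p0(ptr %alloca)
// rather than the older (i64 size, ptr) form.
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"

static bool isLifetimeEndOf(const llvm::Instruction &I,
                            const llvm::AllocaInst *AI) {
  using namespace llvm;
  if (auto *II = dyn_cast<IntrinsicInst>(&I))
    if (II->getIntrinsicID() == Intrinsic::lifetime_end)
      return II->getArgOperand(0) == AI; // pointer is now operand 0
  return false;
}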
+ assert(II.getArgOperand(0) == OldPtr); Type *PointerTy = IRB.getPtrTy(OldPtr->getType()->getPointerAddressSpace()); Value *Ptr = getNewAllocaSlicePtr(IRB, PointerTy); Value *New; if (II.getIntrinsicID() == Intrinsic::lifetime_start) - New = IRB.CreateLifetimeStart(Ptr, Size); + New = IRB.CreateLifetimeStart(Ptr); else - New = IRB.CreateLifetimeEnd(Ptr, Size); + New = IRB.CreateLifetimeEnd(Ptr); (void)New; LLVM_DEBUG(dbgs() << " to: " << *New << "\n"); @@ -4005,8 +3972,7 @@ private: Store->getPointerOperand(), Store->getValueOperand(), DL); } else { - assert(at::getAssignmentMarkers(Store).empty() && - at::getDVRAssignmentMarkers(Store).empty() && + assert(at::getDVRAssignmentMarkers(Store).empty() && "AT: unexpected debug.assign linked to store through " "unbounded GEP"); } diff --git a/llvm/lib/Transforms/Scalar/Scalar.cpp b/llvm/lib/Transforms/Scalar/Scalar.cpp index c7e4a3e..032a3a7 100644 --- a/llvm/lib/Transforms/Scalar/Scalar.cpp +++ b/llvm/lib/Transforms/Scalar/Scalar.cpp @@ -37,6 +37,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeMergeICmpsLegacyPassPass(Registry); initializeNaryReassociateLegacyPassPass(Registry); initializePartiallyInlineLibCallsLegacyPassPass(Registry); + initializeDSELegacyPassPass(Registry); initializeReassociateLegacyPassPass(Registry); initializeScalarizeMaskedMemIntrinLegacyPassPass(Registry); initializeSROALegacyPassPass(Registry); diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp index 6ffe841..fc96589 100644 --- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp +++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp @@ -294,6 +294,10 @@ private: bool CanTraceInto(bool SignExtended, bool ZeroExtended, BinaryOperator *BO, bool NonNegative); + /// Analyze XOR instruction to extract disjoint constant bits that behave + /// like addition operations for improved address mode folding. + APInt extractDisjointBitsFromXor(BinaryOperator *XorInst); + /// The path from the constant offset to the old GEP index. e.g., if the GEP /// index is "a * b + (c + 5)". After running function find, UserChain[0] will /// be the constant 5, UserChain[1] will be the subexpression "c + 5", and @@ -596,6 +600,9 @@ APInt ConstantOffsetExtractor::find(Value *V, bool SignExtended, // Trace into subexpressions for more hoisting opportunities. if (CanTraceInto(SignExtended, ZeroExtended, BO, NonNegative)) ConstantOffset = findInEitherOperand(BO, SignExtended, ZeroExtended); + // Handle XOR with disjoint bits that can be treated as addition. + else if (BO->getOpcode() == Instruction::Xor) + ConstantOffset = extractDisjointBitsFromXor(BO); } else if (isa<TruncInst>(V)) { ConstantOffset = find(U->getOperand(0), SignExtended, ZeroExtended, NonNegative) @@ -708,11 +715,20 @@ Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) { Value *NextInChain = removeConstOffset(ChainIndex - 1); Value *TheOther = BO->getOperand(1 - OpNo); - // If NextInChain is 0 and not the LHS of a sub, we can simplify the - // sub-expression to be just TheOther. if (ConstantInt *CI = dyn_cast<ConstantInt>(NextInChain)) { - if (CI->isZero() && !(BO->getOpcode() == Instruction::Sub && OpNo == 0)) - return TheOther; + if (CI->isZero()) { + // Custom XOR handling for disjoint bits - preserves original XOR + // with non-disjoint constant bits. + // TODO: The design should be updated to support partial constant + // extraction. 
+ if (BO->getOpcode() == Instruction::Xor) + return BO; + + // If NextInChain is 0 and not the LHS of a sub, we can simplify the + // sub-expression to be just TheOther. + if (!(BO->getOpcode() == Instruction::Sub && OpNo == 0)) + return TheOther; + } } BinaryOperator::BinaryOps NewOp = BO->getOpcode(); @@ -743,6 +759,67 @@ Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) { return NewBO; } +/// Analyze XOR instruction to extract disjoint constant bits for address +/// folding +/// +/// This function identifies bits in an XOR constant operand that are disjoint +/// from the base operand's known set bits. For these disjoint bits, XOR behaves +/// identically to addition, allowing us to extract them as constant offsets +/// that can be folded into addressing modes. +/// +/// Transformation: `Base ^ Const` becomes `(Base ^ NonDisjointBits) + +/// DisjointBits` where DisjointBits = Const & KnownZeros(Base) +/// +/// Example with ptr having known-zero low bit: +/// Original: `xor %ptr, 3` ; 3 = 0b11 +/// Analysis: DisjointBits = 3 & KnownZeros(%ptr) = 0b11 & 0b01 = 0b01 +/// Result: `(xor %ptr, 2) + 1` where 1 can be folded into address mode +/// +/// \param XorInst The XOR binary operator to analyze +/// \return APInt containing the disjoint bits that can be extracted as offset, +/// or zero if no disjoint bits exist +APInt ConstantOffsetExtractor::extractDisjointBitsFromXor( + BinaryOperator *XorInst) { + assert(XorInst && XorInst->getOpcode() == Instruction::Xor && + "Expected XOR instruction"); + + const unsigned BitWidth = XorInst->getType()->getScalarSizeInBits(); + Value *BaseOperand; + ConstantInt *XorConstant; + + // Match pattern: xor BaseOperand, Constant. + if (!match(XorInst, m_Xor(m_Value(BaseOperand), m_ConstantInt(XorConstant)))) + return APInt::getZero(BitWidth); + + // Compute known bits for the base operand. + const SimplifyQuery SQ(DL); + const KnownBits BaseKnownBits = computeKnownBits(BaseOperand, SQ); + const APInt &ConstantValue = XorConstant->getValue(); + + // Identify disjoint bits: constant bits that are known zero in base. + const APInt DisjointBits = ConstantValue & BaseKnownBits.Zero; + + // Early exit if no disjoint bits found. + if (DisjointBits.isZero()) + return APInt::getZero(BitWidth); + + // Compute the remaining non-disjoint bits that stay in the XOR. + const APInt NonDisjointBits = ConstantValue & ~DisjointBits; + + // FIXME: Enhance XOR constant extraction to handle nested binary operations. + // Currently we only extract disjoint bits from the immediate XOR constant, + // but we could recursively process cases like: + // xor (add %base, C1), C2 -> add %base, (C1 ^ disjoint_bits(C2)) + // This requires careful analysis to ensure the transformation preserves + // semantics, particularly around sign extension and overflow behavior. + + // Add the non-disjoint constant to the user chain for later transformation + // This will replace the original constant in the XOR with the new + // constant. + UserChain.push_back(ConstantInt::get(XorInst->getType(), NonDisjointBits)); + return DisjointBits; +} + /// A helper function to check if reassociating through an entry in the user /// chain would invalidate the GEP's nuw flag. 
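A compact sketch of the disjoint-bit split described in the comment above, kept independent of the extractor state; the struct and helper names are illustrative only:

// Sketch: split an xor'ed constant into the part that behaves like an add
// (bits known to be zero in the base operand) and the part that must stay
// in the xor. Mirrors the Base ^ Const ==> (Base ^ NonDisjoint) + Disjoint
// rewrite documented above.
#include "llvm/ADT/APInt.h"
#include "llvm/Support/KnownBits.h"

struct XorSplit {
  llvm::APInt Disjoint;    // extractable as a constant offset
  llvm::APInt NonDisjoint; // remains as the xor constant
};

static XorSplit splitXorConstant(const llvm::APInt &Const,
                                 const llvm::KnownBits &BaseKnown) {
  llvm::APInt Disjoint = Const & BaseKnown.Zero;
  return {Disjoint, Const & ~Disjoint};
}

For the example in the comment (Const = 0b11, base with known-zero bit 0), Disjoint comes out as 0b01 and NonDisjoint as 0b10, giving the (xor %ptr, 2) + 1 shape.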
static bool allowsPreservingNUW(const User *U) { diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp index f6959ca2..9b40fc0 100644 --- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -2144,23 +2144,9 @@ void visitDomSubTree(DominatorTree &DT, BasicBlock *BB, CallableT Callable) { void postUnswitch(Loop &L, LPMUpdater &U, StringRef LoopName, bool CurrentLoopValid, bool PartiallyInvariant, bool InjectedCondition, ArrayRef<Loop *> NewLoops) { - auto RecordLoopAsUnswitched = [&](Loop *TargetLoop, StringRef Tag, - StringRef DisableTag) { - auto &Ctx = TargetLoop->getHeader()->getContext(); - MDNode *DisableMD = MDNode::get(Ctx, MDString::get(Ctx, DisableTag)); - MDNode *NewLoopID = makePostTransformationMetadata( - Ctx, TargetLoop->getLoopID(), {Tag}, {DisableMD}); - TargetLoop->setLoopID(NewLoopID); - }; - - // If we performed a non-trivial unswitch, we have added new cloned loops. - // Mark such newly-created loops as visited. - if (!NewLoops.empty()) { - for (Loop *NL : NewLoops) - RecordLoopAsUnswitched(NL, "llvm.loop.unswitch.nontrivial", - "llvm.loop.unswitch.nontrivial.disable"); + // If we did a non-trivial unswitch, we have added new (cloned) loops. + if (!NewLoops.empty()) U.addSiblingLoops(NewLoops); - } // If the current loop remains valid, we should revisit it to catch any // other unswitch opportunities. Otherwise, we need to mark it as deleted. @@ -2168,12 +2154,24 @@ void postUnswitch(Loop &L, LPMUpdater &U, StringRef LoopName, if (PartiallyInvariant) { // Mark the new loop as partially unswitched, to avoid unswitching on // the same condition again. - RecordLoopAsUnswitched(&L, "llvm.loop.unswitch.partial", - "llvm.loop.unswitch.partial.disable"); + auto &Context = L.getHeader()->getContext(); + MDNode *DisableUnswitchMD = MDNode::get( + Context, + MDString::get(Context, "llvm.loop.unswitch.partial.disable")); + MDNode *NewLoopID = makePostTransformationMetadata( + Context, L.getLoopID(), {"llvm.loop.unswitch.partial"}, + {DisableUnswitchMD}); + L.setLoopID(NewLoopID); } else if (InjectedCondition) { // Do the same for injection of invariant conditions. - RecordLoopAsUnswitched(&L, "llvm.loop.unswitch.injection", - "llvm.loop.unswitch.injection.disable"); + auto &Context = L.getHeader()->getContext(); + MDNode *DisableUnswitchMD = MDNode::get( + Context, + MDString::get(Context, "llvm.loop.unswitch.injection.disable")); + MDNode *NewLoopID = makePostTransformationMetadata( + Context, L.getLoopID(), {"llvm.loop.unswitch.injection"}, + {DisableUnswitchMD}); + L.setLoopID(NewLoopID); } else U.revisitCurrentLoop(); } else @@ -2811,9 +2809,9 @@ static BranchInst *turnGuardIntoBranch(IntrinsicInst *GI, Loop &L, } /// Cost multiplier is a way to limit potentially exponential behavior -/// of loop-unswitch. Cost is multiplied in proportion of 2^number of unswitch -/// candidates available. Also consider the number of "sibling" loops with -/// the idea of accounting for previous unswitches that already happened on this +/// of loop-unswitch. Cost is multipied in proportion of 2^number of unswitch +/// candidates available. Also accounting for the number of "sibling" loops with +/// the idea to account for previous unswitches that already happened on this /// cluster of loops. There was an attempt to keep this formula simple, /// just enough to limit the worst case behavior. 
Even if it is not that simple /// now it is still not an attempt to provide a detailed heuristic size @@ -3509,9 +3507,8 @@ static bool unswitchBestCondition(Loop &L, DominatorTree &DT, LoopInfo &LI, SmallVector<NonTrivialUnswitchCandidate, 4> UnswitchCandidates; IVConditionInfo PartialIVInfo; Instruction *PartialIVCondBranch = nullptr; - if (!findOptionMDForLoop(&L, "llvm.loop.unswitch.nontrivial.disable")) - collectUnswitchCandidates(UnswitchCandidates, PartialIVInfo, - PartialIVCondBranch, L, LI, AA, MSSAU); + collectUnswitchCandidates(UnswitchCandidates, PartialIVInfo, + PartialIVCondBranch, L, LI, AA, MSSAU); if (!findOptionMDForLoop(&L, "llvm.loop.unswitch.injection.disable")) collectUnswitchCandidatesWithInjections(UnswitchCandidates, PartialIVInfo, PartialIVCondBranch, L, DT, LI, AA, diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp index 44e63a0..b17dcb78 100644 --- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp +++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp @@ -328,7 +328,7 @@ class StructurizeCFG { void addPhiValues(BasicBlock *From, BasicBlock *To); void findUndefBlocks(BasicBlock *PHIBlock, - const SmallSet<BasicBlock *, 8> &Incomings, + const SmallPtrSet<BasicBlock *, 8> &Incomings, SmallVector<BasicBlock *> &UndefBlks) const; void mergeIfCompatible(EquivalenceClasses<PHINode *> &PhiClasses, PHINode *A, @@ -762,7 +762,7 @@ void StructurizeCFG::addPhiValues(BasicBlock *From, BasicBlock *To) { /// from some blocks as undefined. The function will find out all such blocks /// and return in \p UndefBlks. void StructurizeCFG::findUndefBlocks( - BasicBlock *PHIBlock, const SmallSet<BasicBlock *, 8> &Incomings, + BasicBlock *PHIBlock, const SmallPtrSet<BasicBlock *, 8> &Incomings, SmallVector<BasicBlock *> &UndefBlks) const { // We may get a post-structured CFG like below: // @@ -788,7 +788,7 @@ void StructurizeCFG::findUndefBlocks( // path N->F2->F3->B. For example, the threads take the branch F1->N may // always take the branch F2->P2. So, when we are reconstructing a PHI // originally in B, we can safely say the incoming value from N is undefined. - SmallSet<BasicBlock *, 8> VisitedBlock; + SmallPtrSet<BasicBlock *, 8> VisitedBlock; SmallVector<BasicBlock *, 8> Stack; if (PHIBlock == ParentRegion->getExit()) { for (auto P : predecessors(PHIBlock)) { @@ -884,7 +884,7 @@ void StructurizeCFG::setPhiValues() { PhiMap &BlkPhis = OldPhiIt->second; SmallVector<BasicBlock *> &UndefBlks = UndefBlksMap[To]; - SmallSet<BasicBlock *, 8> Incomings; + SmallPtrSet<BasicBlock *, 8> Incomings; // Get the undefined blocks shared by all the phi nodes. if (!BlkPhis.empty()) { diff --git a/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp b/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp index ddd203f3..42b1fdf 100644 --- a/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp +++ b/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp @@ -111,15 +111,14 @@ BasicBlock * llvm::SplitKnownCriticalEdge(Instruction *TI, unsigned SuccNum, const CriticalEdgeSplittingOptions &Options, const Twine &BBName) { - assert(!isa<IndirectBrInst>(TI) && - "Cannot split critical edge from IndirectBrInst"); - BasicBlock *TIBB = TI->getParent(); BasicBlock *DestBB = TI->getSuccessor(SuccNum); - // Splitting the critical edge to a pad block is non-trivial. Don't do - // it in this generic function. - if (DestBB->isEHPad()) return nullptr; + // Splitting the critical edge to a pad block is non-trivial. + // And we cannot split block with IndirectBr as a terminator. 
+ // Don't do it in this generic function. + if (DestBB->isEHPad() || isa<IndirectBrInst>(TI)) + return nullptr; if (Options.IgnoreUnreachableDests && isa<UnreachableInst>(DestBB->getFirstNonPHIOrDbgOrLifetime())) diff --git a/llvm/lib/Transforms/Utils/CanonicalizeFreezeInLoops.cpp b/llvm/lib/Transforms/Utils/CanonicalizeFreezeInLoops.cpp index 40010ae..8044f61 100644 --- a/llvm/lib/Transforms/Utils/CanonicalizeFreezeInLoops.cpp +++ b/llvm/lib/Transforms/Utils/CanonicalizeFreezeInLoops.cpp @@ -193,7 +193,7 @@ bool CanonicalizeFreezeInLoopsImpl::run() { if (Candidates.empty()) return false; - SmallSet<PHINode *, 8> ProcessedPHIs; + SmallPtrSet<PHINode *, 8> ProcessedPHIs; for (const auto &Info : Candidates) { PHINode *PHI = Info.PHI; if (!ProcessedPHIs.insert(Info.PHI).second) diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp index 7a9dd37..bbd1ed6 100644 --- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp +++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp @@ -1099,7 +1099,7 @@ static void eraseLifetimeMarkersOnInputs(const SetVector<BasicBlock *> &Blocks, // Get the memory operand of the lifetime marker. If the underlying // object is a sunk alloca, or is otherwise defined in the extraction // region, the lifetime marker must not be erased. - Value *Mem = II->getOperand(1)->stripInBoundsOffsets(); + Value *Mem = II->getOperand(0); if (SunkAllocas.count(Mem) || definedInRegion(Blocks, Mem)) continue; @@ -1115,8 +1115,6 @@ static void eraseLifetimeMarkersOnInputs(const SetVector<BasicBlock *> &Blocks, static void insertLifetimeMarkersSurroundingCall( Module *M, ArrayRef<Value *> LifetimesStart, ArrayRef<Value *> LifetimesEnd, CallInst *TheCall) { - LLVMContext &Ctx = M->getContext(); - auto NegativeOne = ConstantInt::getSigned(Type::getInt64Ty(Ctx), -1); Instruction *Term = TheCall->getParent()->getTerminator(); // Emit lifetime markers for the pointers given in \p Objects. Insert the @@ -1130,7 +1128,7 @@ static void insertLifetimeMarkersSurroundingCall( Function *Func = Intrinsic::getOrInsertDeclaration(M, MarkerFunc, Mem->getType()); - auto Marker = CallInst::Create(Func, {NegativeOne, Mem}); + auto Marker = CallInst::Create(Func, Mem); if (InsertBefore) Marker->insertBefore(TheCall->getIterator()); else diff --git a/llvm/lib/Transforms/Utils/ControlFlowUtils.cpp b/llvm/lib/Transforms/Utils/ControlFlowUtils.cpp index 4b0065d..8954de6 100644 --- a/llvm/lib/Transforms/Utils/ControlFlowUtils.cpp +++ b/llvm/lib/Transforms/Utils/ControlFlowUtils.cpp @@ -276,7 +276,7 @@ std::pair<BasicBlock *, bool> ControlFlowHub::finalize( DomTreeUpdater *DTU, SmallVectorImpl<BasicBlock *> &GuardBlocks, const StringRef Prefix, std::optional<unsigned> MaxControlFlowBooleans) { #ifndef NDEBUG - SmallSet<BasicBlock *, 8> Incoming; + SmallPtrSet<BasicBlock *, 8> Incoming; #endif SetVector<BasicBlock *> Outgoing; diff --git a/llvm/lib/Transforms/Utils/Debugify.cpp b/llvm/lib/Transforms/Utils/Debugify.cpp index 291e2a5..7063cde 100644 --- a/llvm/lib/Transforms/Utils/Debugify.cpp +++ b/llvm/lib/Transforms/Utils/Debugify.cpp @@ -706,6 +706,15 @@ bool llvm::checkDebugInfoMetadata(Module &M, DILocsBefore, DILocsAfter, InstToDelete, NameOfWrappedPass, FileNameFromCU, ShouldWriteIntoJSON, Bugs); +#if LLVM_ENABLE_DEBUGLOC_TRACKING_COVERAGE + // If we are tracking DebugLoc coverage, replace each empty DebugLoc with an + // annotated location now so that it does not show up in future passes even if + // it is propagated to other instructions. 
+ for (auto &L : DILocsAfter) + if (!L.second) + const_cast<Instruction *>(L.first)->setDebugLoc(DebugLoc::getUnknown()); +#endif + bool ResultForVars = checkVars(DIVarsBefore, DIVarsAfter, NameOfWrappedPass, FileNameFromCU, ShouldWriteIntoJSON, Bugs); diff --git a/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp b/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp index 540039b..0642d51 100644 --- a/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp +++ b/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp @@ -30,7 +30,7 @@ PreservedAnalyses DeclareRuntimeLibcallsPass::run(Module &M, FunctionType *FuncTy = FunctionType::get(Type::getVoidTy(Ctx), {}, /*IsVarArgs=*/true); - const char *FuncName = RTLCI.getLibcallImplName(Impl); + StringRef FuncName = RTLCI.getLibcallImplName(Impl); M.getOrInsertFunction(FuncName, FuncTy); } diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp index 59a47a9..f49fbf8 100644 --- a/llvm/lib/Transforms/Utils/InlineFunction.cpp +++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp @@ -77,7 +77,6 @@ #include <cstdint> #include <deque> #include <iterator> -#include <limits> #include <optional> #include <string> #include <utility> @@ -3004,31 +3003,11 @@ void llvm::InlineFunctionImpl(CallBase &CB, InlineFunctionInfo &IFI, if (hasLifetimeMarkers(AI)) continue; - // Try to determine the size of the allocation. - ConstantInt *AllocaSize = nullptr; - if (ConstantInt *AIArraySize = - dyn_cast<ConstantInt>(AI->getArraySize())) { - auto &DL = Caller->getDataLayout(); - Type *AllocaType = AI->getAllocatedType(); - TypeSize AllocaTypeSize = DL.getTypeAllocSize(AllocaType); - uint64_t AllocaArraySize = AIArraySize->getLimitedValue(); - - // Don't add markers for zero-sized allocas. - if (AllocaArraySize == 0) - continue; - - // Check that array size doesn't saturate uint64_t and doesn't - // overflow when it's multiplied by type size. - if (!AllocaTypeSize.isScalable() && - AllocaArraySize != std::numeric_limits<uint64_t>::max() && - std::numeric_limits<uint64_t>::max() / AllocaArraySize >= - AllocaTypeSize.getFixedValue()) { - AllocaSize = ConstantInt::get(Type::getInt64Ty(AI->getContext()), - AllocaArraySize * AllocaTypeSize); - } - } + std::optional<TypeSize> Size = AI->getAllocationSize(AI->getDataLayout()); + if (Size && Size->isZero()) + continue; - builder.CreateLifetimeStart(AI, AllocaSize); + builder.CreateLifetimeStart(AI); for (ReturnInst *RI : Returns) { // Don't insert llvm.lifetime.end calls between a musttail or deoptimize // call and a return. The return kills all local allocas. @@ -3038,7 +3017,7 @@ void llvm::InlineFunctionImpl(CallBase &CB, InlineFunctionInfo &IFI, if (InlinedDeoptimizeCalls && RI->getParent()->getTerminatingDeoptimizeCall()) continue; - IRBuilder<>(RI).CreateLifetimeEnd(AI, AllocaSize); + IRBuilder<>(RI).CreateLifetimeEnd(AI); } } } diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index 2619e73..ac34490 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -275,7 +275,7 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions, Builder.CreateBr(TheOnlyDest); BasicBlock *BB = SI->getParent(); - SmallSet<BasicBlock *, 8> RemovedSuccessors; + SmallPtrSet<BasicBlock *, 8> RemovedSuccessors; // Remove entries from PHI nodes which we no longer branch to... 
BasicBlock *SuccToKeep = TheOnlyDest; @@ -343,7 +343,7 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions, if (auto *BA = dyn_cast<BlockAddress>(IBI->getAddress()->stripPointerCasts())) { BasicBlock *TheOnlyDest = BA->getBasicBlock(); - SmallSet<BasicBlock *, 8> RemovedSuccessors; + SmallPtrSet<BasicBlock *, 8> RemovedSuccessors; // Insert the new branch. Builder.CreateBr(TheOnlyDest); @@ -481,7 +481,7 @@ bool llvm::wouldInstructionBeTriviallyDead(const Instruction *I, return true; if (II->isLifetimeStartOrEnd()) { - auto *Arg = II->getArgOperand(1); + auto *Arg = II->getArgOperand(0); if (isa<PoisonValue>(Arg)) return true; @@ -2518,7 +2518,7 @@ unsigned llvm::changeToUnreachable(Instruction *I, bool PreserveLCSSA, if (MSSAU) MSSAU->changeToUnreachable(I); - SmallSet<BasicBlock *, 8> UniqueSuccessors; + SmallPtrSet<BasicBlock *, 8> UniqueSuccessors; // Loop over all of the successors, removing BB's entry from any PHI // nodes. diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index e7623aa..2d830f3 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -914,6 +914,8 @@ constexpr Intrinsic::ID llvm::getReductionIntrinsicID(RecurKind RK) { switch (RK) { default: llvm_unreachable("Unexpected recurrence kind"); + case RecurKind::AddChainWithSubs: + case RecurKind::Sub: case RecurKind::Add: return Intrinsic::vector_reduce_add; case RecurKind::Mul: @@ -1301,6 +1303,8 @@ Value *llvm::createSimpleReduction(IRBuilderBase &Builder, Value *Src, Builder.getFastMathFlags()); }; switch (RdxKind) { + case RecurKind::AddChainWithSubs: + case RecurKind::Sub: case RecurKind::Add: case RecurKind::Mul: case RecurKind::And: diff --git a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp index 472c03f..1f59b17 100644 --- a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp +++ b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp @@ -155,7 +155,7 @@ void StackInfoBuilder::visit(OptimizationRemarkEmitter &ORE, return; } if (auto *II = dyn_cast<LifetimeIntrinsic>(&Inst)) { - AllocaInst *AI = dyn_cast<AllocaInst>(II->getArgOperand(1)); + AllocaInst *AI = dyn_cast<AllocaInst>(II->getArgOperand(0)); if (!AI || getAllocaInterestingness(*AI) != AllocaInterestingness::kInteresting) return; diff --git a/llvm/lib/Transforms/Utils/PredicateInfo.cpp b/llvm/lib/Transforms/Utils/PredicateInfo.cpp index b22ecbc..978d5a2 100644 --- a/llvm/lib/Transforms/Utils/PredicateInfo.cpp +++ b/llvm/lib/Transforms/Utils/PredicateInfo.cpp @@ -20,7 +20,6 @@ #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Module.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -291,6 +290,11 @@ void PredicateInfoBuilder::convertUsesToDFSOrdered( Value *Op, SmallVectorImpl<ValueDFS> &DFSOrderedSet) { for (auto &U : Op->uses()) { if (auto *I = dyn_cast<Instruction>(U.getUser())) { + // Lifetime intrinsics must work directly on alloca, do not replace them + // with a predicated copy. + if (I->isLifetimeStartOrEnd()) + continue; + ValueDFS VD; // Put the phi node uses in the incoming block. 
BasicBlock *IBlock; @@ -370,6 +374,8 @@ void PredicateInfoBuilder::processAssume( Values.push_back(Cond); if (auto *Cmp = dyn_cast<CmpInst>(Cond)) collectCmpOps(Cmp, Values); + else if (match(Cond, m_NUWTrunc(m_Value(Op0)))) + Values.push_back(Op0); for (Value *V : Values) { if (shouldRename(V)) { @@ -416,6 +422,8 @@ void PredicateInfoBuilder::processBranch( Values.push_back(Cond); if (auto *Cmp = dyn_cast<CmpInst>(Cond)) collectCmpOps(Cmp, Values); + else if (match(Cond, m_NUWTrunc(m_Value(Op0)))) + Values.push_back(Op0); for (Value *V : Values) { if (shouldRename(V)) { @@ -506,23 +514,10 @@ Value *PredicateInfoBuilder::materializeStack(unsigned int &Counter, ValInfo->RenamedOp = (RenameStack.end() - Start) == RenameStack.begin() ? OrigOp : (RenameStack.end() - Start - 1)->Def; - auto CreateSSACopy = [this](IRBuilderBase &B, Value *Op, - const Twine &Name = "") { - auto It = PI.DeclarationCache.try_emplace(Op->getType()); - if (It.second) { - // The number of named values is used to detect if a new declaration - // was added. If so, that declaration is tracked so that it can be - // removed when the analysis is done. The corner case were a new - // declaration results in a name clash and the old name being renamed - // is not considered as that represents an invalid module. - auto NumDecls = F.getParent()->getNumNamedValues(); - Function *IF = Intrinsic::getOrInsertDeclaration( - F.getParent(), Intrinsic::ssa_copy, Op->getType()); - if (NumDecls != F.getParent()->getNumNamedValues()) - PI.CreatedDeclarations.insert(IF); - It.first->second = IF; - } - return B.CreateCall(It.first->second, Op, Name); + auto CreateSSACopy = [](Instruction *InsertPt, Value *Op, + const Twine &Name = "") { + // Use a no-op bitcast to represent ssa copy. + return new BitCastInst(Op, Op->getType(), Name, InsertPt->getIterator()); }; // For edge predicates, we can just place the operand in the block before // the terminator. For assume, we have to place it right after the assume @@ -530,9 +525,8 @@ Value *PredicateInfoBuilder::materializeStack(unsigned int &Counter, // right before the terminator or after the assume, so that we insert in // proper order in the case of multiple predicateinfo in the same block. if (isa<PredicateWithEdge>(ValInfo)) { - IRBuilder<> B(getBranchTerminator(ValInfo)); - CallInst *PIC = - CreateSSACopy(B, Op, Op->getName() + "." + Twine(Counter++)); + BitCastInst *PIC = CreateSSACopy(getBranchTerminator(ValInfo), Op, + Op->getName() + "." + Twine(Counter++)); PI.PredicateMap.insert({PIC, ValInfo}); Result.Def = PIC; } else { @@ -541,8 +535,7 @@ Value *PredicateInfoBuilder::materializeStack(unsigned int &Counter, "Should not have gotten here without it being an assume"); // Insert the predicate directly after the assume. While it also holds // directly before it, assume(i1 true) is not a useful fact. - IRBuilder<> B(PAssume->AssumeInst->getNextNode()); - CallInst *PIC = CreateSSACopy(B, Op); + BitCastInst *PIC = CreateSSACopy(PAssume->AssumeInst->getNextNode(), Op); PI.PredicateMap.insert({PIC, ValInfo}); Result.Def = PIC; } @@ -710,23 +703,6 @@ PredicateInfo::PredicateInfo(Function &F, DominatorTree &DT, Builder.buildPredicateInfo(); } -// Remove all declarations we created . The PredicateInfo consumers are -// responsible for remove the ssa_copy calls created. -PredicateInfo::~PredicateInfo() { - // Collect function pointers in set first, as SmallSet uses a SmallVector - // internally and we have to remove the asserting value handles first. 
- SmallPtrSet<Function *, 20> FunctionPtrs; - for (const auto &F : CreatedDeclarations) - FunctionPtrs.insert(&*F); - CreatedDeclarations.clear(); - - for (Function *F : FunctionPtrs) { - assert(F->users().empty() && - "PredicateInfo consumer did not remove all SSA copies."); - F->eraseFromParent(); - } -} - std::optional<PredicateConstraint> PredicateBase::getConstraint() const { switch (Type) { case PT_Assume: @@ -741,6 +717,11 @@ std::optional<PredicateConstraint> PredicateBase::getConstraint() const { : ConstantInt::getFalse(Condition->getType())}}; } + if (match(Condition, m_NUWTrunc(m_Specific(RenamedOp)))) { + return {{TrueEdge ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ, + ConstantInt::getNullValue(RenamedOp->getType())}}; + } + CmpInst *Cmp = dyn_cast<CmpInst>(Condition); if (!Cmp) { // TODO: Make this an assertion once RenamedOp is fully accurate. @@ -779,15 +760,16 @@ std::optional<PredicateConstraint> PredicateBase::getConstraint() const { void PredicateInfo::verifyPredicateInfo() const {} -// Replace ssa_copy calls created by PredicateInfo with their operand. +// Replace bitcasts created by PredicateInfo with their operand. static void replaceCreatedSSACopys(PredicateInfo &PredInfo, Function &F) { for (Instruction &Inst : llvm::make_early_inc_range(instructions(F))) { const auto *PI = PredInfo.getPredicateInfoFor(&Inst); - auto *II = dyn_cast<IntrinsicInst>(&Inst); - if (!PI || !II || II->getIntrinsicID() != Intrinsic::ssa_copy) + if (!PI) continue; - Inst.replaceAllUsesWith(II->getOperand(0)); + assert(isa<BitCastInst>(Inst) && + Inst.getType() == Inst.getOperand(0)->getType()); + Inst.replaceAllUsesWith(Inst.getOperand(0)); Inst.eraseFromParent(); } } diff --git a/llvm/lib/Transforms/Utils/ProfileVerify.cpp b/llvm/lib/Transforms/Utils/ProfileVerify.cpp index 0ffea3f..41647f7 100644 --- a/llvm/lib/Transforms/Utils/ProfileVerify.cpp +++ b/llvm/lib/Transforms/Utils/ProfileVerify.cpp @@ -8,10 +8,8 @@ #include "llvm/Transforms/Utils/ProfileVerify.h" #include "llvm/ADT/DynamicAPInt.h" -#include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/STLExtras.h" #include "llvm/Analysis/BranchProbabilityInfo.h" -#include "llvm/Analysis/LoopInfo.h" #include "llvm/IR/Analysis.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" diff --git a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp index d96f1d6..10c162b 100644 --- a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp +++ b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp @@ -136,7 +136,7 @@ public: /// \p ToDelete that stores to this alloca. void updateForDeletedStore( StoreInst *ToDelete, DIBuilder &DIB, - SmallSet<DbgVariableRecord *, 8> *DVRAssignsToDelete) const { + SmallPtrSet<DbgVariableRecord *, 8> *DVRAssignsToDelete) const { // There's nothing to do if the alloca doesn't have any variables using // assignment tracking. if (DVRAssigns.empty()) @@ -382,7 +382,7 @@ struct PromoteMem2Reg { SmallVector<AssignmentTrackingInfo, 8> AllocaATInfo; /// A set of dbg.assigns to delete because they've been demoted to /// dbg.values. Call cleanUpDbgAssigns to delete them. - SmallSet<DbgVariableRecord *, 8> DVRAssignsToDelete; + SmallPtrSet<DbgVariableRecord *, 8> DVRAssignsToDelete; /// The set of basic blocks the renamer has already visited. BitVector Visited; @@ -533,11 +533,10 @@ static void removeIntrinsicUsers(AllocaInst *AI) { /// false there were some loads which were not dominated by the single store /// and thus must be phi-ed with undef. 
We fall back to the standard alloca /// promotion algorithm in that case. -static bool -rewriteSingleStoreAlloca(AllocaInst *AI, AllocaInfo &Info, LargeBlockInfo &LBI, - const DataLayout &DL, DominatorTree &DT, - AssumptionCache *AC, - SmallSet<DbgVariableRecord *, 8> *DVRAssignsToDelete) { +static bool rewriteSingleStoreAlloca( + AllocaInst *AI, AllocaInfo &Info, LargeBlockInfo &LBI, const DataLayout &DL, + DominatorTree &DT, AssumptionCache *AC, + SmallPtrSet<DbgVariableRecord *, 8> *DVRAssignsToDelete) { StoreInst *OnlyStore = Info.OnlyStore; Value *ReplVal = OnlyStore->getOperand(0); // Loads may either load the stored value or uninitialized memory (undef). @@ -647,11 +646,10 @@ rewriteSingleStoreAlloca(AllocaInst *AI, AllocaInfo &Info, LargeBlockInfo &LBI, /// use(t); /// *A = 42; /// } -static bool -promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info, - LargeBlockInfo &LBI, const DataLayout &DL, - DominatorTree &DT, AssumptionCache *AC, - SmallSet<DbgVariableRecord *, 8> *DVRAssignsToDelete) { +static bool promoteSingleBlockAlloca( + AllocaInst *AI, const AllocaInfo &Info, LargeBlockInfo &LBI, + const DataLayout &DL, DominatorTree &DT, AssumptionCache *AC, + SmallPtrSet<DbgVariableRecord *, 8> *DVRAssignsToDelete) { // The trickiest case to handle is when we have large blocks. Because of this, // this code is optimized assuming that large blocks happen. This does not // significantly pessimize the small block case. This uses LargeBlockInfo to diff --git a/llvm/lib/Transforms/Utils/SCCPSolver.cpp b/llvm/lib/Transforms/Utils/SCCPSolver.cpp index b78c702..8448517 100644 --- a/llvm/lib/Transforms/Utils/SCCPSolver.cpp +++ b/llvm/lib/Transforms/Utils/SCCPSolver.cpp @@ -777,10 +777,10 @@ public: for (BasicBlock &BB : F) { for (Instruction &Inst : llvm::make_early_inc_range(BB)) { - if (auto *II = dyn_cast<IntrinsicInst>(&Inst)) { - if (II->getIntrinsicID() == Intrinsic::ssa_copy) { + if (auto *BC = dyn_cast<BitCastInst>(&Inst)) { + if (BC->getType() == BC->getOperand(0)->getType()) { if (It->second->getPredicateInfoFor(&Inst)) { - Value *Op = II->getOperand(0); + Value *Op = BC->getOperand(0); Inst.replaceAllUsesWith(Op); Inst.eraseFromParent(); } @@ -1413,6 +1413,15 @@ void SCCPInstVisitor::visitCastInst(CastInst &I) { if (ValueState[&I].isOverdefined()) return; + if (auto *BC = dyn_cast<BitCastInst>(&I)) { + if (BC->getType() == BC->getOperand(0)->getType()) { + if (const PredicateBase *PI = getPredicateInfoFor(&I)) { + handlePredicate(&I, I.getOperand(0), PI); + return; + } + } + } + ValueLatticeElement OpSt = getValueState(I.getOperand(0)); if (OpSt.isUnknownOrUndef()) return; @@ -1433,8 +1442,12 @@ void SCCPInstVisitor::visitCastInst(CastInst &I) { OpSt.asConstantRange(I.getSrcTy(), /*UndefAllowed=*/false); Type *DestTy = I.getDestTy(); - ConstantRange Res = - OpRange.castOp(I.getOpcode(), DestTy->getScalarSizeInBits()); + ConstantRange Res = ConstantRange::getEmpty(DestTy->getScalarSizeInBits()); + if (auto *Trunc = dyn_cast<TruncInst>(&I)) + Res = OpRange.truncate(DestTy->getScalarSizeInBits(), + Trunc->getNoWrapKind()); + else + Res = OpRange.castOp(I.getOpcode(), DestTy->getScalarSizeInBits()); mergeInValue(LV, &I, ValueLatticeElement::getRange(Res)); } else markOverdefined(&I); @@ -2001,17 +2014,6 @@ void SCCPInstVisitor::handleCallResult(CallBase &CB) { Function *F = CB.getCalledFunction(); if (auto *II = dyn_cast<IntrinsicInst>(&CB)) { - if (II->getIntrinsicID() == Intrinsic::ssa_copy) { - if (ValueState[&CB].isOverdefined()) - return; - - Value *CopyOf = 
CB.getOperand(0); - const PredicateBase *PI = getPredicateInfoFor(&CB); - assert(PI && "Missing predicate info for ssa.copy"); - handlePredicate(&CB, CopyOf, PI); - return; - } - if (II->getIntrinsicID() == Intrinsic::vscale) { unsigned BitWidth = CB.getType()->getScalarSizeInBits(); const ConstantRange Result = getVScaleRange(II->getFunction(), BitWidth); diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp index 1eb8996..e218db3 100644 --- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp +++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp @@ -1346,7 +1346,7 @@ Value *SCEVExpander::visitAddRecExpr(const SCEVAddRecExpr *S) { CanonicalIV->insertBefore(Header->begin()); rememberInstruction(CanonicalIV); - SmallSet<BasicBlock *, 4> PredSeen; + SmallPtrSet<BasicBlock *, 4> PredSeen; Constant *One = ConstantInt::get(Ty, 1); for (pred_iterator HPI = HPB; HPI != HPE; ++HPI) { BasicBlock *HP = *HPI; diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index deabacc..055e8ca 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -291,6 +291,7 @@ class SimplifyCFGOpt { bool simplifyBranch(BranchInst *Branch, IRBuilder<> &Builder); bool simplifyUncondBranch(BranchInst *BI, IRBuilder<> &Builder); bool simplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder); + bool foldCondBranchOnValueKnownInPredecessor(BranchInst *BI); bool tryToSimplifyUncondBranchWithICmpInIt(ICmpInst *ICI, IRBuilder<> &Builder); @@ -564,6 +565,9 @@ struct ConstantComparesGatherer { /// Number of comparisons matched in the and/or chain unsigned UsedICmps = 0; + /// If the elements in Vals matches the comparisons + bool IsEq = false; + /// Construct and compute the result for the comparison instruction Cond ConstantComparesGatherer(Instruction *Cond, const DataLayout &DL) : DL(DL) { gather(Cond); @@ -735,23 +739,23 @@ private: /// vector. /// One "Extra" case is allowed to differ from the other. void gather(Value *V) { - bool isEQ = match(V, m_LogicalOr(m_Value(), m_Value())); - + Value *Op0, *Op1; + if (match(V, m_LogicalOr(m_Value(Op0), m_Value(Op1)))) + IsEq = true; + else if (match(V, m_LogicalAnd(m_Value(Op0), m_Value(Op1)))) + IsEq = false; + else + return; // Keep a stack (SmallVector for efficiency) for depth-first traversal - SmallVector<Value *, 8> DFT; - SmallPtrSet<Value *, 8> Visited; - - // Initialize - Visited.insert(V); - DFT.push_back(V); + SmallVector<Value *, 8> DFT{Op0, Op1}; + SmallPtrSet<Value *, 8> Visited{V, Op0, Op1}; while (!DFT.empty()) { V = DFT.pop_back_val(); if (Instruction *I = dyn_cast<Instruction>(V)) { // If it is a || (or && depending on isEQ), process the operands. - Value *Op0, *Op1; - if (isEQ ? match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))) + if (IsEq ? 
match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))) : match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1)))) { if (Visited.insert(Op1).second) DFT.push_back(Op1); @@ -762,7 +766,7 @@ private: } // Try to match the current instruction - if (matchInstruction(I, isEQ)) + if (matchInstruction(I, IsEq)) // Match succeed, continue the loop continue; } @@ -810,11 +814,15 @@ Value *SimplifyCFGOpt::isValueEqualityComparison(Instruction *TI) { if (!SI->getParent()->hasNPredecessorsOrMore(128 / SI->getNumSuccessors())) CV = SI->getCondition(); } else if (BranchInst *BI = dyn_cast<BranchInst>(TI)) - if (BI->isConditional() && BI->getCondition()->hasOneUse()) + if (BI->isConditional() && BI->getCondition()->hasOneUse()) { if (ICmpInst *ICI = dyn_cast<ICmpInst>(BI->getCondition())) { if (ICI->isEquality() && getConstantInt(ICI->getOperand(1), DL)) CV = ICI->getOperand(0); + } else if (auto *Trunc = dyn_cast<TruncInst>(BI->getCondition())) { + if (Trunc->hasNoUnsignedWrap()) + CV = Trunc->getOperand(0); } + } // Unwrap any lossless ptrtoint cast. if (CV) { @@ -840,11 +848,20 @@ BasicBlock *SimplifyCFGOpt::getValueEqualityComparisonCases( } BranchInst *BI = cast<BranchInst>(TI); - ICmpInst *ICI = cast<ICmpInst>(BI->getCondition()); - BasicBlock *Succ = BI->getSuccessor(ICI->getPredicate() == ICmpInst::ICMP_NE); - Cases.push_back(ValueEqualityComparisonCase( - getConstantInt(ICI->getOperand(1), DL), Succ)); - return BI->getSuccessor(ICI->getPredicate() == ICmpInst::ICMP_EQ); + Value *Cond = BI->getCondition(); + ICmpInst::Predicate Pred; + ConstantInt *C; + if (auto *ICI = dyn_cast<ICmpInst>(Cond)) { + Pred = ICI->getPredicate(); + C = getConstantInt(ICI->getOperand(1), DL); + } else { + Pred = ICmpInst::ICMP_NE; + auto *Trunc = cast<TruncInst>(Cond); + C = ConstantInt::get(cast<IntegerType>(Trunc->getOperand(0)->getType()), 0); + } + BasicBlock *Succ = BI->getSuccessor(Pred == ICmpInst::ICMP_NE); + Cases.push_back(ValueEqualityComparisonCase(C, Succ)); + return BI->getSuccessor(Pred == ICmpInst::ICMP_EQ); } /// Given a vector of bb/value pairs, remove any entries @@ -1106,7 +1123,10 @@ static void getBranchWeights(Instruction *TI, // default weight to be the first entry. if (BranchInst *BI = dyn_cast<BranchInst>(TI)) { assert(Weights.size() == 2); - ICmpInst *ICI = cast<ICmpInst>(BI->getCondition()); + auto *ICI = dyn_cast<ICmpInst>(BI->getCondition()); + if (!ICI) + return; + if (ICI->getPredicate() == ICmpInst::ICMP_EQ) std::swap(Weights.front(), Weights.back()); } @@ -3321,12 +3341,10 @@ bool SimplifyCFGOpt::speculativelyExecuteBB(BranchInst *BI, // %merge = select %cond, %two, %one // store %merge, %x.dest, !DIAssignID !2 // dbg.assign %merge, "x", ..., !2 - auto replaceVariable = [OrigV, S](auto *DbgAssign) { + for (DbgVariableRecord *DbgAssign : + at::getDVRAssignmentMarkers(SpeculatedStore)) if (llvm::is_contained(DbgAssign->location_ops(), OrigV)) DbgAssign->replaceVariableLocationOp(OrigV, S); - }; - for_each(at::getAssignmentMarkers(SpeculatedStore), replaceVariable); - for_each(at::getDVRAssignmentMarkers(SpeculatedStore), replaceVariable); } // Metadata can be dependent on the condition we are hoisting above. 
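The TruncInst cases above, and the m_NUWTrunc patterns added to PredicateInfo earlier in this diff, lean on one fact: an i1 produced by a trunc carrying the nuw flag is true exactly when its wider source is non-zero (any other non-zero source would make the trunc poison). A small sketch of that check; the helper name is illustrative:

// Sketch: a branch on `trunc nuw iN %x to i1` behaves like a branch on
// `icmp ne iN %x, 0`, so %x can serve as the value-equality comparison
// base, with 0 routed to the false successor.
#include "llvm/IR/Instructions.h"

static llvm::Value *getEqualityBaseFromCondition(llvm::Value *Cond) {
  using namespace llvm;
  if (auto *Trunc = dyn_cast<TruncInst>(Cond))
    if (Trunc->hasNoUnsignedWrap() && Trunc->getType()->isIntegerTy(1))
      return Trunc->getOperand(0); // non-zero iff the condition is true
  return nullptr;
}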
@@ -3655,15 +3673,19 @@ foldCondBranchOnValueKnownInPredecessorImpl(BranchInst *BI, DomTreeUpdater *DTU, return false; } -static bool foldCondBranchOnValueKnownInPredecessor(BranchInst *BI, - DomTreeUpdater *DTU, - const DataLayout &DL, - AssumptionCache *AC) { +bool SimplifyCFGOpt::foldCondBranchOnValueKnownInPredecessor(BranchInst *BI) { + // Note: If BB is a loop header then there is a risk that threading introduces + // a non-canonical loop by moving a back edge. So we avoid this optimization + // for loop headers if NeedCanonicalLoop is set. + if (Options.NeedCanonicalLoop && is_contained(LoopHeaders, BI->getParent())) + return false; + std::optional<bool> Result; bool EverChanged = false; do { // Note that None means "we changed things, but recurse further." - Result = foldCondBranchOnValueKnownInPredecessorImpl(BI, DTU, DL, AC); + Result = + foldCondBranchOnValueKnownInPredecessorImpl(BI, DTU, DL, Options.AC); EverChanged |= Result == std::nullopt || *Result; } while (Result == std::nullopt); return EverChanged; @@ -5084,6 +5106,7 @@ bool SimplifyCFGOpt::simplifyBranchOnICmpChain(BranchInst *BI, Value *CompVal = ConstantCompare.CompValue; unsigned UsedICmps = ConstantCompare.UsedICmps; Value *ExtraCase = ConstantCompare.Extra; + bool TrueWhenEqual = ConstantCompare.IsEq; // If we didn't have a multiply compared value, fail. if (!CompVal) @@ -5093,8 +5116,6 @@ bool SimplifyCFGOpt::simplifyBranchOnICmpChain(BranchInst *BI, if (UsedICmps <= 1) return false; - bool TrueWhenEqual = match(Cond, m_LogicalOr(m_Value(), m_Value())); - // There might be duplicate constants in the list, which the switch // instruction can't handle, remove them now. array_pod_sort(Values.begin(), Values.end(), constantIntSortPredicate); @@ -8085,7 +8106,7 @@ bool SimplifyCFGOpt::simplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) { // If this is a branch on something for which we know the constant value in // predecessors (e.g. a phi node in the current block), thread control // through this block. - if (foldCondBranchOnValueKnownInPredecessor(BI, DTU, DL, Options.AC)) + if (foldCondBranchOnValueKnownInPredecessor(BI)) return requestResimplify(); // Scan predecessor blocks for conditional branches. 
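For readers less familiar with this corner of SimplifyCFG: the gatherer's IsEq/TrueWhenEqual flag records whether the chain was an or-of-equalities (branch taken when equal) or an and-of-inequalities. A source-level illustration, not taken from the test suite, of the shape that simplifyBranchOnICmpChain converts into a switch with one extra non-constant case:

// Illustrative only: an or-chain of equalities against constants on a
// single value, plus one "extra" term. The gatherer collects {1, 4, 9}
// for X with IsEq = true, and Extra is tested in a separate block.
static int classify(unsigned X, bool Extra) {
  if (X == 1 || X == 4 || X == 9 || Extra)
    return 1;
  return 0;
}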
diff --git a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp index 737321d..2d6a748 100644 --- a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -20,6 +20,7 @@ #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/Utils/Local.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/AttributeMask.h" #include "llvm/IR/DataLayout.h" @@ -319,10 +320,10 @@ static void annotateNonNullAndDereferenceable(CallInst *CI, ArrayRef<unsigned> A annotateDereferenceableBytes(CI, ArgNos, LenC->getZExtValue()); } else if (isKnownNonZero(Size, DL)) { annotateNonNullNoUndefBasedOnAccess(CI, ArgNos); - const APInt *X, *Y; + uint64_t X, Y; uint64_t DerefMin = 1; - if (match(Size, m_Select(m_Value(), m_APInt(X), m_APInt(Y)))) { - DerefMin = std::min(X->getZExtValue(), Y->getZExtValue()); + if (match(Size, m_Select(m_Value(), m_ConstantInt(X), m_ConstantInt(Y)))) { + DerefMin = std::min(X, Y); annotateDereferenceableBytes(CI, ArgNos, DerefMin); } } @@ -977,8 +978,14 @@ Value *LibCallSimplifier::optimizeStringLength(CallInst *CI, IRBuilderBase &B, // it's not very useful because calling strlen for a pointer of other types is // very uncommon. if (GEPOperator *GEP = dyn_cast<GEPOperator>(Src)) { - // TODO: Handle subobjects. - if (!isGEPBasedOnPointerToString(GEP, CharSize)) + unsigned BW = DL.getIndexTypeSizeInBits(GEP->getType()); + SmallMapVector<Value *, APInt, 4> VarOffsets; + APInt ConstOffset(BW, 0); + assert(CharSize % 8 == 0 && "Expected a multiple of 8 sized CharSize"); + // Check the gep is a single variable offset. + if (!GEP->collectOffset(DL, BW, VarOffsets, ConstOffset) || + VarOffsets.size() != 1 || ConstOffset != 0 || + VarOffsets.begin()->second != CharSize / 8) return nullptr; ConstantDataArraySlice Slice; @@ -1000,10 +1007,8 @@ Value *LibCallSimplifier::optimizeStringLength(CallInst *CI, IRBuilderBase &B, return nullptr; } - Value *Offset = GEP->getOperand(2); + Value *Offset = VarOffsets.begin()->first; KnownBits Known = computeKnownBits(Offset, DL, nullptr, CI, nullptr); - uint64_t ArrSize = - cast<ArrayType>(GEP->getSourceElementType())->getNumElements(); // If Offset is not provably in the range [0, NullTermIdx], we can still // optimize if we can prove that the program has undefined behavior when @@ -1011,7 +1016,7 @@ Value *LibCallSimplifier::optimizeStringLength(CallInst *CI, IRBuilderBase &B, // is a pointer to an object whose memory extent is NullTermIdx+1. 
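The switch to m_ConstantInt above keeps the same bound as before: when the size operand of the libcall is a select between two constant sizes, the smaller constant is dereferenced on every path, so it is the amount that can be annotated unconditionally. A tiny sketch of that arithmetic in plain C++ with hypothetical sizes:

#include <algorithm>
#include <cassert>
#include <cstdint>

static uint64_t derefMin(bool Cond, uint64_t X, uint64_t Y) {
  uint64_t Size = Cond ? X : Y;    // size the call actually receives
  uint64_t Lower = std::min(X, Y); // conservative dereferenceable byte count
  assert(Lower <= Size);
  return Lower;
}

int main() {
  assert(derefMin(true, 16, 4) == 4);
  assert(derefMin(false, 16, 4) == 4);
  return 0;
}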
if ((Known.isNonNegative() && Known.getMaxValue().ule(NullTermIdx)) || (isa<GlobalVariable>(GEP->getOperand(0)) && - NullTermIdx == ArrSize - 1)) { + NullTermIdx == Slice.Length - 1)) { Offset = B.CreateSExtOrTrunc(Offset, CI->getType()); return B.CreateSub(ConstantInt::get(CI->getType(), NullTermIdx), Offset); diff --git a/llvm/lib/Transforms/Utils/SplitModuleByCategory.cpp b/llvm/lib/Transforms/Utils/SplitModuleByCategory.cpp index 6b18ece..c3ac39e 100644 --- a/llvm/lib/Transforms/Utils/SplitModuleByCategory.cpp +++ b/llvm/lib/Transforms/Utils/SplitModuleByCategory.cpp @@ -12,7 +12,6 @@ #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/StringExtras.h" -#include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" @@ -21,7 +20,6 @@ #include "llvm/Transforms/Utils/Cloning.h" #include <map> -#include <string> #include <utility> using namespace llvm; diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index c47fd942..789047a 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -793,280 +793,296 @@ static bool canWidenCallReturnType(Type *Ty) { } bool LoopVectorizationLegality::canVectorizeInstrs() { - BasicBlock *Header = TheLoop->getHeader(); + bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE); + bool Result = true; // For each block in the loop. for (BasicBlock *BB : TheLoop->blocks()) { // Scan the instructions in the block and look for hazards. for (Instruction &I : *BB) { - if (auto *Phi = dyn_cast<PHINode>(&I)) { - Type *PhiTy = Phi->getType(); - // Check that this PHI type is allowed. - if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() && - !PhiTy->isPointerTy()) { - reportVectorizationFailure("Found a non-int non-pointer PHI", - "loop control flow is not understood by vectorizer", - "CFGNotUnderstood", ORE, TheLoop); - return false; - } + Result &= canVectorizeInstr(I); + if (!DoExtraAnalysis && !Result) + return false; + } + } - // If this PHINode is not in the header block, then we know that we - // can convert it to select during if-conversion. No need to check if - // the PHIs in this block are induction or reduction variables. - if (BB != Header) { - // Non-header phi nodes that have outside uses can be vectorized. Add - // them to the list of allowed exits. - // Unsafe cyclic dependencies with header phis are identified during - // legalization for reduction, induction and fixed order - // recurrences. - AllowedExit.insert(&I); - continue; - } + if (!PrimaryInduction) { + if (Inductions.empty()) { + reportVectorizationFailure( + "Did not find one integer induction var", + "loop induction variable could not be identified", + "NoInductionVariable", ORE, TheLoop); + return false; + } + if (!WidestIndTy) { + reportVectorizationFailure( + "Did not find one integer induction var", + "integer loop induction variable could not be identified", + "NoIntegerInductionVariable", ORE, TheLoop); + return false; + } + LLVM_DEBUG(dbgs() << "LV: Did not find one integer induction var.\n"); + } - // We only allow if-converted PHIs with exactly two incoming values. 
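The refactored canVectorizeInstrs above keeps scanning after a failed instruction when extra analysis is allowed (so every remark gets emitted) and bails out on the first failure otherwise. A stripped-down sketch of that accumulation pattern in plain C++, with per-instruction results modeled as booleans:

#include <cassert>
#include <vector>

static bool checkAll(const std::vector<bool> &PerInstrOK, bool DoExtraAnalysis,
                     int &ChecksRun) {
  bool Result = true;
  for (bool OK : PerInstrOK) {
    ++ChecksRun;      // each check may emit a diagnostic remark
    Result &= OK;
    if (!DoExtraAnalysis && !Result)
      return false;   // fail fast when nobody wants the extra remarks
  }
  return Result;
}

int main() {
  std::vector<bool> OKs = {true, false, false, true};
  int Fast = 0, Thorough = 0;
  assert(!checkAll(OKs, /*DoExtraAnalysis=*/false, Fast) && Fast == 2);
  assert(!checkAll(OKs, /*DoExtraAnalysis=*/true, Thorough) && Thorough == 4);
  return 0;
}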
- if (Phi->getNumIncomingValues() != 2) { - reportVectorizationFailure("Found an invalid PHI", - "loop control flow is not understood by vectorizer", - "CFGNotUnderstood", ORE, TheLoop, Phi); - return false; - } + // Now we know the widest induction type, check if our found induction + // is the same size. If it's not, unset it here and InnerLoopVectorizer + // will create another. + if (PrimaryInduction && WidestIndTy != PrimaryInduction->getType()) + PrimaryInduction = nullptr; - RecurrenceDescriptor RedDes; - if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes, DB, AC, - DT, PSE.getSE())) { - Requirements->addExactFPMathInst(RedDes.getExactFPMathInst()); - AllowedExit.insert(RedDes.getLoopExitInstr()); - Reductions[Phi] = RedDes; - continue; - } + return Result; +} - // We prevent matching non-constant strided pointer IVS to preserve - // historical vectorizer behavior after a generalization of the - // IVDescriptor code. The intent is to remove this check, but we - // have to fix issues around code quality for such loops first. - auto IsDisallowedStridedPointerInduction = - [](const InductionDescriptor &ID) { - if (AllowStridedPointerIVs) - return false; - return ID.getKind() == InductionDescriptor::IK_PtrInduction && - ID.getConstIntStepValue() == nullptr; - }; - - // TODO: Instead of recording the AllowedExit, it would be good to - // record the complementary set: NotAllowedExit. These include (but may - // not be limited to): - // 1. Reduction phis as they represent the one-before-last value, which - // is not available when vectorized - // 2. Induction phis and increment when SCEV predicates cannot be used - // outside the loop - see addInductionPhi - // 3. Non-Phis with outside uses when SCEV predicates cannot be used - // outside the loop - see call to hasOutsideLoopUser in the non-phi - // handling below - // 4. FixedOrderRecurrence phis that can possibly be handled by - // extraction. - // By recording these, we can then reason about ways to vectorize each - // of these NotAllowedExit. - InductionDescriptor ID; - if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID) && - !IsDisallowedStridedPointerInduction(ID)) { - addInductionPhi(Phi, ID, AllowedExit); - Requirements->addExactFPMathInst(ID.getExactFPMathInst()); - continue; - } +bool LoopVectorizationLegality::canVectorizeInstr(Instruction &I) { + BasicBlock *BB = I.getParent(); + BasicBlock *Header = TheLoop->getHeader(); - if (RecurrenceDescriptor::isFixedOrderRecurrence(Phi, TheLoop, DT)) { - AllowedExit.insert(Phi); - FixedOrderRecurrences.insert(Phi); - continue; - } + if (auto *Phi = dyn_cast<PHINode>(&I)) { + Type *PhiTy = Phi->getType(); + // Check that this PHI type is allowed. + if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() && + !PhiTy->isPointerTy()) { + reportVectorizationFailure( + "Found a non-int non-pointer PHI", + "loop control flow is not understood by vectorizer", + "CFGNotUnderstood", ORE, TheLoop); + return false; + } - // As a last resort, coerce the PHI to a AddRec expression - // and re-try classifying it a an induction PHI. - if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID, true) && - !IsDisallowedStridedPointerInduction(ID)) { - addInductionPhi(Phi, ID, AllowedExit); - continue; - } + // If this PHINode is not in the header block, then we know that we + // can convert it to select during if-conversion. No need to check if + // the PHIs in this block are induction or reduction variables. 
+ if (BB != Header) { + // Non-header phi nodes that have outside uses can be vectorized. Add + // them to the list of allowed exits. + // Unsafe cyclic dependencies with header phis are identified during + // legalization for reduction, induction and fixed order + // recurrences. + AllowedExit.insert(&I); + return true; + } - reportVectorizationFailure("Found an unidentified PHI", - "value that could not be identified as " - "reduction is used outside the loop", - "NonReductionValueUsedOutsideLoop", ORE, TheLoop, Phi); - return false; - } // end of PHI handling - - // We handle calls that: - // * Have a mapping to an IR intrinsic. - // * Have a vector version available. - auto *CI = dyn_cast<CallInst>(&I); - - if (CI && !getVectorIntrinsicIDForCall(CI, TLI) && - !(CI->getCalledFunction() && TLI && - (!VFDatabase::getMappings(*CI).empty() || - isTLIScalarize(*TLI, *CI)))) { - // If the call is a recognized math libary call, it is likely that - // we can vectorize it given loosened floating-point constraints. - LibFunc Func; - bool IsMathLibCall = - TLI && CI->getCalledFunction() && - CI->getType()->isFloatingPointTy() && - TLI->getLibFunc(CI->getCalledFunction()->getName(), Func) && - TLI->hasOptimizedCodeGen(Func); - - if (IsMathLibCall) { - // TODO: Ideally, we should not use clang-specific language here, - // but it's hard to provide meaningful yet generic advice. - // Also, should this be guarded by allowExtraAnalysis() and/or be part - // of the returned info from isFunctionVectorizable()? - reportVectorizationFailure( - "Found a non-intrinsic callsite", - "library call cannot be vectorized. " - "Try compiling with -fno-math-errno, -ffast-math, " - "or similar flags", - "CantVectorizeLibcall", ORE, TheLoop, CI); - } else { - reportVectorizationFailure("Found a non-intrinsic callsite", - "call instruction cannot be vectorized", - "CantVectorizeLibcall", ORE, TheLoop, CI); - } - return false; - } + // We only allow if-converted PHIs with exactly two incoming values. + if (Phi->getNumIncomingValues() != 2) { + reportVectorizationFailure( + "Found an invalid PHI", + "loop control flow is not understood by vectorizer", + "CFGNotUnderstood", ORE, TheLoop, Phi); + return false; + } - // Some intrinsics have scalar arguments and should be same in order for - // them to be vectorized (i.e. loop invariant). - if (CI) { - auto *SE = PSE.getSE(); - Intrinsic::ID IntrinID = getVectorIntrinsicIDForCall(CI, TLI); - for (unsigned Idx = 0; Idx < CI->arg_size(); ++Idx) - if (isVectorIntrinsicWithScalarOpAtArg(IntrinID, Idx, TTI)) { - if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(Idx)), - TheLoop)) { - reportVectorizationFailure("Found unvectorizable intrinsic", - "intrinsic instruction cannot be vectorized", - "CantVectorizeIntrinsic", ORE, TheLoop, CI); - return false; - } - } - } + RecurrenceDescriptor RedDes; + if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes, DB, AC, DT, + PSE.getSE())) { + Requirements->addExactFPMathInst(RedDes.getExactFPMathInst()); + AllowedExit.insert(RedDes.getLoopExitInstr()); + Reductions[Phi] = RedDes; + return true; + } - // If we found a vectorized variant of a function, note that so LV can - // make better decisions about maximum VF. 
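The non-header PHI handling above leans on the fact that a two-way PHI outside the header is exactly what if-conversion turns into a select, so it needs no induction or reduction classification. A plain C++ sketch of that equivalence with a hypothetical pair of incoming values:

#include <cassert>

static int phiViaBranch(bool Cond, int A, int B) {
  int Phi;
  if (Cond)
    Phi = A; // incoming value from the "then" block
  else
    Phi = B; // incoming value from the "else" block
  return Phi;
}

static int phiViaSelect(bool Cond, int A, int B) {
  return Cond ? A : B; // the select the if-converted form uses instead
}

int main() {
  for (bool Cond : {false, true})
    assert(phiViaBranch(Cond, 3, 5) == phiViaSelect(Cond, 3, 5));
  return 0;
}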
- if (CI && !VFDatabase::getMappings(*CI).empty()) - VecCallVariantsFound = true; - - auto CanWidenInstructionTy = [](Instruction const &Inst) { - Type *InstTy = Inst.getType(); - if (!isa<StructType>(InstTy)) - return canVectorizeTy(InstTy); - - // For now, we only recognize struct values returned from calls where - // all users are extractvalue as vectorizable. All element types of the - // struct must be types that can be widened. - return isa<CallInst>(Inst) && canWidenCallReturnType(InstTy) && - all_of(Inst.users(), IsaPred<ExtractValueInst>); - }; + // We prevent matching non-constant strided pointer IVS to preserve + // historical vectorizer behavior after a generalization of the + // IVDescriptor code. The intent is to remove this check, but we + // have to fix issues around code quality for such loops first. + auto IsDisallowedStridedPointerInduction = + [](const InductionDescriptor &ID) { + if (AllowStridedPointerIVs) + return false; + return ID.getKind() == InductionDescriptor::IK_PtrInduction && + ID.getConstIntStepValue() == nullptr; + }; + + // TODO: Instead of recording the AllowedExit, it would be good to + // record the complementary set: NotAllowedExit. These include (but may + // not be limited to): + // 1. Reduction phis as they represent the one-before-last value, which + // is not available when vectorized + // 2. Induction phis and increment when SCEV predicates cannot be used + // outside the loop - see addInductionPhi + // 3. Non-Phis with outside uses when SCEV predicates cannot be used + // outside the loop - see call to hasOutsideLoopUser in the non-phi + // handling below + // 4. FixedOrderRecurrence phis that can possibly be handled by + // extraction. + // By recording these, we can then reason about ways to vectorize each + // of these NotAllowedExit. + InductionDescriptor ID; + if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID) && + !IsDisallowedStridedPointerInduction(ID)) { + addInductionPhi(Phi, ID, AllowedExit); + Requirements->addExactFPMathInst(ID.getExactFPMathInst()); + return true; + } - // Check that the instruction return type is vectorizable. - // We can't vectorize casts from vector type to scalar type. - // Also, we can't vectorize extractelement instructions. - if (!CanWidenInstructionTy(I) || - (isa<CastInst>(I) && - !VectorType::isValidElementType(I.getOperand(0)->getType())) || - isa<ExtractElementInst>(I)) { - reportVectorizationFailure("Found unvectorizable type", - "instruction return type cannot be vectorized", - "CantVectorizeInstructionReturnType", ORE, TheLoop, &I); - return false; - } + if (RecurrenceDescriptor::isFixedOrderRecurrence(Phi, TheLoop, DT)) { + AllowedExit.insert(Phi); + FixedOrderRecurrences.insert(Phi); + return true; + } + + // As a last resort, coerce the PHI to a AddRec expression + // and re-try classifying it a an induction PHI. + if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID, true) && + !IsDisallowedStridedPointerInduction(ID)) { + addInductionPhi(Phi, ID, AllowedExit); + return true; + } - // Check that the stored type is vectorizable. 
- if (auto *ST = dyn_cast<StoreInst>(&I)) { - Type *T = ST->getValueOperand()->getType(); - if (!VectorType::isValidElementType(T)) { - reportVectorizationFailure("Store instruction cannot be vectorized", - "CantVectorizeStore", ORE, TheLoop, ST); + reportVectorizationFailure("Found an unidentified PHI", + "value that could not be identified as " + "reduction is used outside the loop", + "NonReductionValueUsedOutsideLoop", ORE, TheLoop, + Phi); + return false; + } // end of PHI handling + + // We handle calls that: + // * Have a mapping to an IR intrinsic. + // * Have a vector version available. + auto *CI = dyn_cast<CallInst>(&I); + + if (CI && !getVectorIntrinsicIDForCall(CI, TLI) && + !(CI->getCalledFunction() && TLI && + (!VFDatabase::getMappings(*CI).empty() || isTLIScalarize(*TLI, *CI)))) { + // If the call is a recognized math libary call, it is likely that + // we can vectorize it given loosened floating-point constraints. + LibFunc Func; + bool IsMathLibCall = + TLI && CI->getCalledFunction() && CI->getType()->isFloatingPointTy() && + TLI->getLibFunc(CI->getCalledFunction()->getName(), Func) && + TLI->hasOptimizedCodeGen(Func); + + if (IsMathLibCall) { + // TODO: Ideally, we should not use clang-specific language here, + // but it's hard to provide meaningful yet generic advice. + // Also, should this be guarded by allowExtraAnalysis() and/or be part + // of the returned info from isFunctionVectorizable()? + reportVectorizationFailure( + "Found a non-intrinsic callsite", + "library call cannot be vectorized. " + "Try compiling with -fno-math-errno, -ffast-math, " + "or similar flags", + "CantVectorizeLibcall", ORE, TheLoop, CI); + } else { + reportVectorizationFailure("Found a non-intrinsic callsite", + "call instruction cannot be vectorized", + "CantVectorizeLibcall", ORE, TheLoop, CI); + } + return false; + } + + // Some intrinsics have scalar arguments and should be same in order for + // them to be vectorized (i.e. loop invariant). + if (CI) { + auto *SE = PSE.getSE(); + Intrinsic::ID IntrinID = getVectorIntrinsicIDForCall(CI, TLI); + for (unsigned Idx = 0; Idx < CI->arg_size(); ++Idx) + if (isVectorIntrinsicWithScalarOpAtArg(IntrinID, Idx, TTI)) { + if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(Idx)), TheLoop)) { + reportVectorizationFailure( + "Found unvectorizable intrinsic", + "intrinsic instruction cannot be vectorized", + "CantVectorizeIntrinsic", ORE, TheLoop, CI); return false; } + } + } - // For nontemporal stores, check that a nontemporal vector version is - // supported on the target. - if (ST->getMetadata(LLVMContext::MD_nontemporal)) { - // Arbitrarily try a vector of 2 elements. - auto *VecTy = FixedVectorType::get(T, /*NumElts=*/2); - assert(VecTy && "did not find vectorized version of stored type"); - if (!TTI->isLegalNTStore(VecTy, ST->getAlign())) { - reportVectorizationFailure( - "nontemporal store instruction cannot be vectorized", - "CantVectorizeNontemporalStore", ORE, TheLoop, ST); - return false; - } - } + // If we found a vectorized variant of a function, note that so LV can + // make better decisions about maximum VF. + if (CI && !VFDatabase::getMappings(*CI).empty()) + VecCallVariantsFound = true; + + auto CanWidenInstructionTy = [](Instruction const &Inst) { + Type *InstTy = Inst.getType(); + if (!isa<StructType>(InstTy)) + return canVectorizeTy(InstTy); + + // For now, we only recognize struct values returned from calls where + // all users are extractvalue as vectorizable. 
All element types of the + // struct must be types that can be widened. + return isa<CallInst>(Inst) && canWidenCallReturnType(InstTy) && + all_of(Inst.users(), IsaPred<ExtractValueInst>); + }; - } else if (auto *LD = dyn_cast<LoadInst>(&I)) { - if (LD->getMetadata(LLVMContext::MD_nontemporal)) { - // For nontemporal loads, check that a nontemporal vector version is - // supported on the target (arbitrarily try a vector of 2 elements). - auto *VecTy = FixedVectorType::get(I.getType(), /*NumElts=*/2); - assert(VecTy && "did not find vectorized version of load type"); - if (!TTI->isLegalNTLoad(VecTy, LD->getAlign())) { - reportVectorizationFailure( - "nontemporal load instruction cannot be vectorized", - "CantVectorizeNontemporalLoad", ORE, TheLoop, LD); - return false; - } - } + // Check that the instruction return type is vectorizable. + // We can't vectorize casts from vector type to scalar type. + // Also, we can't vectorize extractelement instructions. + if (!CanWidenInstructionTy(I) || + (isa<CastInst>(I) && + !VectorType::isValidElementType(I.getOperand(0)->getType())) || + isa<ExtractElementInst>(I)) { + reportVectorizationFailure("Found unvectorizable type", + "instruction return type cannot be vectorized", + "CantVectorizeInstructionReturnType", ORE, + TheLoop, &I); + return false; + } + + // Check that the stored type is vectorizable. + if (auto *ST = dyn_cast<StoreInst>(&I)) { + Type *T = ST->getValueOperand()->getType(); + if (!VectorType::isValidElementType(T)) { + reportVectorizationFailure("Store instruction cannot be vectorized", + "CantVectorizeStore", ORE, TheLoop, ST); + return false; + } - // FP instructions can allow unsafe algebra, thus vectorizable by - // non-IEEE-754 compliant SIMD units. - // This applies to floating-point math operations and calls, not memory - // operations, shuffles, or casts, as they don't change precision or - // semantics. - } else if (I.getType()->isFloatingPointTy() && (CI || I.isBinaryOp()) && - !I.isFast()) { - LLVM_DEBUG(dbgs() << "LV: Found FP op with unsafe algebra.\n"); - Hints->setPotentiallyUnsafe(); + // For nontemporal stores, check that a nontemporal vector version is + // supported on the target. + if (ST->getMetadata(LLVMContext::MD_nontemporal)) { + // Arbitrarily try a vector of 2 elements. + auto *VecTy = FixedVectorType::get(T, /*NumElts=*/2); + assert(VecTy && "did not find vectorized version of stored type"); + if (!TTI->isLegalNTStore(VecTy, ST->getAlign())) { + reportVectorizationFailure( + "nontemporal store instruction cannot be vectorized", + "CantVectorizeNontemporalStore", ORE, TheLoop, ST); + return false; } + } - // Reduction instructions are allowed to have exit users. - // All other instructions must not have external users. - if (hasOutsideLoopUser(TheLoop, &I, AllowedExit)) { - // We can safely vectorize loops where instructions within the loop are - // used outside the loop only if the SCEV predicates within the loop is - // same as outside the loop. Allowing the exit means reusing the SCEV - // outside the loop. - if (PSE.getPredicate().isAlwaysTrue()) { - AllowedExit.insert(&I); - continue; - } - reportVectorizationFailure("Value cannot be used outside the loop", - "ValueUsedOutsideLoop", ORE, TheLoop, &I); + } else if (auto *LD = dyn_cast<LoadInst>(&I)) { + if (LD->getMetadata(LLVMContext::MD_nontemporal)) { + // For nontemporal loads, check that a nontemporal vector version is + // supported on the target (arbitrarily try a vector of 2 elements). 
+ auto *VecTy = FixedVectorType::get(I.getType(), /*NumElts=*/2); + assert(VecTy && "did not find vectorized version of load type"); + if (!TTI->isLegalNTLoad(VecTy, LD->getAlign())) { + reportVectorizationFailure( + "nontemporal load instruction cannot be vectorized", + "CantVectorizeNontemporalLoad", ORE, TheLoop, LD); return false; } - } // next instr. + } + + // FP instructions can allow unsafe algebra, thus vectorizable by + // non-IEEE-754 compliant SIMD units. + // This applies to floating-point math operations and calls, not memory + // operations, shuffles, or casts, as they don't change precision or + // semantics. + } else if (I.getType()->isFloatingPointTy() && (CI || I.isBinaryOp()) && + !I.isFast()) { + LLVM_DEBUG(dbgs() << "LV: Found FP op with unsafe algebra.\n"); + Hints->setPotentiallyUnsafe(); } - if (!PrimaryInduction) { - if (Inductions.empty()) { - reportVectorizationFailure("Did not find one integer induction var", - "loop induction variable could not be identified", - "NoInductionVariable", ORE, TheLoop); - return false; - } - if (!WidestIndTy) { - reportVectorizationFailure("Did not find one integer induction var", - "integer loop induction variable could not be identified", - "NoIntegerInductionVariable", ORE, TheLoop); - return false; + // Reduction instructions are allowed to have exit users. + // All other instructions must not have external users. + if (hasOutsideLoopUser(TheLoop, &I, AllowedExit)) { + // We can safely vectorize loops where instructions within the loop are + // used outside the loop only if the SCEV predicates within the loop is + // same as outside the loop. Allowing the exit means reusing the SCEV + // outside the loop. + if (PSE.getPredicate().isAlwaysTrue()) { + AllowedExit.insert(&I); + return true; } - LLVM_DEBUG(dbgs() << "LV: Did not find one integer induction var.\n"); + reportVectorizationFailure("Value cannot be used outside the loop", + "ValueUsedOutsideLoop", ORE, TheLoop, &I); + return false; } - // Now we know the widest induction type, check if our found induction - // is the same size. If it's not, unset it here and InnerLoopVectorizer - // will create another. 
- if (PrimaryInduction && WidestIndTy != PrimaryInduction->getType()) - PrimaryInduction = nullptr; - return true; } diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index 912c893..838476d 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -256,13 +256,15 @@ public: new VPInstruction(VPInstruction::PtrAdd, {Ptr, Offset}, GEPNoWrapFlags::none(), DL, Name)); } - VPInstruction *createInBoundsPtrAdd(VPValue *Ptr, VPValue *Offset, - DebugLoc DL = DebugLoc::getUnknown(), - const Twine &Name = "") { - return tryInsertInstruction( - new VPInstruction(VPInstruction::PtrAdd, {Ptr, Offset}, - GEPNoWrapFlags::inBounds(), DL, Name)); + + VPInstruction *createNoWrapPtrAdd(VPValue *Ptr, VPValue *Offset, + GEPNoWrapFlags GEPFlags, + DebugLoc DL = DebugLoc::getUnknown(), + const Twine &Name = "") { + return tryInsertInstruction(new VPInstruction( + VPInstruction::PtrAdd, {Ptr, Offset}, GEPFlags, DL, Name)); } + VPInstruction *createWidePtrAdd(VPValue *Ptr, VPValue *Offset, DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "") { @@ -276,6 +278,20 @@ public: return tryInsertInstruction(new VPPhi(IncomingValues, DL, Name)); } + VPValue *createElementCount(Type *Ty, ElementCount EC) { + VPlan &Plan = *getInsertBlock()->getPlan(); + VPValue *RuntimeEC = + Plan.getOrAddLiveIn(ConstantInt::get(Ty, EC.getKnownMinValue())); + if (EC.isScalable()) { + VPValue *VScale = createNaryOp(VPInstruction::VScale, {}, Ty); + RuntimeEC = EC.getKnownMinValue() == 1 + ? VScale + : createOverflowingOp(Instruction::Mul, + {VScale, RuntimeEC}, {true, false}); + } + return RuntimeEC; + } + /// Convert the input value \p Current to the corresponding value of an /// induction with \p Start and \p Step values, using \p Start + \p Current * /// \p Step. diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index be00fd6..70f8840 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -499,19 +499,18 @@ class InnerLoopVectorizer { public: InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, - const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, - OptimizationRemarkEmitter *ORE, ElementCount VecWidth, + ElementCount VecWidth, ElementCount MinProfitableTripCount, unsigned UnrollFactor, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks, VPlan &Plan) - : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI), - AC(AC), ORE(ORE), VF(VecWidth), - MinProfitableTripCount(MinProfitableTripCount), UF(UnrollFactor), - Builder(PSE.getSE()->getContext()), Cost(CM), BFI(BFI), PSI(PSI), - RTChecks(RTChecks), Plan(Plan), - VectorPHVPB(Plan.getVectorLoopRegion()->getSinglePredecessor()) {} + : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TTI(TTI), AC(AC), + VF(VecWidth), MinProfitableTripCount(MinProfitableTripCount), + UF(UnrollFactor), Builder(PSE.getSE()->getContext()), Cost(CM), + BFI(BFI), PSI(PSI), RTChecks(RTChecks), Plan(Plan), + VectorPHVPBB(cast<VPBasicBlock>( + Plan.getVectorLoopRegion()->getSinglePredecessor())) {} virtual ~InnerLoopVectorizer() = default; @@ -548,9 +547,6 @@ public: protected: friend class LoopVectorizationPlanner; - /// Returns (and creates if needed) the trip count of the widened loop. 
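createElementCount above materializes a runtime element count: a plain constant for fixed element counts, and vscale times the known minimum for scalable ones, skipping the multiply when the minimum is 1. A sketch of that computation in plain C++, with vscale passed in as a stand-in for the runtime value:

#include <cassert>
#include <cstdint>

static uint64_t runtimeElementCount(uint64_t KnownMin, bool Scalable,
                                    uint64_t VScale) {
  if (!Scalable)
    return KnownMin;                        // fixed: just the constant
  return KnownMin == 1 ? VScale             // <vscale x 1>: vscale itself
                       : VScale * KnownMin; // <vscale x N>: vscale * N
}

int main() {
  assert(runtimeElementCount(4, /*Scalable=*/false, /*VScale=*/2) == 4);
  assert(runtimeElementCount(4, /*Scalable=*/true, /*VScale=*/2) == 8);
  assert(runtimeElementCount(1, /*Scalable=*/true, /*VScale=*/2) == 2);
  return 0;
}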
- Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock); - // Create a check to see if the vector loop should be executed Value *createIterationCountCheck(ElementCount VF, unsigned UF) const; @@ -586,18 +582,12 @@ protected: /// Dominator Tree. DominatorTree *DT; - /// Target Library Info. - const TargetLibraryInfo *TLI; - /// Target Transform Info. const TargetTransformInfo *TTI; /// Assumption Cache. AssumptionCache *AC; - /// Interface to emit optimization remarks. - OptimizationRemarkEmitter *ORE; - /// The vectorization SIMD factor to use. Each vector will have this many /// vector elements. ElementCount VF; @@ -619,9 +609,6 @@ protected: /// The scalar-loop preheader. BasicBlock *LoopScalarPreHeader = nullptr; - /// Middle Block between the vector and the scalar. - BasicBlock *LoopMiddleBlock = nullptr; - /// Trip count of the original loop. Value *TripCount = nullptr; @@ -648,7 +635,7 @@ protected: /// The vector preheader block of \p Plan, used as target for check blocks /// introduced during skeleton creation. - VPBlockBase *VectorPHVPB; + VPBasicBlock *VectorPHVPBB; }; /// Encapsulate information regarding vectorization of a loop and its epilogue. @@ -686,14 +673,14 @@ class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer { public: InnerLoopAndEpilogueVectorizer( Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, - DominatorTree *DT, const TargetLibraryInfo *TLI, - const TargetTransformInfo *TTI, AssumptionCache *AC, - OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, - LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, - ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks, VPlan &Plan) - : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, - EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, CM, - BFI, PSI, Checks, Plan), + DominatorTree *DT, const TargetTransformInfo *TTI, AssumptionCache *AC, + EpilogueLoopVectorizationInfo &EPI, LoopVectorizationCostModel *CM, + BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, + GeneratedRTChecks &Checks, VPlan &Plan, ElementCount VecWidth, + ElementCount MinProfitableTripCount, unsigned UnrollFactor) + : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TTI, AC, VecWidth, + MinProfitableTripCount, UnrollFactor, CM, BFI, PSI, + Checks, Plan), EPI(EPI) {} // Override this function to handle the more complex control flow around the @@ -721,15 +708,17 @@ public: /// epilogues. 
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer { public: - EpilogueVectorizerMainLoop( - Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, - DominatorTree *DT, const TargetLibraryInfo *TLI, - const TargetTransformInfo *TTI, AssumptionCache *AC, - OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, - LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, - ProfileSummaryInfo *PSI, GeneratedRTChecks &Check, VPlan &Plan) - : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, - EPI, CM, BFI, PSI, Check, Plan) {} + EpilogueVectorizerMainLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE, + LoopInfo *LI, DominatorTree *DT, + const TargetTransformInfo *TTI, + AssumptionCache *AC, + EpilogueLoopVectorizationInfo &EPI, + LoopVectorizationCostModel *CM, + BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, + GeneratedRTChecks &Check, VPlan &Plan) + : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TTI, AC, EPI, CM, + BFI, PSI, Check, Plan, EPI.MainLoopVF, + EPI.MainLoopVF, EPI.MainLoopUF) {} /// Implements the interface for creating a vectorized skeleton using the /// *main loop* strategy (ie the first pass of vplan execution). BasicBlock *createEpilogueVectorizedLoopSkeleton() final; @@ -750,13 +739,13 @@ class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer { public: EpilogueVectorizerEpilogueLoop( Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, - DominatorTree *DT, const TargetLibraryInfo *TLI, - const TargetTransformInfo *TTI, AssumptionCache *AC, - OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, - LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, - ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks, VPlan &Plan) - : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, - EPI, CM, BFI, PSI, Checks, Plan) { + DominatorTree *DT, const TargetTransformInfo *TTI, AssumptionCache *AC, + EpilogueLoopVectorizationInfo &EPI, LoopVectorizationCostModel *CM, + BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, + GeneratedRTChecks &Checks, VPlan &Plan) + : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TTI, AC, EPI, CM, + BFI, PSI, Checks, Plan, EPI.EpilogueVF, + EPI.EpilogueVF, EPI.EpilogueUF) { TripCount = EPI.TripCount; } /// Implements the interface for creating a vectorized skeleton using the @@ -835,7 +824,14 @@ namespace llvm { Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF, int64_t Step) { assert(Ty->isIntegerTy() && "Expected an integer step"); - return B.CreateElementCount(Ty, VF.multiplyCoefficientBy(Step)); + ElementCount VFxStep = VF.multiplyCoefficientBy(Step); + assert(isPowerOf2_64(VF.getKnownMinValue()) && "must pass power-of-2 VF"); + if (VF.isScalable() && isPowerOf2_64(Step)) { + return B.CreateShl( + B.CreateVScale(Ty), + ConstantInt::get(Ty, Log2_64(VFxStep.getKnownMinValue())), "", true); + } + return B.CreateElementCount(Ty, VFxStep); } /// Return the runtime value for VF. @@ -2272,65 +2268,15 @@ static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { return TTI.enableMaskedInterleavedAccessVectorization(); } -Value * -InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) { - if (VectorTripCount) - return VectorTripCount; - - Value *TC = getTripCount(); - IRBuilder<> Builder(InsertBlock->getTerminator()); - - Type *Ty = TC->getType(); - // This is where we can make the step a runtime constant. 
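The createStepForVF change above emits the scalable step as a shift: when VF * Step is a power of two, vscale * (VF * Step) equals vscale << log2(VF * Step). A standalone check of that identity in plain C++ with hypothetical values:

#include <cassert>
#include <cstdint>

static uint64_t log2u(uint64_t V) { // floor(log2), exact for a power of two
  uint64_t L = 0;
  while (V >>= 1)
    ++L;
  return L;
}

int main() {
  const uint64_t VFxStep = 8; // e.g. a <vscale x 4> VF with Step = 2
  for (uint64_t VScale : {1u, 2u, 16u})
    assert(VScale * VFxStep == (VScale << log2u(VFxStep)));
  return 0;
}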
- Value *Step = createStepForVF(Builder, Ty, VF, UF); - - // If the tail is to be folded by masking, round the number of iterations N - // up to a multiple of Step instead of rounding down. This is done by first - // adding Step-1 and then rounding down. Note that it's ok if this addition - // overflows: the vector induction variable will eventually wrap to zero given - // that it starts at zero and its Step is a power of two; the loop will then - // exit, with the last early-exit vector comparison also producing all-true. - // For scalable vectors the VF is not guaranteed to be a power of 2, but this - // is accounted for in emitIterationCountCheck that adds an overflow check. - if (Cost->foldTailByMasking()) { - assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && - "VF*UF must be a power of 2 when folding tail by masking"); - TC = Builder.CreateAdd(TC, Builder.CreateSub(Step, ConstantInt::get(Ty, 1)), - "n.rnd.up"); - } - - // Now we need to generate the expression for the part of the loop that the - // vectorized body will execute. This is equal to N - (N % Step) if scalar - // iterations are not required for correctness, or N - Step, otherwise. Step - // is equal to the vectorization factor (number of SIMD elements) times the - // unroll factor (number of SIMD instructions). - Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); - - // There are cases where we *must* run at least one iteration in the remainder - // loop. See the cost model for when this can happen. If the step evenly - // divides the trip count, we set the remainder to be equal to the step. If - // the step does not evenly divide the trip count, no adjustment is necessary - // since there will already be scalar iterations. Note that the minimum - // iterations check ensures that N >= Step. - if (Cost->requiresScalarEpilogue(VF.isVector())) { - auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); - R = Builder.CreateSelect(IsZero, Step, R); - } - - VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); - - return VectorTripCount; -} - void InnerLoopVectorizer::introduceCheckBlockInVPlan(BasicBlock *CheckIRBB) { // Note: The block with the minimum trip-count check is already connected // during earlier VPlan construction. VPBlockBase *ScalarPH = Plan.getScalarPreheader(); - VPBlockBase *PreVectorPH = VectorPHVPB->getSinglePredecessor(); + VPBlockBase *PreVectorPH = VectorPHVPBB->getSinglePredecessor(); assert(PreVectorPH->getNumSuccessors() == 2 && "Expected 2 successors"); assert(PreVectorPH->getSuccessors()[0] == ScalarPH && "Unexpected successor"); VPIRBasicBlock *CheckVPIRBB = Plan.createVPIRBasicBlock(CheckIRBB); - VPBlockUtils::insertOnEdge(PreVectorPH, VectorPHVPB, CheckVPIRBB); + VPBlockUtils::insertOnEdge(PreVectorPH, VectorPHVPBB, CheckVPIRBB); PreVectorPH = CheckVPIRBB; VPBlockUtils::connectBlocks(PreVectorPH, ScalarPH); PreVectorPH->swapSuccessors(); @@ -2359,7 +2305,10 @@ Value *InnerLoopVectorizer::createIterationCountCheck(ElementCount VF, // Reuse existing vector loop preheader for TC checks. // Note that new preheader block is generated for vector loop. BasicBlock *const TCCheckBlock = LoopVectorPreHeader; - IRBuilder<> Builder(TCCheckBlock->getTerminator()); + IRBuilder<InstSimplifyFolder> Builder( + TCCheckBlock->getContext(), + InstSimplifyFolder(TCCheckBlock->getDataLayout())); + Builder.SetInsertPoint(TCCheckBlock->getTerminator()); // If tail is to be folded, vector loop takes care of all iterations. 
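The getOrCreateVectorTripCount logic removed here (the trip count is now materialized in VPlan via materializeVectorTripCount further down) computes, for Step = VF * UF: round the trip count up to a multiple of Step when the tail is folded by masking, otherwise round down, and force a full Step of remainder when a scalar epilogue is required and the count would otherwise divide evenly. A plain C++ sketch of that arithmetic with hypothetical counts:

#include <cassert>
#include <cstdint>

static uint64_t vectorTripCount(uint64_t TC, uint64_t Step, bool FoldTail,
                                bool RequiresScalarEpilogue) {
  if (FoldTail)
    TC += Step - 1;   // round up instead of down (Step is a power of two)
  uint64_t R = TC % Step;
  if (RequiresScalarEpilogue && R == 0)
    R = Step;         // guarantee at least one scalar iteration
  return TC - R;
}

int main() {
  assert(vectorTripCount(10, 4, /*FoldTail=*/false, /*ScalarEpi=*/false) == 8);
  assert(vectorTripCount(10, 4, /*FoldTail=*/true, /*ScalarEpi=*/false) == 12);
  assert(vectorTripCount(8, 4, /*FoldTail=*/false, /*ScalarEpi=*/true) == 4);
  return 0;
}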
Value *Count = getTripCount(); @@ -2371,7 +2320,7 @@ Value *InnerLoopVectorizer::createIterationCountCheck(ElementCount VF, return createStepForVF(Builder, CountTy, VF, UF); Value *MinProfTC = - createStepForVF(Builder, CountTy, MinProfitableTripCount, 1); + Builder.CreateElementCount(CountTy, MinProfitableTripCount); if (!VF.isScalable()) return MinProfTC; return Builder.CreateBinaryIntrinsic( @@ -2437,16 +2386,20 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { /// VPBB are moved to the end of the newly created VPIRBasicBlock. VPBB must /// have a single predecessor, which is rewired to the new VPIRBasicBlock. All /// successors of VPBB, if any, are rewired to the new VPIRBasicBlock. -static void replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB) { +static VPIRBasicBlock *replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, + BasicBlock *IRBB) { VPIRBasicBlock *IRVPBB = VPBB->getPlan()->createVPIRBasicBlock(IRBB); - for (auto &R : make_early_inc_range(*VPBB)) { - assert((IRVPBB->empty() || IRVPBB->back().isPhi() || !R.isPhi()) && - "Tried to move phi recipe after a non-phi recipe"); + auto IP = IRVPBB->begin(); + for (auto &R : make_early_inc_range(VPBB->phis())) + R.moveBefore(*IRVPBB, IP); + + for (auto &R : + make_early_inc_range(make_range(VPBB->getFirstNonPhi(), VPBB->end()))) R.moveBefore(*IRVPBB, IRVPBB->end()); - } VPBlockUtils::reassociateBlocks(VPBB, IRVPBB); // VPBB is now dead and will be cleaned up when the plan gets destroyed. + return IRVPBB; } void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { @@ -2549,7 +2502,7 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { // to the scalar loop. emitIterationCountCheck(LoopScalarPreHeader); - replaceVPBBWithIRVPBB(Plan.getScalarPreheader(), LoopScalarPreHeader); + replaceVPBBWithIRVPBB(VectorPHVPBB, LoopVectorPreHeader); return LoopVectorPreHeader; } @@ -2680,19 +2633,6 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { // Fix widened non-induction PHIs by setting up the PHI operands. fixNonInductionPHIs(State); - // After vectorization, the exit blocks of the original loop will have - // additional predecessors. Invalidate SCEVs for the exit phis in case SE - // looked through single-entry phis. - SmallVector<BasicBlock *> ExitBlocks; - OrigLoop->getExitBlocks(ExitBlocks); - for (BasicBlock *Exit : ExitBlocks) - for (PHINode &PN : Exit->phis()) - PSE.getSE()->forgetLcssaPhiWithNewPredecessor(OrigLoop, &PN); - - // Forget the original basic block. - PSE.getSE()->forgetLoop(OrigLoop); - PSE.getSE()->forgetBlockAndLoopDispositions(); - // Don't apply optimizations below when no (vector) loop remains, as they all // require one at the moment. VPBasicBlock *HeaderVPBB = @@ -2734,11 +2674,8 @@ void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) { PHINode *NewPhi = cast<PHINode>(State.get(VPPhi)); // Make sure the builder has a valid insert point. Builder.SetInsertPoint(NewPhi); - for (unsigned Idx = 0; Idx < VPPhi->getNumIncoming(); ++Idx) { - VPValue *Inc = VPPhi->getIncomingValue(Idx); - const VPBasicBlock *VPBB = VPPhi->getIncomingBlock(Idx); + for (const auto &[Inc, VPBB] : VPPhi->incoming_values_and_blocks()) NewPhi->addIncoming(State.get(Inc), State.CFG.VPBB2IRBB[VPBB]); - } } } } @@ -3158,6 +3095,12 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( if (Group->isReverse()) return false; + // TODO: Support interleaved access that requires a gap mask for scalable VFs. 
+ bool NeedsMaskForGaps = LoadAccessWithGapsRequiresEpilogMasking || + StoreAccessWithGapsRequiresMasking; + if (VF.isScalable() && NeedsMaskForGaps) + return false; + auto *Ty = getLoadStoreType(I); const Align Alignment = getLoadStoreAlignment(I); unsigned AS = getLoadStoreAddressSpace(I); @@ -4069,8 +4012,7 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks( if (VF.isScalar()) continue; - VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), - CM, CM.CostKind); + VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind); precomputeCosts(*Plan, VF, CostCtx); auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry()); for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) { @@ -4178,7 +4120,7 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks( static bool willGenerateVectors(VPlan &Plan, ElementCount VF, const TargetTransformInfo &TTI) { assert(VF.isVector() && "Checking a scalar VF?"); - VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType()); + VPTypeAnalysis TypeInfo(Plan); DenseSet<VPRecipeBase *> EphemeralRecipes; collectEphemeralRecipesForVPlan(Plan, EphemeralRecipes); // Set of already visited types. @@ -4326,8 +4268,7 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { // Add on other costs that are modelled in VPlan, but not in the legacy // cost model. - VPCostContext CostCtx(CM.TTI, *CM.TLI, CM.Legal->getWidestInductionType(), - CM, CM.CostKind); + VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind); VPRegionBlock *VectorRegion = P->getVectorLoopRegion(); assert(VectorRegion && "Expected to have a vector region!"); for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>( @@ -5272,8 +5213,8 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); // Get the cost of the scalar memory instruction and address computation. - InstructionCost Cost = - VF.getFixedValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); + InstructionCost Cost = VF.getFixedValue() * TTI.getAddressComputationCost( + PtrTy, SE, PtrSCEV, CostKind); // Don't pass *I here, since it is scalar but will actually be part of a // vectorized loop where the user of it is a vectorized instruction. @@ -5344,11 +5285,12 @@ LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, assert(Legal->isUniformMemOp(*I, VF)); Type *ValTy = getLoadStoreType(I); + Type *PtrTy = getLoadStorePointerOperand(I)->getType(); auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF)); const Align Alignment = getLoadStoreAlignment(I); unsigned AS = getLoadStoreAddressSpace(I); if (isa<LoadInst>(I)) { - return TTI.getAddressComputationCost(ValTy) + + return TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, CostKind) + TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, CostKind) + TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy, @@ -5361,13 +5303,13 @@ LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, // VF.getKnownMinValue() - 1 from a scalable vector. This does not represent // the actual generated code, which involves extracting the last element of // a scalable vector where the lane to extract is unknown at compile time. - return TTI.getAddressComputationCost(ValTy) + - TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, - CostKind) + - (IsLoopInvariantStoreValue - ? 
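The NeedsMaskForGaps bail-out above concerns interleave groups that do not access every member: the wide access then needs a mask that switches off the gap lanes, which is not yet supported for scalable VFs. A rough sketch of what such a gap mask looks like for a fixed VF, in plain C++ with a hypothetical three-member group where only members 0 and 2 are used:

#include <cassert>
#include <vector>

// One run of Factor lanes per original iteration; a lane is enabled only if
// the corresponding group member is actually accessed.
static std::vector<bool> gapMask(unsigned VF, const std::vector<bool> &HasMember) {
  std::vector<bool> Mask;
  for (unsigned Iter = 0; Iter < VF; ++Iter)
    for (bool Used : HasMember)
      Mask.push_back(Used);
  return Mask;
}

int main() {
  std::vector<bool> M = gapMask(/*VF=*/2, {true, false, true});
  std::vector<bool> Expected = {true, false, true, true, false, true};
  assert(M == Expected);
  return 0;
}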
0 - : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, - CostKind, VF.getKnownMinValue() - 1)); + InstructionCost Cost = + TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, CostKind) + + TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, CostKind); + if (!IsLoopInvariantStoreValue) + Cost += TTI.getIndexedVectorInstrCostFromEnd(Instruction::ExtractElement, + VectorTy, CostKind, 0); + return Cost; } InstructionCost @@ -5377,8 +5319,9 @@ LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF)); const Align Alignment = getLoadStoreAlignment(I); const Value *Ptr = getLoadStorePointerOperand(I); + Type *PtrTy = toVectorTy(Ptr->getType(), VF); - return TTI.getAddressComputationCost(VectorTy) + + return TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, CostKind) + TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, CostKind, I); @@ -5613,11 +5556,12 @@ LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, // moment. if (VF.isScalar()) { Type *ValTy = getLoadStoreType(I); + Type *PtrTy = getLoadStorePointerOperand(I)->getType(); const Align Alignment = getLoadStoreAlignment(I); unsigned AS = getLoadStoreAddressSpace(I); TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0)); - return TTI.getAddressComputationCost(ValTy) + + return TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, CostKind) + TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, CostKind, OpInfo, I); } @@ -6976,8 +6920,7 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF, InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan, ElementCount VF) const { - VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM, - CM.CostKind); + VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind); InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx); // Now compute and add the VPlan-based cost. @@ -7178,8 +7121,7 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() { // simplifications not accounted for in the legacy cost model. If that's the // case, don't trigger the assertion, as the extra simplifications may cause a // different VF to be picked by the VPlan-based cost model. - VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM, - CM.CostKind); + VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind); precomputeCosts(BestPlan, BestFactor.Width, CostCtx); // Verify that the VPlan-based and legacy cost models agree, except for VPlans // with early exits and plans with additional VPlan simplifications. The @@ -7317,10 +7259,10 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( ++LoopsEarlyExitVectorized; // TODO: Move to VPlan transform stage once the transition to the VPlan-based // cost model is complete for better cost estimates. 
- VPlanTransforms::runPass(VPlanTransforms::unrollByUF, BestVPlan, BestUF, - OrigLoop->getHeader()->getContext()); - VPlanTransforms::runPass(VPlanTransforms::replicateByVF, BestVPlan, BestVF); + VPlanTransforms::runPass(VPlanTransforms::unrollByUF, BestVPlan, BestUF); + VPlanTransforms::runPass(VPlanTransforms::materializeBuildVectors, BestVPlan); VPlanTransforms::runPass(VPlanTransforms::materializeBroadcasts, BestVPlan); + VPlanTransforms::runPass(VPlanTransforms::replicateByVF, BestVPlan, BestVF); bool HasBranchWeights = hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()); if (HasBranchWeights) { @@ -7339,21 +7281,25 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( VPBasicBlock *VectorPH = cast<VPBasicBlock>(BestVPlan.getVectorPreheader()); VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE); - VPlanTransforms::simplifyRecipes(BestVPlan, *Legal->getWidestInductionType()); + VPlanTransforms::simplifyRecipes(BestVPlan); VPlanTransforms::removeBranchOnConst(BestVPlan); VPlanTransforms::narrowInterleaveGroups( BestVPlan, BestVF, TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)); VPlanTransforms::removeDeadRecipes(BestVPlan); - VPlanTransforms::convertToConcreteRecipes(BestVPlan, - *Legal->getWidestInductionType()); + VPlanTransforms::convertToConcreteRecipes(BestVPlan); // Regions are dissolved after optimizing for VF and UF, which completely // removes unneeded loop regions first. VPlanTransforms::dissolveLoopRegions(BestVPlan); // Canonicalize EVL loops after regions are dissolved. VPlanTransforms::canonicalizeEVLLoops(BestVPlan); VPlanTransforms::materializeBackedgeTakenCount(BestVPlan, VectorPH); + VPlanTransforms::materializeVectorTripCount( + BestVPlan, VectorPH, CM.foldTailByMasking(), + CM.requiresScalarEpilogue(BestVF.isVector())); + VPlanTransforms::materializeVFAndVFxUF(BestVPlan, VectorPH, BestVF); + VPlanTransforms::simplifyRecipes(BestVPlan); // Perform the actual loop transformation. VPTransformState State(&TTI, BestVF, LI, DT, ILV.AC, ILV.Builder, &BestVPlan, @@ -7393,12 +7339,28 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( BasicBlock *EntryBB = cast<VPIRBasicBlock>(BestVPlan.getEntry())->getIRBasicBlock(); State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); - if (VectorizingEpilogue) - VPlanTransforms::removeDeadRecipes(BestVPlan); + replaceVPBBWithIRVPBB(BestVPlan.getScalarPreheader(), + State.CFG.PrevBB->getSingleSuccessor()); + VPlanTransforms::removeDeadRecipes(BestVPlan); assert(verifyVPlanIsValid(BestVPlan, true /*VerifyLate*/) && "final VPlan is invalid"); + // After vectorization, the exit blocks of the original loop will have + // additional predecessors. Invalidate SCEVs for the exit phis in case SE + // looked through single-entry phis. + ScalarEvolution &SE = *PSE.getSE(); + for (VPIRBasicBlock *Exit : BestVPlan.getExitBlocks()) { + if (Exit->getNumPredecessors() == 0) + continue; + for (VPRecipeBase &PhiR : Exit->phis()) + SE.forgetLcssaPhiWithNewPredecessor( + OrigLoop, cast<PHINode>(&cast<VPIRPhi>(PhiR).getInstruction())); + } + // Forget the original loop and block dispositions. + SE.forgetLoop(OrigLoop); + SE.forgetBlockAndLoopDispositions(); + ILV.printDebugTracesAtStart(); //===------------------------------------------------===// @@ -7409,11 +7371,6 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( // //===------------------------------------------------===// - // 2. Copy and widen instructions from the old loop into the new loop. 
- BestVPlan.prepareToExecute( - ILV.getOrCreateVectorTripCount(ILV.LoopVectorPreHeader), State); - replaceVPBBWithIRVPBB(VectorPH, State.CFG.PrevBB); - // Move check blocks to their final position. // TODO: Move as part of VPIRBB execute and update impacted tests. if (BasicBlock *MemCheckBlock = ILV.RTChecks.getMemRuntimeChecks().second) @@ -7530,7 +7487,6 @@ BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { EPI.MainLoopIterationCountCheck = emitIterationCountCheck(LoopScalarPreHeader, false); - replaceVPBBWithIRVPBB(Plan.getScalarPreheader(), LoopScalarPreHeader); return LoopVectorPreHeader; } @@ -7557,8 +7513,9 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass, assert(Bypass && "Expected valid bypass basic block."); Value *Count = getTripCount(); MinProfitableTripCount = ElementCount::getFixed(0); - Value *CheckMinIters = createIterationCountCheck( - ForEpilogue ? EPI.EpilogueVF : VF, ForEpilogue ? EPI.EpilogueUF : UF); + Value *CheckMinIters = + createIterationCountCheck(ForEpilogue ? EPI.EpilogueVF : EPI.MainLoopVF, + ForEpilogue ? EPI.EpilogueUF : EPI.MainLoopUF); BasicBlock *const TCCheckBlock = LoopVectorPreHeader; if (!ForEpilogue) @@ -7568,12 +7525,13 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass, LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), static_cast<DominatorTree *>(nullptr), LI, nullptr, "vector.ph"); - if (ForEpilogue) { // Save the trip count so we don't have to regenerate it in the // vec.epilog.iter.check. This is safe to do because the trip count // generated here dominates the vector epilog iter check. EPI.TripCount = Count; + } else { + VectorPHVPBB = replaceVPBBWithIRVPBB(VectorPHVPBB, LoopVectorPreHeader); } BranchInst &BI = @@ -7607,6 +7565,8 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { BasicBlock *VecEpilogueIterationCountCheck = SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->begin(), DT, LI, nullptr, "vec.epilog.iter.check", true); + VectorPHVPBB = replaceVPBBWithIRVPBB(VectorPHVPBB, LoopVectorPreHeader); + emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader, VecEpilogueIterationCountCheck); AdditionalBypassBlock = VecEpilogueIterationCountCheck; @@ -7661,7 +7621,6 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { Phi->removeIncomingValue(MemCheckBlock); } - replaceVPBBWithIRVPBB(Plan.getScalarPreheader(), LoopScalarPreHeader); return LoopVectorPreHeader; } @@ -7690,11 +7649,11 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters); if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) { - // FIXME: See test Transforms/LoopVectorize/branch-weights.ll. I don't - // think the MainLoopStep is correct. - unsigned MainLoopStep = UF * VF.getKnownMinValue(); + auto VScale = Cost->getVScaleForTuning(); + unsigned MainLoopStep = + estimateElementCount(EPI.MainLoopVF * EPI.MainLoopUF, VScale); unsigned EpilogueLoopStep = - EPI.EpilogueUF * EPI.EpilogueVF.getKnownMinValue(); + estimateElementCount(EPI.EpilogueVF * EPI.EpilogueUF, VScale); // We assume the remaining `Count` is equally distributed in // [0, MainLoopStep) // So the probability for `Count < EpilogueLoopStep` should be @@ -8159,7 +8118,7 @@ void VPRecipeBuilder::collectScaledReductions(VFRange &Range) { // extends are intended to be lowered along with the reduction itself. 
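The branch-weight update above assumes the remaining trip count at the epilogue check is uniformly distributed in [0, MainLoopStep), so the probability that it falls below EpilogueLoopStep is EpilogueLoopStep / MainLoopStep, with both steps now estimated through vscale-for-tuning rather than raw known-minimum values. A sketch of the estimate in plain C++ with hypothetical VFs, UFs and vscale:

#include <cassert>

static unsigned estimateElements(unsigned KnownMin, bool Scalable, unsigned VScale) {
  return Scalable ? KnownMin * VScale : KnownMin;
}

int main() {
  // Main loop <vscale x 4> with UF 2, epilogue fixed 4 with UF 1, vscale ~ 2.
  unsigned MainLoopStep = estimateElements(4 * 2, /*Scalable=*/true, /*VScale=*/2);
  unsigned EpilogueLoopStep = estimateElements(4 * 1, /*Scalable=*/false, /*VScale=*/2);
  assert(MainLoopStep == 16 && EpilogueLoopStep == 4);
  // Uniform remainder in [0, MainLoopStep) => P(Count < EpilogueLoopStep) = 4/16.
  double P = double(EpilogueLoopStep) / MainLoopStep;
  assert(P == 0.25);
  return 0;
}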
// Build up a set of partial reduction ops for efficient use checking. - SmallSet<User *, 4> PartialReductionOps; + SmallPtrSet<User *, 4> PartialReductionOps; for (const auto &[PartialRdx, _] : PartialReductionChains) PartialReductionOps.insert(PartialRdx.ExtendUser); @@ -8435,8 +8394,13 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, LVer.prepareNoAliasMetadata(); } + // Create initial base VPlan0, to serve as common starting point for all + // candidates built later for specific VF ranges. + auto VPlan0 = VPlanTransforms::buildVPlan0( + OrigLoop, *LI, Legal->getWidestInductionType(), + getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()), PSE); + auto MaxVFTimes2 = MaxVF * 2; - auto VPlan0 = VPlanTransforms::buildPlainCFG(OrigLoop, *LI); for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) { VFRange SubRange = {VF, MaxVFTimes2}; if (auto Plan = tryToBuildVPlanWithVPRecipes( @@ -8500,7 +8464,7 @@ static VPInstruction *addResumePhiRecipeForInduction( /// \p IVEndValues. static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan, DenseMap<VPValue *, VPValue *> &IVEndValues) { - VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType()); + VPTypeAnalysis TypeInfo(Plan); auto *ScalarPH = Plan.getScalarPreheader(); auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getPredecessors()[0]); VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion(); @@ -8675,23 +8639,17 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( // visit each basic block after having visited its predecessor basic blocks. // --------------------------------------------------------------------------- - // Create initial VPlan skeleton, having a basic block for the pre-header - // which contains SCEV expansions that need to happen before the CFG is - // modified; a basic block for the vector pre-header, followed by a region for - // the vector loop, followed by the middle basic block. The skeleton vector - // loop region contains a header and latch basic blocks. - bool RequiresScalarEpilogueCheck = LoopVectorizationPlanner::getDecisionAndClampRange( [this](ElementCount VF) { return !CM.requiresScalarEpilogue(VF.isVector()); }, Range); - VPlanTransforms::prepareForVectorization( - *Plan, Legal->getWidestInductionType(), PSE, RequiresScalarEpilogueCheck, - CM.foldTailByMasking(), OrigLoop, - getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()), - Legal->hasUncountableEarlyExit(), Range); + VPlanTransforms::handleEarlyExits(*Plan, Legal->hasUncountableEarlyExit(), + Range); + VPlanTransforms::addMiddleCheck(*Plan, RequiresScalarEpilogueCheck, + CM.foldTailByMasking()); + VPlanTransforms::createLoopRegions(*Plan); VPlanTransforms::createExtractsForLiveOuts(*Plan); @@ -8889,8 +8847,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( // TODO: Enable following transform when the EVL-version of extended-reduction // and mulacc-reduction are implemented. 
if (!CM.foldTailWithEVL()) { - VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM, - CM.CostKind); + VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind); VPlanTransforms::runPass(VPlanTransforms::convertToAbstractRecipes, *Plan, CostCtx, Range); } @@ -8977,11 +8934,14 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) { assert(!OrigLoop->isInnermost()); assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); - auto Plan = VPlanTransforms::buildPlainCFG(OrigLoop, *LI); - VPlanTransforms::prepareForVectorization( - *Plan, Legal->getWidestInductionType(), PSE, true, false, OrigLoop, - getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()), false, - Range); + auto Plan = VPlanTransforms::buildVPlan0( + OrigLoop, *LI, Legal->getWidestInductionType(), + getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()), PSE); + VPlanTransforms::handleEarlyExits(*Plan, + /*HasUncountableExit*/ false, Range); + VPlanTransforms::addMiddleCheck(*Plan, /*RequiresScalarEpilogue*/ true, + /*TailFolded*/ false); + VPlanTransforms::createLoopRegions(*Plan); for (ElementCount VF : Range) @@ -9114,6 +9074,16 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( CurrentLinkI->getFastMathFlags()); LinkVPBB->insert(FMulRecipe, CurrentLink->getIterator()); VecOp = FMulRecipe; + } else if (PhiR->isInLoop() && Kind == RecurKind::AddChainWithSubs && + CurrentLinkI->getOpcode() == Instruction::Sub) { + Type *PhiTy = PhiR->getUnderlyingValue()->getType(); + auto *Zero = Plan->getOrAddLiveIn(ConstantInt::get(PhiTy, 0)); + VPWidenRecipe *Sub = new VPWidenRecipe( + Instruction::Sub, {Zero, CurrentLink->getOperand(1)}, {}, + VPIRMetadata(), CurrentLinkI->getDebugLoc()); + Sub->setUnderlyingValue(CurrentLinkI); + LinkVPBB->insert(Sub, CurrentLink->getIterator()); + VecOp = Sub; } else { if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { if (isa<VPWidenRecipe>(CurrentLink)) { @@ -9407,13 +9377,6 @@ void VPDerivedIVRecipe::execute(VPTransformState &State) { State.Builder, Index, getStartValue()->getLiveInIRValue(), Step, Kind, cast_if_present<BinaryOperator>(FPBinOp)); DerivedIV->setName(Name); - // If index is the vector trip count, the concrete value will only be set in - // prepareToExecute, leading to missed simplifications, e.g. if it is 0. - // TODO: Remove the special case for the vector trip count once it is computed - // in VPlan and can be used during VPlan simplification. - assert((DerivedIV != Index || - getOperand(1) == &getParent()->getPlan()->getVectorTripCount()) && - "IV didn't need transforming?"); State.set(this, DerivedIV, VPLane(0)); } @@ -9515,8 +9478,8 @@ static bool processLoopInVPlanNativePath( { GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), CM.CostKind); - InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, - VF.Width, 1, &CM, BFI, PSI, Checks, BestPlan); + InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, VF.Width, 1, &CM, + BFI, PSI, Checks, BestPlan); LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" << L->getHeader()->getParent()->getName() << "\"\n"); LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false); @@ -9798,6 +9761,9 @@ static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) { else if (&*MainScalarPH->begin() != ResumePhi) ResumePhi->moveBefore(*MainScalarPH, MainScalarPH->begin()); } + // Add a user to to make sure the resume phi won't get removed. 
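// Illustrative sketch (not from the patch): the AddChainWithSubs handling above
// folds an in-loop "acc - x" link into the add chain by widening it as
// "acc + (0 - x)". A standalone scalar illustration of that identity; names are
// illustrative only.
#include <cassert>
#include <cstdint>
#include <vector>

inline int64_t reduceAddChainWithSubs(const std::vector<int64_t> &Vals,
                                      const std::vector<bool> &IsSub) {
  int64_t Acc = 0;
  for (size_t I = 0; I < Vals.size(); ++I) {
    // A subtract link is expressed as an add of the negated operand.
    int64_t Link = IsSub[I] ? int64_t(0) - Vals[I] : Vals[I];
    Acc += Link;
  }
  return Acc;
}

inline void reduceAddChainWithSubsExample() {
  // ((0 + 5) - 3) + 2 == 0 + 5 + (0 - 3) + 2 == 4
  assert(reduceAddChainWithSubs({5, 3, 2}, {false, true, false}) == 4);
}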
+ VPBuilder(MainScalarPH) + .createNaryOp(VPInstruction::ResumeForEpilogue, ResumePhi); } /// Prepare \p Plan for vectorizing the epilogue loop. That is, re-use expanded @@ -10171,8 +10137,8 @@ bool LoopVectorizePass::processLoop(Loop *L) { // Check if it is profitable to vectorize with runtime checks. bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled; - VPCostContext CostCtx(CM.TTI, *CM.TLI, CM.Legal->getWidestInductionType(), - CM, CM.CostKind); + VPCostContext CostCtx(CM.TTI, *CM.TLI, LVP.getPlanFor(VF.Width), CM, + CM.CostKind); if (!ForceVectorization && !isOutsideLoopWorkProfitable(Checks, VF, L, PSE, CostCtx, LVP.getPlanFor(VF.Width), SEL, @@ -10223,8 +10189,8 @@ bool LoopVectorizePass::processLoop(Loop *L) { } } else if (IC > 1 && UserIC == 1) { // Tell the user interleaving is beneficial, but it explicitly disabled. - LLVM_DEBUG( - dbgs() << "LV: Interleaving is beneficial but is explicitly disabled."); + LLVM_DEBUG(dbgs() << "LV: Interleaving is beneficial but is explicitly " + "disabled.\n"); IntDiagMsg = {"InterleavingBeneficialButDisabled", "the cost-model indicates that interleaving is beneficial " "but is explicitly disabled or interleave count is set to 1"}; @@ -10295,7 +10261,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { // interleave it. VPlan &BestPlan = LVP.getPlanFor(VF.Width); InnerLoopVectorizer Unroller( - L, PSE, LI, DT, TLI, TTI, AC, ORE, ElementCount::getFixed(1), + L, PSE, LI, DT, TTI, AC, ElementCount::getFixed(1), ElementCount::getFixed(1), IC, &CM, BFI, PSI, Checks, BestPlan); // TODO: Move to general VPlan pipeline once epilogue loops are also @@ -10330,20 +10296,16 @@ bool LoopVectorizePass::processLoop(Loop *L) { preparePlanForMainVectorLoop(*BestMainPlan, BestEpiPlan); EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1, BestEpiPlan); - EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, - EPI, &CM, BFI, PSI, Checks, - *BestMainPlan); + EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TTI, AC, EPI, &CM, + BFI, PSI, Checks, *BestMainPlan); auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, *BestMainPlan, MainILV, DT, false); ++LoopsVectorized; // Second pass vectorizes the epilogue and adjusts the control flow // edges from the first pass. - EPI.MainLoopVF = EPI.EpilogueVF; - EPI.MainLoopUF = EPI.EpilogueUF; - EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, - ORE, EPI, &CM, BFI, PSI, - Checks, BestEpiPlan); + EpilogueVectorizerEpilogueLoop EpilogILV( + L, PSE, LI, DT, TTI, AC, EPI, &CM, BFI, PSI, Checks, BestEpiPlan); EpilogILV.setTripCount(MainILV.getTripCount()); preparePlanForEpilogueVectorLoop(BestEpiPlan, L, ExpandedSCEVs, EPI); @@ -10368,7 +10330,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { if (!Checks.hasChecks()) DisableRuntimeUnroll = true; } else { - InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, + InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, VF.MinProfitableTripCount, IC, &CM, BFI, PSI, Checks, BestPlan); // TODO: Move to general VPlan pipeline once epilogue loops are also diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 39011e7..37dc414 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -525,17 +525,17 @@ static bool isSplat(ArrayRef<Value *> VL) { /// instructions, we need to use the converted opcode along with the original /// uses. 
/// \param I The instruction to check for commutativity -/// \param InstWithUses The instruction whose uses are analyzed for special +/// \param ValWithUses The value whose uses are analyzed for special /// patterns -static bool isCommutative(Instruction *I, Instruction *InstWithUses) { +static bool isCommutative(Instruction *I, Value *ValWithUses) { if (auto *Cmp = dyn_cast<CmpInst>(I)) return Cmp->isCommutative(); if (auto *BO = dyn_cast<BinaryOperator>(I)) return BO->isCommutative() || (BO->getOpcode() == Instruction::Sub && - !InstWithUses->hasNUsesOrMore(UsesLimit) && + !ValWithUses->hasNUsesOrMore(UsesLimit) && all_of( - InstWithUses->uses(), + ValWithUses->uses(), [](const Use &U) { // Commutative, if icmp eq/ne sub, 0 CmpPredicate Pred; @@ -552,8 +552,8 @@ static bool isCommutative(Instruction *I, Instruction *InstWithUses) { Flag->isOne()); })) || (BO->getOpcode() == Instruction::FSub && - !InstWithUses->hasNUsesOrMore(UsesLimit) && - all_of(InstWithUses->uses(), [](const Use &U) { + !ValWithUses->hasNUsesOrMore(UsesLimit) && + all_of(ValWithUses->uses(), [](const Use &U) { return match(U.getUser(), m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get()))); })); @@ -570,6 +570,19 @@ static bool isCommutative(Instruction *I, Instruction *InstWithUses) { /// \returns true if the instruction is commutative, false otherwise static bool isCommutative(Instruction *I) { return isCommutative(I, I); } +/// \returns number of operands of \p I, considering commutativity. Returns 2 +/// for commutative instrinsics. +/// \param I The instruction to check for commutativity +static unsigned getNumberOfPotentiallyCommutativeOps(Instruction *I) { + if (isa<IntrinsicInst>(I) && isCommutative(I)) { + // IntrinsicInst::isCommutative returns true if swapping the first "two" + // arguments to the intrinsic produces the same result. + constexpr unsigned IntrinsicNumOperands = 2; + return IntrinsicNumOperands; + } + return I->getNumOperands(); +} + template <typename T> static std::optional<unsigned> getInsertExtractIndex(const Value *Inst, unsigned Offset) { @@ -862,6 +875,16 @@ static std::optional<unsigned> getExtractIndex(const Instruction *E) { } namespace llvm { +/// Checks if the provided value does not require scheduling. It does not +/// require scheduling if this is not an instruction or it is an instruction +/// that does not read/write memory and all operands are either not instructions +/// or phi nodes or instructions from different blocks. +static bool areAllOperandsNonInsts(Value *V); +/// Checks if the provided value does not require scheduling. It does not +/// require scheduling if this is not an instruction or it is an instruction +/// that does not read/write memory and all users are phi nodes or instructions +/// from the different blocks. +static bool isUsedOutsideBlock(Value *V); /// Checks if the specified value does not require scheduling. It does not /// require scheduling if all operands and all users do not need to be scheduled /// in the current basic block. @@ -1307,6 +1330,7 @@ public: : MainOp(MainOp), AltOp(AltOp), HasCopyables(HasCopyables) {} static InstructionsState invalid() { return {nullptr, nullptr}; } + /// Checks if the value is a copyable element. bool isCopyableElement(Value *V) const { assert(valid() && "InstructionsState is invalid."); if (!HasCopyables) @@ -1338,6 +1362,8 @@ public: doesNotNeedToBeScheduled(V); // MainOp for copyables always schedulable to correctly identify // non-schedulable copyables. 
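// Illustrative sketch (not from the patch): why the use-driven commutativity
// checks above are sound. A "sub" may be treated as commutative when every use
// only compares the result against zero (a - b == 0 iff b - a == 0), and an
// "fsub" when every use immediately feeds fabs (|a - b| == |b - a|). Standalone
// illustration; names are illustrative only.
#include <cassert>
#include <cmath>
#include <cstdint>

inline void subCommutativityUnderUsesExample() {
  int64_t A = 7, B = 11;
  // An eq/ne-with-zero user cannot observe the operand order.
  assert(((A - B) == 0) == ((B - A) == 0));
  // An fabs user cannot observe the operand order either.
  double X = 2.5, Y = 9.25;
  assert(std::fabs(X - Y) == std::fabs(Y - X));
}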
+ if (getMainOp() == V) + return false; if (isCopyableElement(V)) { auto IsNonSchedulableCopyableElement = [this](Value *V) { auto *I = dyn_cast<Instruction>(V); @@ -1355,6 +1381,7 @@ public: doesNotNeedToBeScheduled(V); } + /// Checks if the state represents copyable instructions. bool areInstructionsWithCopyableElements() const { assert(valid() && "InstructionsState is invalid."); return HasCopyables; @@ -1886,6 +1913,7 @@ class BoUpSLP { class TreeEntry; class ScheduleEntity; class ScheduleData; + class ScheduleCopyableData; class ScheduleBundle; class ShuffleCostEstimator; class ShuffleInstructionBuilder; @@ -2246,6 +2274,7 @@ public: operator bool() const { return UserTE != nullptr; } }; + friend struct DenseMapInfo<EdgeInfo>; /// A helper class used for scoring candidates for two consecutive lanes. class LookAheadHeuristics { @@ -2384,6 +2413,11 @@ public: if (C1 && C2) return LookAheadHeuristics::ScoreConstants; + // Consider constants and buildvector compatible. + if ((C1 && isa<InsertElementInst>(V2)) || + (C2 && isa<InsertElementInst>(V1))) + return LookAheadHeuristics::ScoreConstants; + // Extracts from consecutive indexes of the same vector better score as // the extracts could be optimized away. Value *EV1; @@ -3010,10 +3044,9 @@ public: assert(S.valid() && "InstructionsState is invalid."); // IntrinsicInst::isCommutative returns true if swapping the first "two" // arguments to the intrinsic produces the same result. - constexpr unsigned IntrinsicNumOperands = 2; Instruction *MainOp = S.getMainOp(); unsigned NumOperands = MainOp->getNumOperands(); - ArgSize = isa<IntrinsicInst>(MainOp) ? IntrinsicNumOperands : NumOperands; + ArgSize = ::getNumberOfPotentiallyCommutativeOps(MainOp); OpsVec.resize(ArgSize); unsigned NumLanes = VL.size(); for (OperandDataVec &Ops : OpsVec) @@ -3038,7 +3071,7 @@ public: bool IsInverseOperation = false; if (S.isCopyableElement(VL[Lane])) { // The value is a copyable element. - IsInverseOperation = !isCommutative(MainOp); + IsInverseOperation = !isCommutative(MainOp, VL[Lane]); } else { assert(I && "Expected instruction"); auto [SelectedOp, Ops] = convertTo(I, S); @@ -4332,7 +4365,10 @@ private: } else { // Build a map for gathered scalars to the nodes where they are used. bool AllConstsOrCasts = true; - for (Value *V : VL) + for (Value *V : VL) { + if (S && S.areInstructionsWithCopyableElements() && + S.isCopyableElement(V)) + Last->addCopyableElement(V); if (!isConstant(V)) { auto *I = dyn_cast<CastInst>(V); AllConstsOrCasts &= I && I->getType()->isIntegerTy(); @@ -4340,6 +4376,7 @@ private: !UserTreeIdx.UserTE->isGather()) ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last); } + } if (AllConstsOrCasts) CastMaxMinBWSizes = std::make_pair(std::numeric_limits<unsigned>::max(), 1); @@ -4518,8 +4555,6 @@ private: bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1, Instruction *Inst2) { assert(Loc1.Ptr && isSimple(Inst1) && "Expected simple first instruction."); - if (!isSimple(Inst2)) - return true; // First check if the result is already in the cache. AliasCacheKey Key = std::make_pair(Inst1, Inst2); auto Res = AliasCache.try_emplace(Key); @@ -4528,7 +4563,6 @@ private: bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1)); // Store the result in the cache. Res.first->getSecond() = Aliased; - AliasCache.try_emplace(std::make_pair(Inst2, Inst1), Aliased); return Aliased; } @@ -4587,16 +4621,18 @@ private: /// List of hashes of vector of loads, which are known to be non vectorizable. 
DenseSet<size_t> ListOfKnonwnNonVectorizableLoads; - /// Represents a scheduling entity, either ScheduleData or ScheduleBundle. - /// ScheduleData used to gather dependecies for a single instructions, while - /// ScheduleBundle represents a batch of instructions, going to be groupped - /// together. + /// Represents a scheduling entity, either ScheduleData, ScheduleCopyableData + /// or ScheduleBundle. ScheduleData used to gather dependecies for a single + /// instructions, while ScheduleBundle represents a batch of instructions, + /// going to be groupped together. ScheduleCopyableData models extra user for + /// "copyable" instructions. class ScheduleEntity { friend class ScheduleBundle; friend class ScheduleData; + friend class ScheduleCopyableData; protected: - enum class Kind { ScheduleData, ScheduleBundle }; + enum class Kind { ScheduleData, ScheduleBundle, ScheduleCopyableData }; Kind getKind() const { return K; } ScheduleEntity(Kind K) : K(K) {} @@ -4615,17 +4651,79 @@ private: void setSchedulingPriority(int Priority) { SchedulingPriority = Priority; } int getSchedulingPriority() const { return SchedulingPriority; } bool isReady() const { - if (auto *SD = dyn_cast<ScheduleData>(this)) + if (const auto *SD = dyn_cast<ScheduleData>(this)) return SD->isReady(); + if (const auto *CD = dyn_cast<ScheduleCopyableData>(this)) + return CD->isReady(); return cast<ScheduleBundle>(this)->isReady(); } + /// Returns true if the dependency information has been calculated. + /// Note that depenendency validity can vary between instructions within + /// a single bundle. + bool hasValidDependencies() const { + if (const auto *SD = dyn_cast<ScheduleData>(this)) + return SD->hasValidDependencies(); + if (const auto *CD = dyn_cast<ScheduleCopyableData>(this)) + return CD->hasValidDependencies(); + return cast<ScheduleBundle>(this)->hasValidDependencies(); + } + /// Gets the number of unscheduled dependencies. + int getUnscheduledDeps() const { + if (const auto *SD = dyn_cast<ScheduleData>(this)) + return SD->getUnscheduledDeps(); + if (const auto *CD = dyn_cast<ScheduleCopyableData>(this)) + return CD->getUnscheduledDeps(); + return cast<ScheduleBundle>(this)->unscheduledDepsInBundle(); + } + /// Increments the number of unscheduled dependencies. + int incrementUnscheduledDeps(int Incr) { + if (auto *SD = dyn_cast<ScheduleData>(this)) + return SD->incrementUnscheduledDeps(Incr); + return cast<ScheduleCopyableData>(this)->incrementUnscheduledDeps(Incr); + } + /// Gets the number of dependencies. + int getDependencies() const { + if (const auto *SD = dyn_cast<ScheduleData>(this)) + return SD->getDependencies(); + return cast<ScheduleCopyableData>(this)->getDependencies(); + } + /// Gets the instruction. + Instruction *getInst() const { + if (const auto *SD = dyn_cast<ScheduleData>(this)) + return SD->getInst(); + return cast<ScheduleCopyableData>(this)->getInst(); + } + /// Gets/sets if the bundle is scheduled. 
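// Illustrative sketch (not from the patch): a minimal, self-contained version of
// the kind-tagged dispatch that the extended ScheduleEntity uses above. The real
// class goes through LLVM's isa/dyn_cast machinery; this sketch dispatches on a
// plain enum instead, and all names are illustrative only.
#include <cassert>

class EntityBase {
public:
  enum class Kind { Data, Bundle, CopyableData };
  explicit EntityBase(Kind K) : K(K) {}
  Kind getKind() const { return K; }

  // Common accessor that forwards to the concrete entity, mirroring
  // ScheduleEntity::getUnscheduledDeps().
  int getUnscheduledDeps() const;

private:
  Kind K;
};

class DataEntity : public EntityBase {
public:
  explicit DataEntity(int Deps)
      : EntityBase(Kind::Data), UnscheduledDeps(Deps) {}
  int UnscheduledDeps;
};

class CopyableEntity : public EntityBase {
public:
  explicit CopyableEntity(int Deps)
      : EntityBase(Kind::CopyableData), UnscheduledDeps(Deps) {}
  int UnscheduledDeps;
};

inline int EntityBase::getUnscheduledDeps() const {
  switch (getKind()) {
  case Kind::Data:
    return static_cast<const DataEntity *>(this)->UnscheduledDeps;
  case Kind::CopyableData:
    return static_cast<const CopyableEntity *>(this)->UnscheduledDeps;
  case Kind::Bundle:
    return 0; // A bundle would sum over its members here.
  }
  return 0;
}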
bool isScheduled() const { return IsScheduled; } void setScheduled(bool Scheduled) { IsScheduled = Scheduled; } static bool classof(const ScheduleEntity *) { return true; } + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + void dump(raw_ostream &OS) const { + if (const auto *SD = dyn_cast<ScheduleData>(this)) + return SD->dump(OS); + if (const auto *CD = dyn_cast<ScheduleCopyableData>(this)) + return CD->dump(OS); + return cast<ScheduleBundle>(this)->dump(OS); + } + + LLVM_DUMP_METHOD void dump() const { + dump(dbgs()); + dbgs() << '\n'; + } +#endif // if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) }; +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + friend inline raw_ostream &operator<<(raw_ostream &OS, + const BoUpSLP::ScheduleEntity &SE) { + SE.dump(OS); + return OS; + } +#endif + /// Contains all scheduling relevant data for an instruction. /// A ScheduleData either represents a single instruction or a member of an /// instruction bundle (= a group of instructions which is combined into a @@ -4688,10 +4786,18 @@ private: /// Clears all dependency information. void clearDependencies() { - Dependencies = InvalidDeps; - resetUnscheduledDeps(); + clearDirectDependencies(); MemoryDependencies.clear(); ControlDependencies.clear(); + } + + /// Clears all direct dependencies only, except for control and memory + /// dependencies. + /// Required for copyable elements to correctly handle control/memory deps + /// and avoid extra reclaculation of such deps. + void clearDirectDependencies() { + Dependencies = InvalidDeps; + resetUnscheduledDeps(); IsScheduled = false; } @@ -4781,7 +4887,7 @@ private: class ScheduleBundle final : public ScheduleEntity { /// The schedule data for the instructions in the bundle. - SmallVector<ScheduleData *> Bundle; + SmallVector<ScheduleEntity *> Bundle; /// True if this bundle is valid. bool IsValid = true; /// The TreeEntry that this instruction corresponds to. @@ -4797,7 +4903,7 @@ private: /// Verify basic self consistency properties void verify() const { - for (const ScheduleData *SD : Bundle) { + for (const ScheduleEntity *SD : Bundle) { if (SD->hasValidDependencies()) { assert(SD->getUnscheduledDeps() <= SD->getDependencies() && "invariant"); @@ -4817,7 +4923,7 @@ private: int unscheduledDepsInBundle() const { assert(*this && "bundle must not be empty"); int Sum = 0; - for (const ScheduleData *BundleMember : Bundle) { + for (const ScheduleEntity *BundleMember : Bundle) { if (BundleMember->getUnscheduledDeps() == ScheduleData::InvalidDeps) return ScheduleData::InvalidDeps; Sum += BundleMember->getUnscheduledDeps(); @@ -4829,7 +4935,7 @@ private: /// Note that depenendency validity can vary between instructions within /// a single bundle. bool hasValidDependencies() const { - return all_of(Bundle, [](const ScheduleData *SD) { + return all_of(Bundle, [](const ScheduleEntity *SD) { return SD->hasValidDependencies(); }); } @@ -4843,10 +4949,10 @@ private: /// Returns the bundle of scheduling data, associated with the current /// instruction. - ArrayRef<ScheduleData *> getBundle() { return Bundle; } - ArrayRef<const ScheduleData *> getBundle() const { return Bundle; } + ArrayRef<ScheduleEntity *> getBundle() { return Bundle; } + ArrayRef<const ScheduleEntity *> getBundle() const { return Bundle; } /// Adds an instruction to the bundle. - void add(ScheduleData *SD) { Bundle.push_back(SD); } + void add(ScheduleEntity *SD) { Bundle.push_back(SD); } /// Gets/sets the associated tree entry. 
void setTreeEntry(TreeEntry *TE) { this->TE = TE; } @@ -4863,8 +4969,11 @@ private: return; } OS << '['; - interleaveComma(Bundle, OS, - [&](const ScheduleData *SD) { OS << *SD->getInst(); }); + interleaveComma(Bundle, OS, [&](const ScheduleEntity *SD) { + if (isa<ScheduleCopyableData>(SD)) + OS << "<Copyable>"; + OS << *SD->getInst(); + }); OS << ']'; } @@ -4883,6 +4992,129 @@ private: } #endif + /// Contains all scheduling relevant data for the copyable instruction. + /// It models the virtual instructions, supposed to replace the original + /// instructions. E.g., if instruction %0 = load is a part of the bundle [%0, + /// %1], where %1 = add, then the ScheduleCopyableData models virtual + /// instruction %virt = add %0, 0. + class ScheduleCopyableData final : public ScheduleEntity { + /// The source schedule data for the instruction. + Instruction *Inst = nullptr; + /// The edge information for the instruction. + const EdgeInfo EI; + /// This ScheduleData is in the current scheduling region if this matches + /// the current SchedulingRegionID of BlockScheduling. + int SchedulingRegionID = 0; + /// Bundle, this data is part of. + ScheduleBundle &Bundle; + + public: + ScheduleCopyableData(int BlockSchedulingRegionID, Instruction *I, + const EdgeInfo &EI, ScheduleBundle &Bundle) + : ScheduleEntity(Kind::ScheduleCopyableData), Inst(I), EI(EI), + SchedulingRegionID(BlockSchedulingRegionID), Bundle(Bundle) {} + static bool classof(const ScheduleEntity *Entity) { + return Entity->getKind() == Kind::ScheduleCopyableData; + } + + /// Verify basic self consistency properties + void verify() { + if (hasValidDependencies()) { + assert(UnscheduledDeps <= Dependencies && "invariant"); + } else { + assert(UnscheduledDeps == Dependencies && "invariant"); + } + + if (IsScheduled) { + assert(hasValidDependencies() && UnscheduledDeps == 0 && + "unexpected scheduled state"); + } + } + + /// Returns true if the dependency information has been calculated. + /// Note that depenendency validity can vary between instructions within + /// a single bundle. + bool hasValidDependencies() const { + return Dependencies != ScheduleData::InvalidDeps; + } + + /// Returns true if it is ready for scheduling, i.e. it has no more + /// unscheduled depending instructions/bundles. + bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; } + + /// Modifies the number of unscheduled dependencies for this instruction, + /// and returns the number of remaining dependencies for the containing + /// bundle. + int incrementUnscheduledDeps(int Incr) { + assert(hasValidDependencies() && + "increment of unscheduled deps would be meaningless"); + UnscheduledDeps += Incr; + assert(UnscheduledDeps >= 0 && "invariant"); + return UnscheduledDeps; + } + + /// Sets the number of unscheduled dependencies to the number of + /// dependencies. + void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; } + + /// Gets the number of unscheduled dependencies. + int getUnscheduledDeps() const { return UnscheduledDeps; } + /// Gets the number of dependencies. + int getDependencies() const { return Dependencies; } + /// Initializes the number of dependencies. + void initDependencies() { Dependencies = 0; } + /// Increments the number of dependencies. + void incDependencies() { Dependencies++; } + + /// Gets scheduling region ID. + int getSchedulingRegionID() const { return SchedulingRegionID; } + + /// Gets the instruction. + Instruction *getInst() const { return Inst; } + + /// Clears all dependency information. 
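// Illustrative sketch (not from the patch): the dependency bookkeeping that
// ScheduleCopyableData implements above, in a self-contained form. Counters
// start at an "invalid" sentinel, are filled in during dependency calculation,
// and the node becomes ready once its unscheduled-dependency count reaches
// zero. All names are illustrative only.
#include <cassert>

struct DepCounter {
  static constexpr int InvalidDeps = -1;
  int Dependencies = InvalidDeps;
  int UnscheduledDeps = InvalidDeps;
  bool IsScheduled = false;

  bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
  void initDependencies() { Dependencies = 0; }
  void incDependencies() { ++Dependencies; }
  void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
  int incrementUnscheduledDeps(int Incr) {
    assert(hasValidDependencies() && "deps not calculated yet");
    UnscheduledDeps += Incr;
    assert(UnscheduledDeps >= 0 && "invariant");
    return UnscheduledDeps;
  }
  bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; }
};

inline void depCounterExample() {
  DepCounter D;
  D.initDependencies();
  D.incDependencies();      // one def-use dependency discovered
  D.resetUnscheduledDeps(); // UnscheduledDeps == 1
  assert(!D.isReady());
  D.incrementUnscheduledDeps(-1); // that dependency got scheduled
  assert(D.isReady());
}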
+ void clearDependencies() { + Dependencies = ScheduleData::InvalidDeps; + UnscheduledDeps = ScheduleData::InvalidDeps; + IsScheduled = false; + } + + /// Gets the edge information. + const EdgeInfo &getEdgeInfo() const { return EI; } + + /// Gets the bundle. + ScheduleBundle &getBundle() { return Bundle; } + const ScheduleBundle &getBundle() const { return Bundle; } + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + void dump(raw_ostream &OS) const { OS << "[Copyable]" << *getInst(); } + + LLVM_DUMP_METHOD void dump() const { + dump(dbgs()); + dbgs() << '\n'; + } +#endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + + private: + /// true, if it has valid dependency information. These nodes always have + /// only single dependency. + int Dependencies = ScheduleData::InvalidDeps; + + /// The number of dependencies minus the number of dependencies of scheduled + /// instructions. As soon as this is zero, the instruction/bundle gets ready + /// for scheduling. + /// Note that this is negative as long as Dependencies is not calculated. + int UnscheduledDeps = ScheduleData::InvalidDeps; + }; + +#ifndef NDEBUG + friend inline raw_ostream & + operator<<(raw_ostream &OS, const BoUpSLP::ScheduleCopyableData &SD) { + SD.dump(OS); + return OS; + } +#endif + friend struct GraphTraits<BoUpSLP *>; friend struct DOTGraphTraits<BoUpSLP *>; @@ -4909,6 +5141,10 @@ private: void clear() { ScheduledBundles.clear(); ScheduledBundlesList.clear(); + ScheduleCopyableDataMap.clear(); + ScheduleCopyableDataMapByInst.clear(); + ScheduleCopyableDataMapByInstUser.clear(); + ScheduleCopyableDataMapByUsers.clear(); ReadyInsts.clear(); ScheduleStart = nullptr; ScheduleEnd = nullptr; @@ -4935,7 +5171,7 @@ private: // Avoid lookup if can't possibly be in map. return nullptr; ScheduleData *SD = ScheduleDataMap.lookup(I); - if (SD && isInSchedulingRegion(SD)) + if (SD && isInSchedulingRegion(*SD)) return SD; return nullptr; } @@ -4944,6 +5180,201 @@ private: return getScheduleData(dyn_cast<Instruction>(V)); } + /// Returns the ScheduleCopyableData for the given edge (user tree entry and + /// operand number) and value. + ScheduleCopyableData *getScheduleCopyableData(const EdgeInfo &EI, + const Value *V) const { + if (ScheduleCopyableDataMap.empty()) + return nullptr; + auto It = ScheduleCopyableDataMap.find(std::make_pair(EI, V)); + if (It == ScheduleCopyableDataMap.end()) + return nullptr; + ScheduleCopyableData *SD = It->getSecond().get(); + if (!isInSchedulingRegion(*SD)) + return nullptr; + return SD; + } + + /// Returns the ScheduleCopyableData for the given user \p User, operand + /// number and operand \p V. + SmallVector<ScheduleCopyableData *> + getScheduleCopyableData(const Value *User, unsigned OperandIdx, + const Value *V) { + if (ScheduleCopyableDataMapByInstUser.empty()) + return {}; + const auto It = ScheduleCopyableDataMapByInstUser.find( + std::make_pair(std::make_pair(User, OperandIdx), V)); + if (It == ScheduleCopyableDataMapByInstUser.end()) + return {}; + SmallVector<ScheduleCopyableData *> Res; + for (ScheduleCopyableData *SD : It->getSecond()) { + if (isInSchedulingRegion(*SD)) + Res.push_back(SD); + } + return Res; + } + + /// Returns true if all operands of the given instruction \p User are + /// replaced by copyable data. + /// \param User The user instruction. + /// \param Op The operand, which might be replaced by the copyable data. + /// \param SLP The SLP tree. + /// \param NumOps The number of operands used. 
If the instruction uses the + /// same operand several times, check for the first use, then the second, + /// etc. + bool areAllOperandsReplacedByCopyableData(Instruction *User, + Instruction *Op, BoUpSLP &SLP, + unsigned NumOps) const { + assert(NumOps > 0 && "No operands"); + if (ScheduleCopyableDataMap.empty()) + return false; + SmallDenseMap<TreeEntry *, unsigned> PotentiallyReorderedEntriesCount; + SmallDenseMap<const TreeEntry *, unsigned> OrderedEntriesCount; + for (const Use &U : User->operands()) { + if (U.get() != Op) + continue; + ArrayRef<TreeEntry *> Entries = SLP.getTreeEntries(User); + if (Entries.empty()) + return false; + // Check all tree entries, if they have operands replaced by copyable + // data. + for (TreeEntry *TE : Entries) { + // Check if the user is commutative. + // The commutatives are handled later, as their oeprands can be + // reordered. + // Same applies even for non-commutative cmps, because we can invert + // their predicate potentially and, thus, reorder the operands. + bool IsCommutativeUser = + ::isCommutative(TE->getMatchingMainOpOrAltOp(User), User); + EdgeInfo EI(TE, U.getOperandNo()); + if (!IsCommutativeUser && !isa<CmpInst>(User)) { + unsigned &OpCnt = + OrderedEntriesCount.try_emplace(TE, 0).first->getSecond(); + if (!getScheduleCopyableData(EI, Op) && OpCnt < NumOps) + return false; + // Found copyable operand - continue. + ++OpCnt; + continue; + } + ++PotentiallyReorderedEntriesCount.try_emplace(TE, 0) + .first->getSecond(); + } + } + // Check the commutative/cmp entries. + if (!PotentiallyReorderedEntriesCount.empty()) { + for (auto &P : PotentiallyReorderedEntriesCount) { + auto *It = find(P.first->Scalars, User); + assert(It != P.first->Scalars.end() && + "User is not in the tree entry"); + int Lane = std::distance(P.first->Scalars.begin(), It); + assert(Lane >= 0 && "Lane is not found"); + if (isa<StoreInst>(User) && !P.first->ReorderIndices.empty()) + Lane = P.first->ReorderIndices[Lane]; + assert(Lane < static_cast<int>(P.first->Scalars.size()) && + "Couldn't find extract lane"); + SmallVector<unsigned> OpIndices; + for (unsigned OpIdx : + seq<unsigned>(::getNumberOfPotentiallyCommutativeOps( + P.first->getMainOp()))) { + if (P.first->getOperand(OpIdx)[Lane] == Op && + getScheduleCopyableData(EdgeInfo(P.first, OpIdx), Op)) + --P.getSecond(); + } + } + return all_of(PotentiallyReorderedEntriesCount, + [&](const std::pair<const TreeEntry *, unsigned> &P) { + return P.second == NumOps - 1; + }); + } + return true; + } + + SmallVector<ScheduleCopyableData *> + getScheduleCopyableData(const Instruction *I) const { + if (ScheduleCopyableDataMapByInst.empty()) + return {}; + const auto It = ScheduleCopyableDataMapByInst.find(I); + if (It == ScheduleCopyableDataMapByInst.end()) + return {}; + SmallVector<ScheduleCopyableData *> Res; + for (ScheduleCopyableData *SD : It->getSecond()) { + if (isInSchedulingRegion(*SD)) + Res.push_back(SD); + } + return Res; + } + + SmallVector<ScheduleCopyableData *> + getScheduleCopyableDataUsers(const Instruction *User) const { + if (ScheduleCopyableDataMapByUsers.empty()) + return {}; + const auto It = ScheduleCopyableDataMapByUsers.find(User); + if (It == ScheduleCopyableDataMapByUsers.end()) + return {}; + SmallVector<ScheduleCopyableData *> Res; + for (ScheduleCopyableData *SD : It->getSecond()) { + if (isInSchedulingRegion(*SD)) + Res.push_back(SD); + } + return Res; + } + + ScheduleCopyableData &addScheduleCopyableData(const EdgeInfo &EI, + Instruction *I, + int SchedulingRegionID, + ScheduleBundle 
&Bundle) { + assert(!getScheduleCopyableData(EI, I) && "already in the map"); + ScheduleCopyableData *CD = + ScheduleCopyableDataMap + .try_emplace(std::make_pair(EI, I), + std::make_unique<ScheduleCopyableData>( + SchedulingRegionID, I, EI, Bundle)) + .first->getSecond() + .get(); + ScheduleCopyableDataMapByInst[I].push_back(CD); + if (EI.UserTE) { + ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx); + const auto *It = find(Op, I); + assert(It != Op.end() && "Lane not set"); + SmallPtrSet<Instruction *, 4> Visited; + do { + int Lane = std::distance(Op.begin(), It); + assert(Lane >= 0 && "Lane not set"); + if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) && + !EI.UserTE->ReorderIndices.empty()) + Lane = EI.UserTE->ReorderIndices[Lane]; + assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) && + "Couldn't find extract lane"); + auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]); + if (!Visited.insert(In).second) { + It = find(make_range(std::next(It), Op.end()), I); + continue; + } + ScheduleCopyableDataMapByInstUser + .try_emplace(std::make_pair(std::make_pair(In, EI.EdgeIdx), I)) + .first->getSecond() + .push_back(CD); + ScheduleCopyableDataMapByUsers.try_emplace(I) + .first->getSecond() + .insert(CD); + // Remove extra deps for users, becoming non-immediate users of the + // instruction. It may happen, if the chain of same copyable elements + // appears in the tree. + if (In == I) { + EdgeInfo UserEI = EI.UserTE->UserTreeIndex; + if (ScheduleCopyableData *UserCD = + getScheduleCopyableData(UserEI, In)) + ScheduleCopyableDataMapByUsers[I].remove(UserCD); + } + It = find(make_range(std::next(It), Op.end()), I); + } while (It != Op.end()); + } else { + ScheduleCopyableDataMapByUsers.try_emplace(I).first->getSecond().insert( + CD); + } + return *CD; + } + ArrayRef<ScheduleBundle *> getScheduleBundles(Value *V) const { auto *I = dyn_cast<Instruction>(V); if (!I) @@ -4954,34 +5385,44 @@ private: return It->getSecond(); } - bool isInSchedulingRegion(ScheduleData *SD) const { - return SD->getSchedulingRegionID() == SchedulingRegionID; - } - - bool isInSchedulingRegion(const ScheduleBundle &Bundle) const { - return all_of(Bundle.getBundle(), [&](const ScheduleData *BundleMember) { - return BundleMember->getSchedulingRegionID() == SchedulingRegionID; - }); + /// Returns true if the entity is in the scheduling region. + bool isInSchedulingRegion(const ScheduleEntity &SD) const { + if (const auto *Data = dyn_cast<ScheduleData>(&SD)) + return Data->getSchedulingRegionID() == SchedulingRegionID; + if (const auto *CD = dyn_cast<ScheduleCopyableData>(&SD)) + return CD->getSchedulingRegionID() == SchedulingRegionID; + return all_of(cast<ScheduleBundle>(SD).getBundle(), + [&](const ScheduleEntity *BundleMember) { + return isInSchedulingRegion(*BundleMember); + }); } /// Marks an instruction as scheduled and puts all dependent ready /// instructions into the ready-list. template <typename ReadyListType> - void schedule(ScheduleEntity *Data, ReadyListType &ReadyList) { - auto ProcessBundleMember = [&](ScheduleData *BundleMember, - ScheduleBundle *Bundle) { + void schedule(const BoUpSLP &R, const InstructionsState &S, + const EdgeInfo &EI, ScheduleEntity *Data, + ReadyListType &ReadyList) { + auto ProcessBundleMember = [&](ScheduleEntity *BundleMember, + ArrayRef<ScheduleBundle *> Bundles) { // Handle the def-use chain dependencies. // Decrement the unscheduled counter and insert to ready list if ready. 
- auto DecrUnsched = [&](ScheduleData *Data, bool IsControl = false) { + auto DecrUnsched = [&](auto *Data, bool IsControl = false) { if ((IsControl || Data->hasValidDependencies()) && Data->incrementUnscheduledDeps(-1) == 0) { // There are no more unscheduled dependencies after // decrementing, so we can put the dependent instruction // into the ready list. - if (ArrayRef<ScheduleBundle *> Bundles = - getScheduleBundles(Data->getInst()); - !Bundles.empty()) { + SmallVector<ScheduleBundle *, 1> CopyableBundle; + ArrayRef<ScheduleBundle *> Bundles; + if (auto *CD = dyn_cast<ScheduleCopyableData>(Data)) { + CopyableBundle.push_back(&CD->getBundle()); + Bundles = CopyableBundle; + } else { + Bundles = getScheduleBundles(Data->getInst()); + } + if (!Bundles.empty()) { for (ScheduleBundle *Bundle : Bundles) { if (Bundle->unscheduledDepsInBundle() == 0) { assert(!Bundle->isScheduled() && @@ -4995,12 +5436,23 @@ private: } assert(!Data->isScheduled() && "already scheduled bundle gets ready"); + assert(!isa<ScheduleCopyableData>(Data) && + "Expected non-copyable data"); ReadyList.insert(Data); LLVM_DEBUG(dbgs() << "SLP: gets ready: " << *Data << "\n"); } }; - auto DecrUnschedForInst = [&](Instruction *I) { + auto DecrUnschedForInst = [&](Instruction *User, unsigned OpIdx, + Instruction *I) { + if (!ScheduleCopyableDataMap.empty()) { + SmallVector<ScheduleCopyableData *> CopyableData = + getScheduleCopyableData(User, OpIdx, I); + for (ScheduleCopyableData *CD : CopyableData) + DecrUnsched(CD, /*IsControl=*/false); + if (!CopyableData.empty()) + return; + } if (ScheduleData *OpSD = getScheduleData(I)) DecrUnsched(OpSD, /*IsControl=*/false); }; @@ -5008,45 +5460,101 @@ private: // If BundleMember is a vector bundle, its operands may have been // reordered during buildTree(). We therefore need to get its operands // through the TreeEntry. - if (Bundle) { - // Need to search for the lane since the tree entry can be reordered. + if (!Bundles.empty()) { auto *In = BundleMember->getInst(); - int Lane = std::distance(Bundle->getTreeEntry()->Scalars.begin(), - find(Bundle->getTreeEntry()->Scalars, In)); - assert(Lane >= 0 && "Lane not set"); - - // Since vectorization tree is being built recursively this assertion - // ensures that the tree entry has all operands set before reaching - // this code. Couple of exceptions known at the moment are extracts - // where their second (immediate) operand is not added. Since - // immediates do not affect scheduler behavior this is considered - // okay. - assert(In && - (isa<ExtractValueInst, ExtractElementInst, CallBase>(In) || - In->getNumOperands() == - Bundle->getTreeEntry()->getNumOperands()) && - "Missed TreeEntry operands?"); - - for (unsigned OpIdx : - seq<unsigned>(Bundle->getTreeEntry()->getNumOperands())) - if (auto *I = dyn_cast<Instruction>( - Bundle->getTreeEntry()->getOperand(OpIdx)[Lane])) { - LLVM_DEBUG(dbgs() - << "SLP: check for readiness (def): " << *I << "\n"); - DecrUnschedForInst(I); + // Count uses of each instruction operand. + SmallDenseMap<const Instruction *, unsigned> OperandsUses; + unsigned TotalOpCount = 0; + if (isa<ScheduleCopyableData>(BundleMember)) { + // Copyable data is used only once (uses itself). + TotalOpCount = OperandsUses[In] = 1; + } else { + for (const Use &U : In->operands()) { + if (auto *I = dyn_cast<Instruction>(U.get())) { + auto Res = OperandsUses.try_emplace(I, 0); + ++Res.first->getSecond(); + ++TotalOpCount; + } + } + } + // Decrement the unscheduled counter and insert to ready list if + // ready. 
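// Illustrative sketch (not from the patch): a bundle only becomes ready once
// every member is ready, which the code above checks through
// unscheduledDepsInBundle() == 0 after decrementing a member. Minimal
// standalone form of that aggregation; names are illustrative only.
#include <cassert>
#include <vector>

struct MemberDeps {
  static constexpr int InvalidDeps = -1;
  int UnscheduledDeps = InvalidDeps;
};

inline int unscheduledDepsInBundle(const std::vector<MemberDeps> &Bundle) {
  int Sum = 0;
  for (const MemberDeps &M : Bundle) {
    // If any member has no calculated dependencies yet, the bundle as a whole
    // is not ready.
    if (M.UnscheduledDeps == MemberDeps::InvalidDeps)
      return MemberDeps::InvalidDeps;
    Sum += M.UnscheduledDeps;
  }
  return Sum;
}

inline void bundleReadinessExample() {
  std::vector<MemberDeps> Bundle = {{1}, {0}};
  assert(unscheduledDepsInBundle(Bundle) == 1 && "one dependency outstanding");
  Bundle[0].UnscheduledDeps = 0;
  assert(unscheduledDepsInBundle(Bundle) == 0 && "bundle is ready now");
}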
+ auto DecrUnschedForInst = [&](Instruction *I, TreeEntry *UserTE, + unsigned OpIdx) { + if (!ScheduleCopyableDataMap.empty()) { + const EdgeInfo EI = {UserTE, OpIdx}; + if (ScheduleCopyableData *CD = getScheduleCopyableData(EI, I)) { + DecrUnsched(CD, /*IsControl=*/false); + return; + } + } + auto It = OperandsUses.find(I); + assert(It != OperandsUses.end() && "Operand not found"); + if (It->second > 0) { + --It->getSecond(); + assert(TotalOpCount > 0 && "No more operands to decrement"); + --TotalOpCount; + if (ScheduleData *OpSD = getScheduleData(I)) + DecrUnsched(OpSD, /*IsControl=*/false); } + }; + + for (ScheduleBundle *Bundle : Bundles) { + if (ScheduleCopyableDataMap.empty() && TotalOpCount == 0) + break; + // Need to search for the lane since the tree entry can be + // reordered. + int Lane = std::distance(Bundle->getTreeEntry()->Scalars.begin(), + find(Bundle->getTreeEntry()->Scalars, In)); + assert(Lane >= 0 && "Lane not set"); + if (isa<StoreInst>(In) && + !Bundle->getTreeEntry()->ReorderIndices.empty()) + Lane = Bundle->getTreeEntry()->ReorderIndices[Lane]; + assert(Lane < static_cast<int>( + Bundle->getTreeEntry()->Scalars.size()) && + "Couldn't find extract lane"); + + // Since vectorization tree is being built recursively this + // assertion ensures that the tree entry has all operands set before + // reaching this code. Couple of exceptions known at the moment are + // extracts where their second (immediate) operand is not added. + // Since immediates do not affect scheduler behavior this is + // considered okay. + assert(In && + (isa<ExtractValueInst, ExtractElementInst, CallBase>(In) || + In->getNumOperands() == + Bundle->getTreeEntry()->getNumOperands() || + Bundle->getTreeEntry()->isCopyableElement(In)) && + "Missed TreeEntry operands?"); + + for (unsigned OpIdx : + seq<unsigned>(Bundle->getTreeEntry()->getNumOperands())) + if (auto *I = dyn_cast<Instruction>( + Bundle->getTreeEntry()->getOperand(OpIdx)[Lane])) { + LLVM_DEBUG(dbgs() << "SLP: check for readiness (def): " << *I + << "\n"); + DecrUnschedForInst(I, Bundle->getTreeEntry(), OpIdx); + } + } } else { // If BundleMember is a stand-alone instruction, no operand reordering // has taken place, so we directly access its operands. - for (Use &U : BundleMember->getInst()->operands()) + for (Use &U : BundleMember->getInst()->operands()) { if (auto *I = dyn_cast<Instruction>(U.get())) { LLVM_DEBUG(dbgs() << "SLP: check for readiness (def): " << *I << "\n"); - DecrUnschedForInst(I); + DecrUnschedForInst(BundleMember->getInst(), U.getOperandNo(), I); } + } } // Handle the memory dependencies. - for (ScheduleData *MemoryDep : BundleMember->getMemoryDependencies()) { + auto *SD = dyn_cast<ScheduleData>(BundleMember); + if (!SD) + return; + SmallPtrSet<const ScheduleData *, 4> VisitedMemory; + for (ScheduleData *MemoryDep : SD->getMemoryDependencies()) { + if (!VisitedMemory.insert(MemoryDep).second) + continue; // There are no more unscheduled dependencies after decrementing, // so we can put the dependent instruction into the ready list. LLVM_DEBUG(dbgs() << "SLP: check for readiness (mem): " @@ -5054,7 +5562,10 @@ private: DecrUnsched(MemoryDep); } // Handle the control dependencies. 
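// Illustrative sketch (not from the patch): the OperandsUses map above counts
// how many times each operand value appears in the member's operand list, so a
// repeated operand is released exactly once per counted use and never more.
// Standalone illustration with plain containers; names are illustrative only.
#include <cassert>
#include <map>
#include <string>
#include <vector>

inline void operandUseCountingExample() {
  // Something like "add %x, %x" mentions the same operand twice.
  std::vector<std::string> Operands = {"%x", "%x", "%y"};

  std::map<std::string, unsigned> OperandsUses;
  unsigned TotalOpCount = 0;
  for (const std::string &Op : Operands) {
    ++OperandsUses[Op];
    ++TotalOpCount;
  }
  assert(OperandsUses["%x"] == 2 && TotalOpCount == 3);

  // Releasing a dependency consumes one counted use at a time.
  auto Release = [&](const std::string &Op) {
    auto It = OperandsUses.find(Op);
    assert(It != OperandsUses.end() && It->second > 0 && "operand not counted");
    --It->second;
    --TotalOpCount;
  };
  Release("%x");
  Release("%x");
  Release("%y");
  assert(TotalOpCount == 0 && "every counted use was released exactly once");
}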
- for (ScheduleData *Dep : BundleMember->getControlDependencies()) { + SmallPtrSet<const ScheduleData *, 4> VisitedControl; + for (ScheduleData *Dep : SD->getControlDependencies()) { + if (!VisitedControl.insert(Dep).second) + continue; // There are no more unscheduled dependencies after decrementing, // so we can put the dependent instruction into the ready list. LLVM_DEBUG(dbgs() @@ -5065,23 +5576,29 @@ private: if (auto *SD = dyn_cast<ScheduleData>(Data)) { SD->setScheduled(/*Scheduled=*/true); LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n"); - ProcessBundleMember(SD, nullptr); + ProcessBundleMember(SD, {}); } else { ScheduleBundle &Bundle = *cast<ScheduleBundle>(Data); Bundle.setScheduled(/*Scheduled=*/true); LLVM_DEBUG(dbgs() << "SLP: schedule " << Bundle << "\n"); - auto AreAllBundlesScheduled = [&](const ScheduleData *SD) { - ArrayRef<ScheduleBundle *> SDBundles = - getScheduleBundles(SD->getInst()); - return !SDBundles.empty() && - all_of(SDBundles, [&](const ScheduleBundle *SDBundle) { - return SDBundle->isScheduled(); - }); - }; - for (ScheduleData *SD : Bundle.getBundle()) { - if (AreAllBundlesScheduled(SD)) { + auto AreAllBundlesScheduled = + [&](const ScheduleEntity *SD, + ArrayRef<ScheduleBundle *> SDBundles) { + if (isa<ScheduleCopyableData>(SD)) + return true; + return !SDBundles.empty() && + all_of(SDBundles, [&](const ScheduleBundle *SDBundle) { + return SDBundle->isScheduled(); + }); + }; + for (ScheduleEntity *SD : Bundle.getBundle()) { + ArrayRef<ScheduleBundle *> SDBundles; + if (!isa<ScheduleCopyableData>(SD)) + SDBundles = getScheduleBundles(SD->getInst()); + if (AreAllBundlesScheduled(SD, SDBundles)) { SD->setScheduled(/*Scheduled=*/true); - ProcessBundleMember(SD, &Bundle); + ProcessBundleMember(SD, isa<ScheduleCopyableData>(SD) ? &Bundle + : SDBundles); } } } @@ -5109,7 +5626,7 @@ private: auto *SD = getScheduleData(I); if (!SD) continue; - assert(isInSchedulingRegion(SD) && + assert(isInSchedulingRegion(*SD) && "primary schedule data not in window?"); SD->verify(); } @@ -5150,8 +5667,11 @@ private: /// Build a bundle from the ScheduleData nodes corresponding to the /// scalar instruction for each lane. + /// \param VL The list of scalar instructions. + /// \param S The state of the instructions. + /// \param EI The edge in the SLP graph or the user node/operand number. ScheduleBundle &buildBundle(ArrayRef<Value *> VL, - const InstructionsState &S); + const InstructionsState &S, const EdgeInfo &EI); /// Checks if a bundle of instructions can be scheduled, i.e. has no /// cyclic dependencies. This is only a dry-run, no instructions are @@ -5160,7 +5680,7 @@ private: /// std::nullopt if \p VL is allowed to be scheduled. std::optional<ScheduleBundle *> tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, - const InstructionsState &S); + const InstructionsState &S, const EdgeInfo &EI); /// Allocates schedule data chunk. ScheduleData *allocateScheduleDataChunks(); @@ -5178,7 +5698,8 @@ private: /// Updates the dependency information of a bundle and of all instructions/ /// bundles which depend on the original bundle. void calculateDependencies(ScheduleBundle &Bundle, bool InsertInReadyList, - BoUpSLP *SLP); + BoUpSLP *SLP, + ArrayRef<ScheduleData *> ControlDeps = {}); /// Sets all instruction in the scheduling region to un-scheduled. void resetSchedule(); @@ -5200,6 +5721,48 @@ private: /// ScheduleData structures are recycled. 
SmallDenseMap<Instruction *, ScheduleData *> ScheduleDataMap; + /// Attaches ScheduleCopyableData to EdgeInfo (UserTreeEntry + operand + /// number) and the operand instruction, represented as a copyable element. + SmallDenseMap<std::pair<EdgeInfo, const Value *>, + std::unique_ptr<ScheduleCopyableData>> + ScheduleCopyableDataMap; + + /// Represents the mapping between an instruction and all related + /// ScheduleCopyableData (for all uses in the tree, represented as copyable + /// elements). The SLP tree may contain several representations of the same + /// instruction. + SmallDenseMap<const Instruction *, SmallVector<ScheduleCopyableData *>> + ScheduleCopyableDataMapByInst; + + /// Represents the mapping between user value and operand number, the operand + /// value and all related ScheduleCopyableData. The relation is 1:n, because + /// the same user may reference the same operand in different tree entries + /// and the operand may be modelled by different copyable data elements. + SmallDenseMap<std::pair<std::pair<const Value *, unsigned>, const Value *>, + SmallVector<ScheduleCopyableData *>> + ScheduleCopyableDataMapByInstUser; + + /// Represents the mapping between an instruction and all related + /// ScheduleCopyableData. It maps the actual instruction to the last copyable + /// data element in the chain. E.g., if the graph models the following + /// instructions: + /// %0 = non-add instruction ... + /// ... + /// %4 = add %3, 1 + /// %5 = add %4, 1 + /// %6 = insertelement poison, %0, 0 + /// %7 = insertelement %6, %5, 1 + /// And the graph is modeled as: + /// [%5, %0] -> [%4, copyable %0 <0> ] -> [%3, copyable %0 <1> ] + /// -> [1, 0] -> [%1, 0] + /// + /// this map will map %0 only to the copyable element <1>, which is the last + /// user (the direct user of the actual instruction). <0> uses <1>, so <1> will + /// keep the map to <0>, not %0. + SmallDenseMap<const Instruction *, + SmallSetVector<ScheduleCopyableData *, 4>> + ScheduleCopyableDataMapByUsers; + /// Attaches ScheduleBundle to Instruction. SmallDenseMap<Instruction *, SmallVector<ScheduleBundle *>> ScheduledBundles; @@ -5246,7 +5809,7 @@ private: /// Performs the "real" scheduling. Done before vectorization is actually /// performed in a basic block. - void scheduleBlock(BlockScheduling *BS); + void scheduleBlock(const BoUpSLP &R, BlockScheduling *BS); /// List of users to ignore during scheduling and that don't need extracting.
const SmallDenseSet<Value *> *UserIgnoreList = nullptr; @@ -5319,6 +5882,30 @@ private: } // end namespace slpvectorizer +template <> struct DenseMapInfo<BoUpSLP::EdgeInfo> { + using FirstInfo = DenseMapInfo<BoUpSLP::TreeEntry *>; + using SecondInfo = DenseMapInfo<unsigned>; + static BoUpSLP::EdgeInfo getEmptyKey() { + return BoUpSLP::EdgeInfo(FirstInfo::getEmptyKey(), + SecondInfo::getEmptyKey()); + } + + static BoUpSLP::EdgeInfo getTombstoneKey() { + return BoUpSLP::EdgeInfo(FirstInfo::getTombstoneKey(), + SecondInfo::getTombstoneKey()); + } + + static unsigned getHashValue(const BoUpSLP::EdgeInfo &Val) { + return detail::combineHashValue(FirstInfo::getHashValue(Val.UserTE), + SecondInfo::getHashValue(Val.EdgeIdx)); + } + + static bool isEqual(const BoUpSLP::EdgeInfo &LHS, + const BoUpSLP::EdgeInfo &RHS) { + return LHS == RHS; + } +}; + template <> struct GraphTraits<BoUpSLP *> { using TreeEntry = BoUpSLP::TreeEntry; @@ -7195,12 +7782,45 @@ bool BoUpSLP::isProfitableToReorder() const { // Check if the tree has only single store and single (unordered) load node, // other nodes are phis or geps/binops, combined with phis, and/or single // gather load node - bool HasPhis = false; if (VectorizableTree.front()->hasState() && VectorizableTree.front()->getOpcode() == Instruction::PHI && VectorizableTree.front()->Scalars.size() == TinyVF && VectorizableTree.front()->getNumOperands() > PhiOpsLimit) return false; + // Single node, which require reorder - skip. + if (VectorizableTree.front()->hasState() && + VectorizableTree.front()->getOpcode() == Instruction::Store && + VectorizableTree.front()->ReorderIndices.empty()) { + const unsigned ReorderedSplitsCnt = + count_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) { + return TE->State == TreeEntry::SplitVectorize && + !TE->ReorderIndices.empty() && TE->UserTreeIndex.UserTE && + TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize && + ::isCommutative(TE->UserTreeIndex.UserTE->getMainOp()); + }); + if (ReorderedSplitsCnt <= 1 && + static_cast<unsigned>(count_if( + VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) { + return ((!TE->isGather() && + (TE->ReorderIndices.empty() || + (TE->UserTreeIndex.UserTE && + TE->UserTreeIndex.UserTE->State == + TreeEntry::Vectorize && + !TE->UserTreeIndex.UserTE->ReuseShuffleIndices + .empty()))) || + (TE->isGather() && TE->ReorderIndices.empty() && + (!TE->hasState() || TE->isAltShuffle() || + TE->getOpcode() == Instruction::Load || + TE->getOpcode() == Instruction::ZExt || + TE->getOpcode() == Instruction::SExt))) && + (VectorizableTree.front()->getVectorFactor() > TinyVF || + !TE->isGather() || none_of(TE->Scalars, [&](Value *V) { + return !isConstant(V) && isVectorized(V); + })); + })) >= VectorizableTree.size() - ReorderedSplitsCnt) + return false; + } + bool HasPhis = false; bool HasLoad = true; unsigned GatherLoads = 0; for (const std::unique_ptr<TreeEntry> &TE : @@ -9772,7 +10392,8 @@ static bool tryToFindDuplicates(SmallVectorImpl<Value *> &VL, }))) { if (TryPad && UniquePositions.size() > 1 && NumUniqueScalarValues > 1 && S.getMainOp()->isSafeToRemove() && - all_of(UniqueValues, IsaPred<Instruction, PoisonValue>)) { + (S.areInstructionsWithCopyableElements() || + all_of(UniqueValues, IsaPred<Instruction, PoisonValue>))) { // Find the number of elements, which forms full vectors. 
unsigned PWSz = getFullVectorNumberOfElements( TTI, UniqueValues.front()->getType(), UniqueValues.size()); @@ -9789,8 +10410,8 @@ static bool tryToFindDuplicates(SmallVectorImpl<Value *> &VL, PaddedUniqueValues.append( PWSz - UniqueValues.size(), PoisonValue::get(UniqueValues.front()->getType())); - // Check that extended with poisons operations are still valid for - // vectorization (div/rem are not allowed). + // Check that extended with poisons/copyable operations are still valid + // for vectorization (div/rem are not allowed). if (!S.areInstructionsWithCopyableElements() && !getSameOpcode(PaddedUniqueValues, TLI).valid()) { LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); @@ -9952,35 +10573,41 @@ class InstructionsCompatibilityAnalysis { unsigned MainOpcode = 0; Instruction *MainOp = nullptr; + /// Checks if the opcode is supported as the main opcode for copyable + /// elements. + static bool isSupportedOpcode(const unsigned Opcode) { + return Opcode == Instruction::Add || Opcode == Instruction::LShr; + } + /// Identifies the best candidate value, which represents main opcode /// operation. /// Currently the best candidate is the Add instruction with the parent /// block with the highest DFS incoming number (block, that dominates other). - void findAndSetMainInstruction(ArrayRef<Value *> VL) { + void findAndSetMainInstruction(ArrayRef<Value *> VL, const BoUpSLP &R) { BasicBlock *Parent = nullptr; // Checks if the instruction has supported opcode. - auto IsSupportedOpcode = [](Instruction *I) { - return I && I->getOpcode() == Instruction::Add; + auto IsSupportedInstruction = [&](Instruction *I) { + return I && isSupportedOpcode(I->getOpcode()) && + (!doesNotNeedToBeScheduled(I) || !R.isVectorized(I)); }; + // Exclude operands instructions immediately to improve compile time, it + // will be unable to schedule anyway. 
SmallDenseSet<Value *, 8> Operands; + SmallMapVector<unsigned, SmallVector<Instruction *>, 4> Candidates; for (Value *V : VL) { auto *I = dyn_cast<Instruction>(V); if (!I) continue; if (!DT.isReachableFromEntry(I->getParent())) continue; - if (!MainOp) { - MainOp = I; + if (Candidates.empty()) { + Candidates.try_emplace(I->getOpcode()).first->second.push_back(I); Parent = I->getParent(); Operands.insert(I->op_begin(), I->op_end()); continue; } if (Parent == I->getParent()) { - if (!IsSupportedOpcode(MainOp)) - MainOp = I; - if (MainOp->getOpcode() == I->getOpcode() && - doesNotNeedToBeScheduled(MainOp) && !doesNotNeedToBeScheduled(I)) - MainOp = I; + Candidates.try_emplace(I->getOpcode()).first->second.push_back(I); Operands.insert(I->op_begin(), I->op_end()); continue; } @@ -9992,24 +10619,35 @@ class InstructionsCompatibilityAnalysis { (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) && "Different nodes should have different DFS numbers"); if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) { - MainOp = I; + Candidates.clear(); + Candidates.try_emplace(I->getOpcode()).first->second.push_back(I); Parent = I->getParent(); Operands.clear(); Operands.insert(I->op_begin(), I->op_end()); } } - if (!IsSupportedOpcode(MainOp) || Operands.contains(MainOp)) { - MainOp = nullptr; - return; + unsigned BestOpcodeNum = 0; + MainOp = nullptr; + for (const auto &P : Candidates) { + if (P.second.size() < BestOpcodeNum) + continue; + for (Instruction *I : P.second) { + if (IsSupportedInstruction(I) && !Operands.contains(I)) { + MainOp = I; + BestOpcodeNum = P.second.size(); + break; + } + } } - MainOpcode = MainOp->getOpcode(); + if (MainOp) + MainOpcode = MainOp->getOpcode(); } /// Returns the idempotent value for the \p MainOp with the detected \p /// MainOpcode. For Add, returns 0. For Or, it should choose between false and /// the operand itself, since V or V == V. Value *selectBestIdempotentValue() const { - assert(MainOpcode == Instruction::Add && "Unsupported opcode"); + assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode"); return ConstantExpr::getBinOpIdentity(MainOpcode, MainOp->getType(), !MainOp->isCommutative()); } @@ -10022,13 +10660,8 @@ class InstructionsCompatibilityAnalysis { return {V, V}; if (!S.isCopyableElement(V)) return convertTo(cast<Instruction>(V), S).second; - switch (MainOpcode) { - case Instruction::Add: - return {V, selectBestIdempotentValue()}; - default: - break; - } - llvm_unreachable("Unsupported opcode"); + assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode"); + return {V, selectBestIdempotentValue()}; } /// Builds operands for the original instructions. @@ -10202,16 +10835,10 @@ public: return S; if (!VectorizeCopyableElements || !TryCopyableElementsVectorization) return S; - findAndSetMainInstruction(VL); + findAndSetMainInstruction(VL, R); if (!MainOp) return InstructionsState::invalid(); S = InstructionsState(MainOp, MainOp, /*HasCopyables=*/true); - // TODO: Remove this check once support for schulable copyables is landed. - if (any_of(VL, [&](Value *V) { - return S.isCopyableElement(V) && !S.isNonSchedulable(V); - })) - return InstructionsState::invalid(); - if (!WithProfitabilityCheck) return S; // Check if it is profitable to vectorize the instruction. 
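// Illustrative sketch (not from the patch): a copyable element V is modeled as
// "V <op> identity" so it can join a bundle whose main opcode it does not
// match. For Add the identity is 0 and for LShr a zero shift amount leaves the
// value unchanged; the real code obtains this constant from
// ConstantExpr::getBinOpIdentity. Standalone illustration; names are
// illustrative only.
#include <cassert>
#include <cstdint>

enum class MainOpcode { Add, LShr };

inline uint64_t applyWithIdentity(MainOpcode Opcode, uint64_t V) {
  switch (Opcode) {
  case MainOpcode::Add:
    return V + 0; // 0 is the identity for add
  case MainOpcode::LShr:
    return V >> 0; // shifting by zero is the identity for lshr
  }
  return V;
}

inline void idempotentValueExample() {
  assert(applyWithIdentity(MainOpcode::Add, 42) == 42);
  assert(applyWithIdentity(MainOpcode::LShr, 42) == 42);
}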
@@ -10247,6 +10874,21 @@ public: } if (!Res) return InstructionsState::invalid(); + constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput; + InstructionCost ScalarCost = TTI.getInstructionCost(S.getMainOp(), Kind); + InstructionCost VectorCost; + FixedVectorType *VecTy = + getWidenedType(S.getMainOp()->getType(), VL.size()); + switch (MainOpcode) { + case Instruction::Add: + case Instruction::LShr: + VectorCost = TTI.getArithmeticInstrCost(MainOpcode, VecTy, Kind); + break; + default: + llvm_unreachable("Unexpected instruction."); + } + if (VectorCost > ScalarCost) + return InstructionsState::invalid(); return S; } assert(Operands.size() == 2 && "Unexpected number of operands!"); @@ -10731,7 +11373,7 @@ void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth, SetVector<Value *> UniqueValues(llvm::from_range, VL); std::optional<ScheduleBundle *> BundlePtr = - BS.tryScheduleBundle(UniqueValues.getArrayRef(), this, S); + BS.tryScheduleBundle(UniqueValues.getArrayRef(), this, S, UserTreeIdx); #ifdef EXPENSIVE_CHECKS // Make sure we didn't break any internal invariants BS.verify(); @@ -11991,6 +12633,8 @@ void BoUpSLP::reorderGatherNode(TreeEntry &TE) { } } +/// Check if we can convert fadd/fsub sequence to FMAD. +/// \returns Cost of the FMAD, if conversion is possible, invalid cost otherwise. static InstructionCost canConvertToFMA(ArrayRef<Value *> VL, const InstructionsState &S, DominatorTree &DT, const DataLayout &DL, @@ -12010,7 +12654,8 @@ static InstructionCost canConvertToFMA(ArrayRef<Value *> VL, auto *I = dyn_cast<Instruction>(V); if (!I) continue; - // TODO: support for copyable elements. + if (S.isCopyableElement(I)) + continue; Instruction *MatchingI = S.getMatchingMainOpOrAltOp(I); if (S.getMainOp() != MatchingI && S.getAltOp() != MatchingI) continue; @@ -12028,6 +12673,7 @@ static InstructionCost canConvertToFMA(ArrayRef<Value *> VL, InstructionsState OpS = getSameOpcode(Operands.front(), TLI); if (!OpS.valid()) return InstructionCost::getInvalid(); + if (OpS.isAltShuffle() || OpS.getOpcode() != Instruction::FMul) return InstructionCost::getInvalid(); if (!CheckForContractable(Operands.front())) @@ -12042,15 +12688,19 @@ static InstructionCost canConvertToFMA(ArrayRef<Value *> VL, auto *I = dyn_cast<Instruction>(V); if (!I) continue; - if (auto *FPCI = dyn_cast<FPMathOperator>(I)) - FMF &= FPCI->getFastMathFlags(); + if (!S.isCopyableElement(I)) + if (auto *FPCI = dyn_cast<FPMathOperator>(I)) + FMF &= FPCI->getFastMathFlags(); FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind); } unsigned NumOps = 0; for (auto [V, Op] : zip(VL, Operands.front())) { + if (S.isCopyableElement(V)) + continue; auto *I = dyn_cast<Instruction>(Op); - if (!I || !I->hasOneUse()) { - FMACost += TTI.getInstructionCost(cast<Instruction>(V), CostKind); + if (!I || !I->hasOneUse() || OpS.isCopyableElement(I)) { + if (auto *OpI = dyn_cast<Instruction>(V)) + FMACost += TTI.getInstructionCost(OpI, CostKind); if (I) FMACost += TTI.getInstructionCost(I, CostKind); continue; @@ -14687,6 +15337,31 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const { }))))) return true; + // If the tree contains only buildvector, 2 non-buildvectors (with root user + // tree node) and other buildvectors, we can skip it. 
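// Illustrative sketch (not from the patch): canConvertToFMA above asks whether
// an fmul feeding an fadd/fsub chain may be contracted into fused multiply-adds
// and compares the TTI costs of both forms; copyable elements are now skipped
// when the fast-math flags are gathered. A standalone scalar view of the
// contraction itself (the real decision is the cost comparison, not shown):
#include <cmath>

inline double mulAddSeparate(double A, double B, double C) {
  return A * B + C; // two operations: fmul followed by fadd
}

inline double mulAddFused(double A, double B, double C) {
  return std::fma(A, B, C); // one fused operation with a single rounding
}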
+ if (!ForReduction && SLPCostThreshold.getNumOccurrences() && + VectorizableTree.front()->State == TreeEntry::SplitVectorize && + VectorizableTree.size() >= Limit && + count_if(ArrayRef(VectorizableTree).drop_front(), + [&](const std::unique_ptr<TreeEntry> &TE) { + return !TE->isGather() && TE->UserTreeIndex.UserTE && + TE->UserTreeIndex.UserTE->Idx == 0; + }) == 2) + return true; + + // If the tree contains only vectorization of the phi node from the + // buildvector - skip it. + if (!ForReduction && SLPCostThreshold.getNumOccurrences() && + VectorizableTree.size() > 2 && + VectorizableTree.front()->State == TreeEntry::Vectorize && + VectorizableTree.front()->getOpcode() == Instruction::InsertElement && + VectorizableTree[1]->State == TreeEntry::Vectorize && + VectorizableTree[1]->getOpcode() == Instruction::PHI && + all_of( + ArrayRef(VectorizableTree).drop_front(2), + [&](const std::unique_ptr<TreeEntry> &TE) { return TE->isGather(); })) + return true; + // We can vectorize the tree if its size is greater than or equal to the // minimum size specified by the MinTreeSize command line option. if (VectorizableTree.size() >= MinTreeSize) @@ -19234,7 +19909,7 @@ Value *BoUpSLP::vectorizeTree( EntryToLastInstruction.clear(); // All blocks must be scheduled before any instructions are inserted. for (auto &BSIter : BlocksSchedules) - scheduleBlock(BSIter.second.get()); + scheduleBlock(*this, BSIter.second.get()); // Cache last instructions for the nodes to avoid side effects, which may // appear during vectorization, like extra uses, etc. for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) { @@ -20041,24 +20716,29 @@ void BoUpSLP::optimizeGatherSequence() { GatherShuffleExtractSeq.clear(); } -BoUpSLP::ScheduleBundle & -BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL, - const InstructionsState &S) { +BoUpSLP::ScheduleBundle &BoUpSLP::BlockScheduling::buildBundle( + ArrayRef<Value *> VL, const InstructionsState &S, const EdgeInfo &EI) { auto &BundlePtr = ScheduledBundlesList.emplace_back(std::make_unique<ScheduleBundle>()); for (Value *V : VL) { - if (doesNotNeedToBeScheduled(V)) + if (S.isNonSchedulable(V)) continue; - if (S.isCopyableElement(V)) + auto *I = cast<Instruction>(V); + if (S.isCopyableElement(V)) { + // Add a copyable element model. + ScheduleCopyableData &SD = + addScheduleCopyableData(EI, I, SchedulingRegionID, *BundlePtr); + // Group the instructions to a bundle. + BundlePtr->add(&SD); continue; + } ScheduleData *BundleMember = getScheduleData(V); assert(BundleMember && "no ScheduleData for bundle member " "(maybe not in same basic block)"); // Group the instructions to a bundle. BundlePtr->add(BundleMember); - ScheduledBundles.try_emplace(cast<Instruction>(V)) - .first->getSecond() - .push_back(BundlePtr.get()); + ScheduledBundles.try_emplace(I).first->getSecond().push_back( + BundlePtr.get()); } assert(BundlePtr && *BundlePtr && "Failed to find schedule bundle"); return *BundlePtr; @@ -20068,7 +20748,8 @@ BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL, // and schedules instructions until the bundle gets ready. std::optional<BoUpSLP::ScheduleBundle *> BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, - const InstructionsState &S) { + const InstructionsState &S, + const EdgeInfo &EI) { // No need to schedule PHIs, insertelement, extractelement and extractvalue // instructions. 
bool HasCopyables = S.areInstructionsWithCopyableElements(); @@ -20078,33 +20759,83 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); })) return nullptr; - // TODO Remove once full support for copyables is landed. - assert(all_of(VL, - [&](Value *V) { - return !S.isCopyableElement(V) || S.isNonSchedulable(V); - }) && - "Copyable elements should not be schedulable"); // Initialize the instruction bundle. Instruction *OldScheduleEnd = ScheduleEnd; LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.getMainOp() << "\n"); auto TryScheduleBundleImpl = [=](bool ReSchedule, ScheduleBundle &Bundle) { + // Clear deps or recalculate the region, if the memory instruction is a + // copyable. It may have memory deps, which must be recalculated. + SmallVector<ScheduleData *> ControlDependentMembers; + auto CheckIfNeedToClearDeps = [&](ScheduleBundle &Bundle) { + SmallDenseMap<std::pair<Instruction *, Value *>, unsigned> UserOpToNumOps; + for (ScheduleEntity *SE : Bundle.getBundle()) { + if (ScheduleCopyableData *SD = dyn_cast<ScheduleCopyableData>(SE)) { + if (ScheduleData *BundleMember = getScheduleData(SD->getInst()); + BundleMember && BundleMember->hasValidDependencies()) { + BundleMember->clearDirectDependencies(); + if (RegionHasStackSave || + !isGuaranteedToTransferExecutionToSuccessor( + BundleMember->getInst())) + ControlDependentMembers.push_back(BundleMember); + } + continue; + } + auto *SD = cast<ScheduleData>(SE); + for (const Use &U : SD->getInst()->operands()) { + unsigned &NumOps = + UserOpToNumOps + .try_emplace(std::make_pair(SD->getInst(), U.get()), 0) + .first->getSecond(); + ++NumOps; + if (auto *Op = dyn_cast<Instruction>(U.get()); + Op && areAllOperandsReplacedByCopyableData(SD->getInst(), Op, + *SLP, NumOps)) { + if (ScheduleData *OpSD = getScheduleData(Op)) { + OpSD->clearDirectDependencies(); + if (RegionHasStackSave || + !isGuaranteedToTransferExecutionToSuccessor(OpSD->getInst())) + ControlDependentMembers.push_back(OpSD); + } + } + } + } + }; // The scheduling region got new instructions at the lower end (or it is a // new region for the first bundle). This makes it necessary to // recalculate all dependencies. // It is seldom that this needs to be done a second time after adding the // initial bundle to the region. if (OldScheduleEnd && ScheduleEnd != OldScheduleEnd) { - for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { - if (ScheduleData *SD = getScheduleData(I)) + for_each(ScheduleDataMap, [&](auto &P) { + if (BB != P.first->getParent()) + return; + ScheduleData *SD = P.second; + if (isInSchedulingRegion(*SD)) SD->clearDependencies(); - } + }); + for_each(ScheduleCopyableDataMapByInst, [&](auto &P) { + for_each(P.second, [&](ScheduleCopyableData *SD) { + if (isInSchedulingRegion(*SD)) + SD->clearDependencies(); + }); + }); ReSchedule = true; } + // Check if the bundle data has deps for copyable elements already. In + // this case need to reset deps and recalculate it. 
if (Bundle && !Bundle.getBundle().empty()) { + if (S.areInstructionsWithCopyableElements() || + !ScheduleCopyableDataMap.empty()) + CheckIfNeedToClearDeps(Bundle); LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << Bundle << " in block " << BB->getName() << "\n"); - calculateDependencies(Bundle, /*InsertInReadyList=*/!ReSchedule, SLP); + calculateDependencies(Bundle, /*InsertInReadyList=*/!ReSchedule, SLP, + ControlDependentMembers); + } else if (!ControlDependentMembers.empty()) { + ScheduleBundle Invalid = ScheduleBundle::invalid(); + calculateDependencies(Invalid, /*InsertInReadyList=*/!ReSchedule, SLP, + ControlDependentMembers); } if (ReSchedule) { @@ -20120,7 +20851,7 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, !ReadyInsts.empty()) { ScheduleEntity *Picked = ReadyInsts.pop_back_val(); assert(Picked->isReady() && "must be ready to schedule"); - schedule(Picked, ReadyInsts); + schedule(*SLP, S, EI, Picked, ReadyInsts); if (Picked == &Bundle) break; } @@ -20129,7 +20860,7 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, // Make sure that the scheduling region contains all // instructions of the bundle. for (Value *V : VL) { - if (doesNotNeedToBeScheduled(V) || S.isCopyableElement(V)) + if (S.isNonSchedulable(V)) continue; if (!extendSchedulingRegion(V, S)) { // If the scheduling region got new instructions at the lower end (or it @@ -20146,11 +20877,19 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, bool ReSchedule = false; for (Value *V : VL) { - if (doesNotNeedToBeScheduled(V) || S.isCopyableElement(V)) + if (S.isNonSchedulable(V)) continue; + SmallVector<ScheduleCopyableData *> CopyableData = + getScheduleCopyableData(cast<Instruction>(V)); + if (!CopyableData.empty()) { + for (ScheduleCopyableData *SD : CopyableData) + ReadyInsts.remove(SD); + } ScheduleData *BundleMember = getScheduleData(V); - assert(BundleMember && + assert((BundleMember || S.isCopyableElement(V)) && "no ScheduleData for bundle member (maybe not in same basic block)"); + if (!BundleMember) + continue; // Make sure we don't leave the pieces of the bundle in the ready list when // whole bundle might not be ready. @@ -20161,20 +20900,25 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, ReadyInsts.remove(B); } - if (!BundleMember->isScheduled()) + if (!S.isCopyableElement(V) && !BundleMember->isScheduled()) continue; // A bundle member was scheduled as single instruction before and now // needs to be scheduled as part of the bundle. We just get rid of the // existing schedule. + // A bundle member has deps calculated before it was copyable element - need + // to reschedule. LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember << " was already scheduled\n"); ReSchedule = true; } - ScheduleBundle &Bundle = buildBundle(VL, S); + ScheduleBundle &Bundle = buildBundle(VL, S, EI); TryScheduleBundleImpl(ReSchedule, Bundle); if (!Bundle.isReady()) { - for (ScheduleData *BD : Bundle.getBundle()) { + for (ScheduleEntity *BD : Bundle.getBundle()) { + // Copyable data scheduling is just removed. 
+ if (isa<ScheduleCopyableData>(BD)) + continue; if (BD->isReady()) { ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(BD->getInst()); if (Bundles.empty()) { @@ -20187,10 +20931,66 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, } } ScheduledBundlesList.pop_back(); + SmallVector<ScheduleData *> ControlDependentMembers; + SmallPtrSet<Instruction *, 4> Visited; for (Value *V : VL) { - if (doesNotNeedToBeScheduled(V) || S.isCopyableElement(V)) + if (S.isNonSchedulable(V)) continue; - ScheduledBundles.find(cast<Instruction>(V))->getSecond().pop_back(); + auto *I = cast<Instruction>(V); + if (S.isCopyableElement(I)) { + // Remove the copyable data from the scheduling region and restore + // previous mappings. + auto KV = std::make_pair(EI, I); + assert(ScheduleCopyableDataMap.contains(KV) && + "no ScheduleCopyableData for copyable element"); + ScheduleCopyableData *SD = + ScheduleCopyableDataMapByInst.find(I)->getSecond().pop_back_val(); + ScheduleCopyableDataMapByUsers[I].remove(SD); + if (EI.UserTE) { + ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx); + const auto *It = find(Op, I); + assert(It != Op.end() && "Lane not set"); + SmallPtrSet<Instruction *, 4> Visited; + do { + int Lane = std::distance(Op.begin(), It); + assert(Lane >= 0 && "Lane not set"); + if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) && + !EI.UserTE->ReorderIndices.empty()) + Lane = EI.UserTE->ReorderIndices[Lane]; + assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) && + "Couldn't find extract lane"); + auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]); + if (!Visited.insert(In).second) { + It = find(make_range(std::next(It), Op.end()), I); + break; + } + ScheduleCopyableDataMapByInstUser + [std::make_pair(std::make_pair(In, EI.EdgeIdx), I)] + .pop_back(); + It = find(make_range(std::next(It), Op.end()), I); + } while (It != Op.end()); + EdgeInfo UserEI = EI.UserTE->UserTreeIndex; + if (ScheduleCopyableData *UserCD = getScheduleCopyableData(UserEI, I)) + ScheduleCopyableDataMapByUsers[I].insert(UserCD); + } + if (ScheduleCopyableDataMapByUsers[I].empty()) + ScheduleCopyableDataMapByUsers.erase(I); + ScheduleCopyableDataMap.erase(KV); + // Need to recalculate dependencies for the actual schedule data. + if (ScheduleData *OpSD = getScheduleData(I)) { + OpSD->clearDirectDependencies(); + if (RegionHasStackSave || + !isGuaranteedToTransferExecutionToSuccessor(OpSD->getInst())) + ControlDependentMembers.push_back(OpSD); + } + continue; + } + ScheduledBundles.find(I)->getSecond().pop_back(); + } + if (!ControlDependentMembers.empty()) { + ScheduleBundle Invalid = ScheduleBundle::invalid(); + calculateDependencies(Invalid, /*InsertInReadyList=*/false, SLP, + ControlDependentMembers); } return std::nullopt; } @@ -20210,10 +21010,6 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion( Value *V, const InstructionsState &S) { Instruction *I = dyn_cast<Instruction>(V); assert(I && "bundle member must be an instruction"); - assert(!isa<PHINode>(I) && !isVectorLikeInstWithConstOps(I) && - !doesNotNeedToBeScheduled(I) && - "phi nodes/insertelements/extractelements/extractvalues don't need to " - "be scheduled"); if (getScheduleData(I)) return true; if (!ScheduleStart) { @@ -20283,14 +21079,14 @@ void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI, ScheduleData *CurrentLoadStore = PrevLoadStore; for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) { // No need to allocate data for non-schedulable instructions. 
- if (doesNotNeedToBeScheduled(I)) + if (isa<PHINode>(I)) continue; ScheduleData *SD = ScheduleDataMap.lookup(I); if (!SD) { SD = allocateScheduleDataChunks(); ScheduleDataMap[I] = SD; } - assert(!isInSchedulingRegion(SD) && + assert(!isInSchedulingRegion(*SD) && "new ScheduleData already in scheduling region"); SD->init(SchedulingRegionID, I); @@ -20320,34 +21116,128 @@ void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI, } } -void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleBundle &Bundle, - bool InsertInReadyList, - BoUpSLP *SLP) { - SmallVector<ScheduleData *> WorkList; - auto ProcessNode = [&](ScheduleData *BundleMember) { +void BoUpSLP::BlockScheduling::calculateDependencies( + ScheduleBundle &Bundle, bool InsertInReadyList, BoUpSLP *SLP, + ArrayRef<ScheduleData *> ControlDeps) { + SmallVector<ScheduleEntity *> WorkList; + auto ProcessNode = [&](ScheduleEntity *SE) { + if (auto *CD = dyn_cast<ScheduleCopyableData>(SE)) { + if (CD->hasValidDependencies()) + return; + LLVM_DEBUG(dbgs() << "SLP: update deps of " << *CD << "\n"); + CD->initDependencies(); + CD->resetUnscheduledDeps(); + const EdgeInfo &EI = CD->getEdgeInfo(); + if (EI.UserTE) { + ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx); + const auto *It = find(Op, CD->getInst()); + assert(It != Op.end() && "Lane not set"); + SmallPtrSet<Instruction *, 4> Visited; + do { + int Lane = std::distance(Op.begin(), It); + assert(Lane >= 0 && "Lane not set"); + if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) && + !EI.UserTE->ReorderIndices.empty()) + Lane = EI.UserTE->ReorderIndices[Lane]; + assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) && + "Couldn't find extract lane"); + auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]); + if (EI.UserTE->isCopyableElement(In)) { + // We may not have related copyable scheduling data, if the + // instruction is non-schedulable. + if (ScheduleCopyableData *UseSD = + getScheduleCopyableData(EI.UserTE->UserTreeIndex, In)) { + CD->incDependencies(); + if (!UseSD->isScheduled()) + CD->incrementUnscheduledDeps(1); + if (!UseSD->hasValidDependencies() || + (InsertInReadyList && UseSD->isReady())) + WorkList.push_back(UseSD); + } + } else if (Visited.insert(In).second) { + if (ScheduleData *UseSD = getScheduleData(In)) { + CD->incDependencies(); + if (!UseSD->isScheduled()) + CD->incrementUnscheduledDeps(1); + if (!UseSD->hasValidDependencies() || + (InsertInReadyList && UseSD->isReady())) + WorkList.push_back(UseSD); + } + } + It = find(make_range(std::next(It), Op.end()), CD->getInst()); + } while (It != Op.end()); + if (CD->isReady() && CD->getDependencies() == 0 && + (EI.UserTE->hasState() && + (EI.UserTE->getMainOp()->getParent() != + CD->getInst()->getParent() || + (isa<PHINode>(EI.UserTE->getMainOp()) && + (EI.UserTE->getMainOp()->hasNUsesOrMore(UsesLimit) || + any_of(EI.UserTE->getMainOp()->users(), [&](User *U) { + auto *IU = dyn_cast<Instruction>(U); + if (!IU) + return true; + return IU->getParent() == EI.UserTE->getMainOp()->getParent(); + })))))) { + // If there are no uses in the block, mark as having a pseudo-use, which cannot + // be scheduled. + // Prevents incorrect def-use tracking between external user and + // actual instruction.
+ CD->incDependencies(); + CD->incrementUnscheduledDeps(1); + } + } + return; + } + auto *BundleMember = cast<ScheduleData>(SE); if (BundleMember->hasValidDependencies()) return; LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n"); BundleMember->initDependencies(); BundleMember->resetUnscheduledDeps(); // Handle def-use chain dependencies. + SmallDenseMap<Value *, unsigned> UserToNumOps; for (User *U : BundleMember->getInst()->users()) { + if (isa<PHINode>(U)) + continue; if (ScheduleData *UseSD = getScheduleData(U)) { + // The operand is a copyable element - skip. + unsigned &NumOps = UserToNumOps.try_emplace(U, 0).first->getSecond(); + ++NumOps; + if (areAllOperandsReplacedByCopyableData( + cast<Instruction>(U), BundleMember->getInst(), *SLP, NumOps)) + continue; BundleMember->incDependencies(); if (!UseSD->isScheduled()) BundleMember->incrementUnscheduledDeps(1); - WorkList.push_back(UseSD); + if (!UseSD->hasValidDependencies() || + (InsertInReadyList && UseSD->isReady())) + WorkList.push_back(UseSD); } } + for (ScheduleCopyableData *UseSD : + getScheduleCopyableDataUsers(BundleMember->getInst())) { + BundleMember->incDependencies(); + if (!UseSD->isScheduled()) + BundleMember->incrementUnscheduledDeps(1); + if (!UseSD->hasValidDependencies() || + (InsertInReadyList && UseSD->isReady())) + WorkList.push_back(UseSD); + } + SmallPtrSet<const Instruction *, 4> Visited; auto MakeControlDependent = [&](Instruction *I) { + // Do not mark control dependent twice. + if (!Visited.insert(I).second) + return; auto *DepDest = getScheduleData(I); assert(DepDest && "must be in schedule window"); DepDest->addControlDependency(BundleMember); BundleMember->incDependencies(); if (!DepDest->isScheduled()) BundleMember->incrementUnscheduledDeps(1); - WorkList.push_back(DepDest); + if (!DepDest->hasValidDependencies() || + (InsertInReadyList && DepDest->isReady())) + WorkList.push_back(DepDest); }; // Any instruction which isn't safe to speculate at the beginning of the @@ -20426,7 +21316,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleBundle &Bundle, for (ScheduleData *DepDest = NextLoadStore; DepDest; DepDest = DepDest->getNextLoadStore()) { - assert(isInSchedulingRegion(DepDest) && "Expected to be in region"); + assert(isInSchedulingRegion(*DepDest) && "Expected to be in region"); // We have two limits to reduce the complexity: // 1) AliasedCheckLimit: It's a small limit to reduce calls to @@ -20449,7 +21339,9 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleBundle &Bundle, BundleMember->incDependencies(); if (!DepDest->isScheduled()) BundleMember->incrementUnscheduledDeps(1); - WorkList.push_back(DepDest); + if (!DepDest->hasValidDependencies() || + (InsertInReadyList && DepDest->isReady())) + WorkList.push_back(DepDest); } // Example, explaining the loop break condition: Let's assume our @@ -20471,13 +21363,25 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleBundle &Bundle, } }; - WorkList.push_back(Bundle.getBundle().front()); + assert((Bundle || !ControlDeps.empty()) && + "expected at least one instruction to schedule"); + if (Bundle) + WorkList.push_back(Bundle.getBundle().front()); + WorkList.append(ControlDeps.begin(), ControlDeps.end()); SmallPtrSet<ScheduleBundle *, 16> Visited; while (!WorkList.empty()) { - ScheduleData *SD = WorkList.pop_back_val(); - ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(SD->getInst()); + ScheduleEntity *SD = WorkList.pop_back_val(); + SmallVector<ScheduleBundle *, 1> CopyableBundle; + 
ArrayRef<ScheduleBundle *> Bundles; + if (auto *CD = dyn_cast<ScheduleCopyableData>(SD)) { + CopyableBundle.push_back(&CD->getBundle()); + Bundles = CopyableBundle; + } else { + Bundles = getScheduleBundles(SD->getInst()); + } if (Bundles.empty()) { - ProcessNode(SD); + if (!SD->hasValidDependencies()) + ProcessNode(SD); if (InsertInReadyList && SD->isReady()) { ReadyInsts.insert(SD); LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD << "\n"); @@ -20485,7 +21389,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleBundle &Bundle, continue; } for (ScheduleBundle *Bundle : Bundles) { - if (!Visited.insert(Bundle).second || Bundle->hasValidDependencies()) + if (Bundle->hasValidDependencies() || !Visited.insert(Bundle).second) continue; assert(isInSchedulingRegion(*Bundle) && "ScheduleData not in scheduling region"); @@ -20508,23 +21412,40 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleBundle &Bundle, void BoUpSLP::BlockScheduling::resetSchedule() { assert(ScheduleStart && "tried to reset schedule on block which has not been scheduled"); - for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { - if (ScheduleData *SD = getScheduleData(I)) { - assert(isInSchedulingRegion(SD) && - "ScheduleData not in scheduling region"); + for_each(ScheduleDataMap, [&](auto &P) { + if (BB != P.first->getParent()) + return; + ScheduleData *SD = P.second; + if (isInSchedulingRegion(*SD)) { SD->setScheduled(/*Scheduled=*/false); SD->resetUnscheduledDeps(); } - for (ScheduleBundle *Bundle : getScheduleBundles(I)) { - assert(isInSchedulingRegion(*Bundle) && - "ScheduleBundle not in scheduling region"); - Bundle->setScheduled(/*Scheduled=*/false); + }); + for_each(ScheduleCopyableDataMapByInst, [&](auto &P) { + for_each(P.second, [&](ScheduleCopyableData *SD) { + if (isInSchedulingRegion(*SD)) { + SD->setScheduled(/*Scheduled=*/false); + SD->resetUnscheduledDeps(); + } + }); + }); + for_each(ScheduledBundles, [&](auto &P) { + for_each(P.second, [&](ScheduleBundle *Bundle) { + if (isInSchedulingRegion(*Bundle)) + Bundle->setScheduled(/*Scheduled=*/false); + }); + }); + // Reset schedule data for copyable elements. 
+ for (auto &P : ScheduleCopyableDataMap) { + if (isInSchedulingRegion(*P.second)) { + P.second->setScheduled(/*Scheduled=*/false); + P.second->resetUnscheduledDeps(); } } ReadyInsts.clear(); } -void BoUpSLP::scheduleBlock(BlockScheduling *BS) { +void BoUpSLP::scheduleBlock(const BoUpSLP &R, BlockScheduling *BS) { if (!BS->ScheduleStart) return; @@ -20562,15 +21483,45 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) { if (!Bundle->hasValidDependencies()) BS->calculateDependencies(*Bundle, /*InsertInReadyList=*/false, this); } + SmallVector<ScheduleCopyableData *> SDs = BS->getScheduleCopyableData(I); + for (ScheduleCopyableData *SD : reverse(SDs)) { + ScheduleBundle &Bundle = SD->getBundle(); + Bundle.setSchedulingPriority(Idx++); + if (!Bundle.hasValidDependencies()) + BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this); + } continue; } + SmallVector<ScheduleCopyableData *> CopyableData = + BS->getScheduleCopyableDataUsers(I); if (ScheduleData *SD = BS->getScheduleData(I)) { [[maybe_unused]] ArrayRef<TreeEntry *> SDTEs = getTreeEntries(I); assert((isVectorLikeInstWithConstOps(SD->getInst()) || SDTEs.empty() || - SDTEs.front()->doesNotNeedToSchedule()) && + SDTEs.front()->doesNotNeedToSchedule() || + doesNotNeedToBeScheduled(I)) && "scheduler and vectorizer bundle mismatch"); SD->setSchedulingPriority(Idx++); - continue; + if (!SD->hasValidDependencies() && + (!CopyableData.empty() || + any_of(R.ValueToGatherNodes.lookup(I), [&](const TreeEntry *TE) { + assert(TE->isGather() && "expected gather node"); + return TE->hasState() && TE->hasCopyableElements() && + TE->isCopyableElement(I); + }))) { + // Need to calculate deps for these nodes to correctly handle copyable + // dependencies, even if they were cancelled. + // If the copyables bundle was cancelled, the deps are cleared and need to + // be recalculated. + ScheduleBundle Bundle; + Bundle.add(SD); + BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this); + } + } + for (ScheduleCopyableData *SD : reverse(CopyableData)) { + ScheduleBundle &Bundle = SD->getBundle(); + Bundle.setSchedulingPriority(Idx++); + if (!Bundle.hasValidDependencies()) + BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this); } } BS->initialFillReadyList(ReadyInsts); @@ -20586,9 +21537,12 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) { // Move the scheduled instruction(s) to their dedicated places, if not // there yet. if (auto *Bundle = dyn_cast<ScheduleBundle>(Picked)) { - for (const ScheduleData *BundleMember : Bundle->getBundle()) { + for (const ScheduleEntity *BundleMember : Bundle->getBundle()) { Instruction *PickedInst = BundleMember->getInst(); - if (!Scheduled.insert(PickedInst).second) + // If a copyable must be scheduled as part of something else, skip it. + bool IsCopyable = Bundle->getTreeEntry()->isCopyableElement(PickedInst); + if ((IsCopyable && BS->getScheduleData(PickedInst)) || + (!IsCopyable && !Scheduled.insert(PickedInst).second)) continue; if (PickedInst->getNextNode() != LastScheduledInst) PickedInst->moveAfter(LastScheduledInst->getPrevNode()); @@ -20603,7 +21557,8 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) { PickedInst->moveAfter(LastScheduledInst->getPrevNode()); LastScheduledInst = PickedInst; } - BS->schedule(Picked, ReadyInsts); + auto Invalid = InstructionsState::invalid(); + BS->schedule(R, Invalid, EdgeInfo(), Picked, ReadyInsts); } // Check that we didn't break any of our invariants.
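For context, an illustrative helper (not part of the patch) built on the same ValueTracking query the scheduling changes above rely on: instructions for which isGuaranteedToTransferExecutionToSuccessor() is false are exactly the ones collected into ControlDependentMembers so that their control dependencies can be recomputed after clearDirectDependencies().

#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

// Collect the instructions in BB after which execution is not guaranteed to
// reach the next instruction (possible throw, noreturn call, etc.). These are
// the instructions that force extra control dependencies in the scheduler.
static SmallVector<const Instruction *, 8>
collectControlDependentCandidates(const BasicBlock &BB) {
  SmallVector<const Instruction *, 8> Result;
  for (const Instruction &I : BB)
    if (!isGuaranteedToTransferExecutionToSuccessor(&I))
      Result.push_back(&I);
  return Result;
}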
@@ -20965,9 +21920,11 @@ bool BoUpSLP::collectValuesToDemote( return all_of(E.Scalars, [&](Value *V) { if (isa<PoisonValue>(V)) return true; + APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth); + if (E.isCopyableElement(V)) + return MaskedValueIsZero(V, ShiftedBits, SimplifyQuery(*DL)); auto *I = cast<Instruction>(V); KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL); - APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth); return AmtKnownBits.getMaxValue().ult(BitWidth) && MaskedValueIsZero(I->getOperand(0), ShiftedBits, SimplifyQuery(*DL)); @@ -22729,21 +23686,11 @@ public: /// Try to find a reduction tree. bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root, ScalarEvolution &SE, const DataLayout &DL, - const TargetLibraryInfo &TLI, - DominatorTree &DT, TargetTransformInfo &TTI) { + const TargetLibraryInfo &TLI) { RdxKind = HorizontalReduction::getRdxKind(Root); if (!isVectorizable(RdxKind, Root)) return false; - // FMA reduction root - skip. - auto CheckForFMA = [&](Instruction *I) { - return RdxKind == RecurKind::FAdd && - canConvertToFMA(I, getSameOpcode(I, TLI), DT, DL, TTI, TLI) - .isValid(); - }; - if (CheckForFMA(Root)) - return false; - // Analyze "regular" integer/FP types for reductions - no target-specific // types or pointers. Type *Ty = Root->getType(); @@ -22781,7 +23728,7 @@ public: // Also, do not try to reduce const values, if the operation is not // foldable. if (!EdgeInst || Level > RecursionMaxDepth || - getRdxKind(EdgeInst) != RdxKind || CheckForFMA(EdgeInst) || + getRdxKind(EdgeInst) != RdxKind || IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) || !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) || !isVectorizable(RdxKind, EdgeInst) || @@ -23530,7 +24477,7 @@ public: // correct, replace internal uses with undef, and mark for eventual // deletion. 
#ifndef NDEBUG - SmallSet<Value *, 4> IgnoreSet; + SmallPtrSet<Value *, 4> IgnoreSet; for (ArrayRef<Value *> RdxOps : ReductionOps) IgnoreSet.insert_range(RdxOps); #endif @@ -23843,6 +24790,8 @@ private: case RecurKind::FMinimum: // res = vv break; + case RecurKind::Sub: + case RecurKind::AddChainWithSubs: case RecurKind::Mul: case RecurKind::FMul: case RecurKind::FMulAdd: @@ -23982,6 +24931,8 @@ private: case RecurKind::FMinimum: // res = vv return VectorizedValue; + case RecurKind::Sub: + case RecurKind::AddChainWithSubs: case RecurKind::Mul: case RecurKind::FMul: case RecurKind::FMulAdd: @@ -24086,6 +25037,8 @@ private: auto *Scale = ConstantVector::get(Vals); return Builder.CreateFMul(VectorizedValue, Scale); } + case RecurKind::Sub: + case RecurKind::AddChainWithSubs: case RecurKind::Mul: case RecurKind::FMul: case RecurKind::FMulAdd: @@ -24356,7 +25309,7 @@ bool SLPVectorizerPass::vectorizeHorReduction( if (!isReductionCandidate(Inst)) return nullptr; HorizontalReduction HorRdx; - if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI, *DT, *TTI)) + if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI)) return nullptr; return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC); }; diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp index f32d57f..e414c12 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp @@ -81,6 +81,7 @@ LegalityAnalysis::notVectorizableBasedOnOpcodesAndTypes( case Instruction::Opcode::FPToUI: case Instruction::Opcode::FPToSI: case Instruction::Opcode::FPExt: + case Instruction::Opcode::PtrToAddr: case Instruction::Opcode::PtrToInt: case Instruction::Opcode::IntToPtr: case Instruction::Opcode::SIToFP: diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 73babcc..f972efa 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -246,8 +246,7 @@ VPTransformState::VPTransformState(const TargetTransformInfo *TTI, IRBuilderBase &Builder, VPlan *Plan, Loop *CurrentParentLoop, Type *CanonicalIVTy) : TTI(TTI), VF(VF), CFG(DT), LI(LI), AC(AC), Builder(Builder), Plan(Plan), - CurrentParentLoop(CurrentParentLoop), TypeAnalysis(CanonicalIVTy), - VPDT(*Plan) {} + CurrentParentLoop(CurrentParentLoop), TypeAnalysis(*Plan), VPDT(*Plan) {} Value *VPTransformState::get(const VPValue *Def, const VPLane &Lane) { if (Def->isLiveIn()) @@ -296,27 +295,11 @@ Value *VPTransformState::get(const VPValue *Def, bool NeedsScalar) { if (hasVectorValue(Def)) return Data.VPV2Vector[Def]; - auto GetBroadcastInstrs = [this, Def](Value *V) { - bool SafeToHoist = - !Def->hasDefiningRecipe() || - VPDT.properlyDominates(Def->getDefiningRecipe()->getParent(), - Plan->getVectorPreheader()); - + auto GetBroadcastInstrs = [this](Value *V) { if (VF.isScalar()) return V; - // Place the code for broadcasting invariant variables in the new preheader. - IRBuilder<>::InsertPointGuard Guard(Builder); - if (SafeToHoist) { - BasicBlock *LoopVectorPreHeader = - CFG.VPBB2IRBB[Plan->getVectorPreheader()]; - if (LoopVectorPreHeader) - Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); - } - - // Place the code for broadcasting invariant variables in the new preheader. // Broadcast the scalar into all locations in the vector. 
Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); - return Shuf; }; @@ -372,6 +355,9 @@ Value *VPTransformState::get(const VPValue *Def, bool NeedsScalar) { set(Def, VectorValue); } else { assert(!VF.isScalable() && "VF is assumed to be non scalable."); + assert(isa<VPInstruction>(Def) && + "Explicit BuildVector recipes must have" + "handled packing for non-VPInstructions."); // Initialize packing with insertelements to start from poison. VectorValue = PoisonValue::get(toVectorizedTy(LastInst->getType(), VF)); for (unsigned Lane = 0; Lane < VF.getFixedValue(); ++Lane) @@ -951,28 +937,6 @@ VPlan::~VPlan() { delete BackedgeTakenCount; } -void VPlan::prepareToExecute(Value *VectorTripCountV, VPTransformState &State) { - if (!VectorTripCount.getUnderlyingValue()) - VectorTripCount.setUnderlyingValue(VectorTripCountV); - else - assert(VectorTripCount.getUnderlyingValue() == VectorTripCountV && - "VectorTripCount set earlier must much VectorTripCountV"); - - IRBuilder<> Builder(State.CFG.PrevBB->getTerminator()); - Type *TCTy = VectorTripCountV->getType(); - // FIXME: Model VF * UF computation completely in VPlan. - unsigned UF = getUF(); - if (VF.getNumUsers()) { - Value *RuntimeVF = getRuntimeVF(Builder, TCTy, State.VF); - VF.setUnderlyingValue(RuntimeVF); - VFxUF.setUnderlyingValue( - UF > 1 ? Builder.CreateMul(RuntimeVF, ConstantInt::get(TCTy, UF)) - : RuntimeVF); - } else { - VFxUF.setUnderlyingValue(createStepForVF(Builder, TCTy, State.VF, UF)); - } -} - VPIRBasicBlock *VPlan::getExitBlock(BasicBlock *IRBB) const { auto Iter = find_if(getExitBlocks(), [IRBB](const VPIRBasicBlock *VPIRBB) { return VPIRBB->getIRBasicBlock() == IRBB; diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index c42cdd5..46e55be 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1019,7 +1019,11 @@ public: /// The lane specifies an index into a vector formed by combining all vector /// operands (all operands after the first one). ExtractLane, - + /// Explicit user for the resume phi of the canonical induction in the main + /// VPlan, used by the epilogue vector loop. + ResumeForEpilogue, + /// Returns the value for vscale. + VScale, }; private: @@ -1167,6 +1171,7 @@ public: switch (VPI->getOpcode()) { case VPInstruction::WideIVStep: case VPInstruction::StepVector: + case VPInstruction::VScale: return true; default: return false; @@ -1227,6 +1232,31 @@ public: return getAsRecipe()->getNumOperands(); } + /// Returns an interator range over the incoming values. + VPUser::const_operand_range incoming_values() const { + return make_range(getAsRecipe()->op_begin(), + getAsRecipe()->op_begin() + getNumIncoming()); + } + + using const_incoming_blocks_range = iterator_range<mapped_iterator< + detail::index_iterator, std::function<const VPBasicBlock *(size_t)>>>; + + /// Returns an iterator range over the incoming blocks. + const_incoming_blocks_range incoming_blocks() const { + std::function<const VPBasicBlock *(size_t)> GetBlock = [this](size_t Idx) { + return getIncomingBlock(Idx); + }; + return map_range(index_range(0, getNumIncoming()), GetBlock); + } + + /// Returns an iterator range over pairs of incoming values and corresponding + /// incoming blocks. 
+ detail::zippy<llvm::detail::zip_first, VPUser::const_operand_range, + const_incoming_blocks_range> + incoming_values_and_blocks() const { + return zip_equal(incoming_values(), incoming_blocks()); + } + /// Removes the incoming value for \p IncomingBlock, which must be a /// predecessor. void removeIncomingValueFor(VPBlockBase *IncomingBlock) const; @@ -2298,6 +2328,11 @@ public: VPSlotTracker &SlotTracker) const override; #endif + /// Returns the number of incoming values, also number of incoming blocks. + /// Note that at the moment, VPWidenPointerInductionRecipe only has a single + /// incoming value, its start value. + unsigned getNumIncoming() const override { return 2; } + /// Returns the recurrence kind of the reduction. RecurKind getRecurrenceKind() const { return Kind; } @@ -2408,11 +2443,11 @@ public: // TODO: extend the masked interleaved-group support to reversed access. assert((!Mask || !IG->isReverse()) && "Reversed masked interleave-group not supported."); - for (unsigned i = 0; i < IG->getFactor(); ++i) - if (Instruction *I = IG->getMember(i)) { - if (I->getType()->isVoidTy()) + for (unsigned I = 0; I < IG->getFactor(); ++I) + if (Instruction *Inst = IG->getMember(I)) { + if (Inst->getType()->isVoidTy()) continue; - new VPValue(I, this); + new VPValue(Inst, this); } for (auto *SV : StoredValues) @@ -3076,10 +3111,11 @@ struct LLVM_ABI_FOR_TEST VPWidenLoadRecipe final : public VPWidenMemoryRecipe, /// using the address to load from, the explicit vector length and an optional /// mask. struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue { - VPWidenLoadEVLRecipe(VPWidenLoadRecipe &L, VPValue &EVL, VPValue *Mask) + VPWidenLoadEVLRecipe(VPWidenLoadRecipe &L, VPValue *Addr, VPValue &EVL, + VPValue *Mask) : VPWidenMemoryRecipe(VPDef::VPWidenLoadEVLSC, L.getIngredient(), - {L.getAddr(), &EVL}, L.isConsecutive(), - L.isReverse(), L, L.getDebugLoc()), + {Addr, &EVL}, L.isConsecutive(), L.isReverse(), L, + L.getDebugLoc()), VPValue(this, &getIngredient()) { setMask(Mask); } @@ -3157,11 +3193,11 @@ struct LLVM_ABI_FOR_TEST VPWidenStoreRecipe final : public VPWidenMemoryRecipe { /// using the value to store, the address to store to, the explicit vector /// length and an optional mask. struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe { - VPWidenStoreEVLRecipe(VPWidenStoreRecipe &S, VPValue &EVL, VPValue *Mask) + VPWidenStoreEVLRecipe(VPWidenStoreRecipe &S, VPValue *Addr, VPValue &EVL, + VPValue *Mask) : VPWidenMemoryRecipe(VPDef::VPWidenStoreEVLSC, S.getIngredient(), - {S.getAddr(), S.getStoredValue(), &EVL}, - S.isConsecutive(), S.isReverse(), S, - S.getDebugLoc()) { + {Addr, S.getStoredValue(), &EVL}, S.isConsecutive(), + S.isReverse(), S, S.getDebugLoc()) { setMask(Mask); } @@ -3968,9 +4004,6 @@ public: VPBB->setPlan(this); } - /// Prepare the plan for execution, setting up the required live-in values. - void prepareToExecute(Value *VectorTripCount, VPTransformState &State); - /// Generate the IR code for this VPlan. 
void execute(VPTransformState *State); diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index 4c3cdda..b39231f 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -21,8 +21,7 @@ using namespace llvm; #define DEBUG_TYPE "vplan" -VPTypeAnalysis::VPTypeAnalysis(const VPlan &Plan) - : Ctx(Plan.getScalarHeader()->getIRBasicBlock()->getContext()) { +VPTypeAnalysis::VPTypeAnalysis(const VPlan &Plan) : Ctx(Plan.getContext()) { if (auto LoopRegion = Plan.getVectorLoopRegion()) { if (const auto *CanIV = dyn_cast<VPCanonicalIVPHIRecipe>( &LoopRegion->getEntryBasicBlock()->front())) { @@ -74,6 +73,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) { case Instruction::ExtractElement: case Instruction::Freeze: case VPInstruction::ReductionStartVector: + case VPInstruction::ResumeForEpilogue: return inferScalarType(R->getOperand(0)); case Instruction::Select: { Type *ResTy = inferScalarType(R->getOperand(1)); @@ -500,7 +500,7 @@ SmallVector<VPRegisterUsage, 8> llvm::calculateRegisterUsageForPlan( LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); - VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType()); + VPTypeAnalysis TypeInfo(Plan); const auto &TTICapture = TTI; auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned { diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h index cd86d27..c6c4369 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h @@ -58,9 +58,6 @@ class VPTypeAnalysis { Type *inferScalarTypeForRecipe(const VPReplicateRecipe *R); public: - VPTypeAnalysis(Type *CanonicalIVTy) - : CanonicalIVTy(CanonicalIVTy), Ctx(CanonicalIVTy->getContext()) {} - VPTypeAnalysis(const VPlan &Plan); /// Infer the type of \p V. Returns the scalar type of \p V. diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp index 7e8eff31..b231a84 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp @@ -336,12 +336,6 @@ std::unique_ptr<VPlan> PlainCFGBuilder::buildPlainCFG() { return std::move(Plan); } -std::unique_ptr<VPlan> VPlanTransforms::buildPlainCFG(Loop *TheLoop, - LoopInfo &LI) { - PlainCFGBuilder Builder(TheLoop, &LI); - return Builder.buildPlainCFG(); -} - /// Checks if \p HeaderVPB is a loop header block in the plain CFG; that is, it /// has exactly 2 predecessors (preheader and latch), where the block /// dominates the latch and the preheader dominates the block. If it is a @@ -457,10 +451,8 @@ static void addCanonicalIVRecipes(VPlan &Plan, VPBasicBlock *HeaderVPBB, LatchDL); } -void VPlanTransforms::prepareForVectorization( - VPlan &Plan, Type *InductionTy, PredicatedScalarEvolution &PSE, - bool RequiresScalarEpilogueCheck, bool TailFolded, Loop *TheLoop, - DebugLoc IVDL, bool HasUncountableEarlyExit, VFRange &Range) { +static void addInitialSkeleton(VPlan &Plan, Type *InductionTy, DebugLoc IVDL, + PredicatedScalarEvolution &PSE, Loop *TheLoop) { VPDominatorTree VPDT; VPDT.recalculate(Plan); @@ -486,12 +478,54 @@ void VPlanTransforms::prepareForVectorization( addCanonicalIVRecipes(Plan, HeaderVPBB, LatchVPBB, InductionTy, IVDL); - [[maybe_unused]] bool HandledUncountableEarlyExit = false; + // Create SCEV and VPValue for the trip count. 
+ // We use the symbolic max backedge-taken-count, which works also when + // vectorizing loops with uncountable early exits. + const SCEV *BackedgeTakenCountSCEV = PSE.getSymbolicMaxBackedgeTakenCount(); + assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCountSCEV) && + "Invalid backedge-taken count"); + ScalarEvolution &SE = *PSE.getSE(); + const SCEV *TripCount = SE.getTripCountFromExitCount(BackedgeTakenCountSCEV, + InductionTy, TheLoop); + Plan.setTripCount( + vputils::getOrCreateVPValueForSCEVExpr(Plan, TripCount, SE)); + + VPBasicBlock *ScalarPH = Plan.createVPBasicBlock("scalar.ph"); + VPBlockUtils::connectBlocks(ScalarPH, Plan.getScalarHeader()); + + // The connection order corresponds to the operands of the conditional branch, + // with the middle block already connected to the exit block. + VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH); + // Also connect the entry block to the scalar preheader. + // TODO: Also introduce a branch recipe together with the minimum trip count + // check. + VPBlockUtils::connectBlocks(Plan.getEntry(), ScalarPH); + Plan.getEntry()->swapSuccessors(); +} + +std::unique_ptr<VPlan> +VPlanTransforms::buildVPlan0(Loop *TheLoop, LoopInfo &LI, Type *InductionTy, + DebugLoc IVDL, PredicatedScalarEvolution &PSE) { + PlainCFGBuilder Builder(TheLoop, &LI); + std::unique_ptr<VPlan> VPlan0 = Builder.buildPlainCFG(); + addInitialSkeleton(*VPlan0, InductionTy, IVDL, PSE, TheLoop); + return VPlan0; +} + +void VPlanTransforms::handleEarlyExits(VPlan &Plan, + bool HasUncountableEarlyExit, + VFRange &Range) { + auto *MiddleVPBB = cast<VPBasicBlock>( + Plan.getScalarHeader()->getSinglePredecessor()->getPredecessors()[0]); + auto *LatchVPBB = cast<VPBasicBlock>(MiddleVPBB->getSinglePredecessor()); + VPBlockBase *HeaderVPB = cast<VPBasicBlock>(LatchVPBB->getSuccessors()[1]); + // Disconnect all early exits from the loop leaving it with a single exit from // the latch. Early exits that are countable are left for a scalar epilog. The // condition of uncountable early exits (currently at most one is supported) // is fused into the latch exit, and used to branch from middle block to the // early exit destination. + [[maybe_unused]] bool HandledUncountableEarlyExit = false; for (VPIRBasicBlock *EB : Plan.getExitBlocks()) { for (VPBlockBase *Pred : to_vector(EB->getPredecessors())) { if (Pred == MiddleVPBB) @@ -500,7 +534,8 @@ void VPlanTransforms::prepareForVectorization( assert(!HandledUncountableEarlyExit && "can handle exactly one uncountable early exit"); handleUncountableEarlyExit(cast<VPBasicBlock>(Pred), EB, Plan, - HeaderVPBB, LatchVPBB, Range); + cast<VPBasicBlock>(HeaderVPB), LatchVPBB, + Range); HandledUncountableEarlyExit = true; } else { for (VPRecipeBase &R : EB->phis()) @@ -513,36 +548,18 @@ void VPlanTransforms::prepareForVectorization( assert((!HasUncountableEarlyExit || HandledUncountableEarlyExit) && "missed an uncountable exit that must be handled"); +} - // Create SCEV and VPValue for the trip count. - // We use the symbolic max backedge-taken-count, which works also when - // vectorizing loops with uncountable early exits. 
- const SCEV *BackedgeTakenCountSCEV = PSE.getSymbolicMaxBackedgeTakenCount(); - assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCountSCEV) && - "Invalid loop count"); - ScalarEvolution &SE = *PSE.getSE(); - const SCEV *TripCount = SE.getTripCountFromExitCount(BackedgeTakenCountSCEV, - InductionTy, TheLoop); - Plan.setTripCount( - vputils::getOrCreateVPValueForSCEVExpr(Plan, TripCount, SE)); - - VPBasicBlock *ScalarPH = Plan.createVPBasicBlock("scalar.ph"); - VPBlockUtils::connectBlocks(ScalarPH, Plan.getScalarHeader()); - - // The connection order corresponds to the operands of the conditional branch, - // with the middle block already connected to the exit block. - VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH); - // Also connect the entry block to the scalar preheader. - // TODO: Also introduce a branch recipe together with the minimum trip count - // check. - VPBlockUtils::connectBlocks(Plan.getEntry(), ScalarPH); - Plan.getEntry()->swapSuccessors(); - +void VPlanTransforms::addMiddleCheck(VPlan &Plan, + bool RequiresScalarEpilogueCheck, + bool TailFolded) { + auto *MiddleVPBB = cast<VPBasicBlock>( + Plan.getScalarHeader()->getSinglePredecessor()->getPredecessors()[0]); // If MiddleVPBB has a single successor then the original loop does not exit // via the latch and the single successor must be the scalar preheader. // There's no need to add a runtime check to MiddleVPBB. if (MiddleVPBB->getNumSuccessors() == 1) { - assert(MiddleVPBB->getSingleSuccessor() == ScalarPH && + assert(MiddleVPBB->getSingleSuccessor() == Plan.getScalarPreheader() && "must have ScalarPH as single successor"); return; } @@ -564,6 +581,7 @@ void VPlanTransforms::prepareForVectorization( // the corresponding compare because they may have ended up with different // line numbers and we want to avoid awkward line stepping while debugging. // E.g., if the compare has got a line number inside the loop. + auto *LatchVPBB = cast<VPBasicBlock>(MiddleVPBB->getSinglePredecessor()); DebugLoc LatchDL = LatchVPBB->getTerminator()->getDebugLoc(); VPBuilder Builder(MiddleVPBB); VPValue *Cmp; diff --git a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h index 4154720c..5ad2ac6 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h +++ b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h @@ -351,10 +351,10 @@ struct VPCostContext { TargetTransformInfo::TargetCostKind CostKind; VPCostContext(const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI, - Type *CanIVTy, LoopVectorizationCostModel &CM, + const VPlan &Plan, LoopVectorizationCostModel &CM, TargetTransformInfo::TargetCostKind CostKind) - : TTI(TTI), TLI(TLI), Types(CanIVTy), LLVMCtx(CanIVTy->getContext()), - CM(CM), CostKind(CostKind) {} + : TTI(TTI), TLI(TLI), Types(Plan), LLVMCtx(Plan.getContext()), CM(CM), + CostKind(CostKind) {} /// Return the cost for \p UI with \p VF using the legacy cost model as /// fallback until computing the cost of all recipes migrates to VPlan. 
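For context, a sketch (not part of the patch, and using ScalarEvolution directly rather than the PredicatedScalarEvolution wrapper the code goes through) of the trip-count derivation performed by addInitialSkeleton() in the VPlanConstruction.cpp hunk above: the symbolic maximum backedge-taken count is defined even for loops with uncountable early exits, and the trip count is that value plus one, evaluated in the induction type.

#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/Type.h"
using namespace llvm;

// Returns the trip-count SCEV for L, or nullptr if no usable exit count is
// known. Mirrors the backedge-taken-count to trip-count step shown above.
static const SCEV *computeTripCountSCEV(const Loop *L, ScalarEvolution &SE,
                                        Type *InductionTy) {
  const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
  if (isa<SCEVCouldNotCompute>(BTC))
    return nullptr;
  // Trip count = backedge-taken count + 1, extended or truncated to
  // InductionTy as needed.
  return SE.getTripCountFromExitCount(BTC, InductionTy, L);
}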
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h index 8818843..9f036fb 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h +++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h @@ -200,15 +200,11 @@ template <typename Ops_t, unsigned Opcode, bool Commutative, struct Recipe_match { Ops_t Ops; - Recipe_match() : Ops() { - static_assert(std::tuple_size<Ops_t>::value == 0 && - "constructor can only be used with zero operands"); - } - Recipe_match(Ops_t Ops) : Ops(Ops) {} - template <typename A_t, typename B_t> - Recipe_match(A_t A, B_t B) : Ops({A, B}) { - static_assert(std::tuple_size<Ops_t>::value == 2 && - "constructor can only be used for binary matcher"); + template <typename... OpTy> Recipe_match(OpTy... Ops) : Ops(Ops...) { + static_assert(std::tuple_size<Ops_t>::value == sizeof...(Ops) && + "number of operands in constructor doesn't match Ops_t"); + static_assert((!Commutative || std::tuple_size<Ops_t>::value == 2) && + "only binary ops can be commutative"); } bool match(const VPValue *V) const { @@ -254,7 +250,6 @@ private: // Check for recipes that do not have opcodes. if constexpr (std::is_same<RecipeTy, VPScalarIVStepsRecipe>::value || std::is_same<RecipeTy, VPCanonicalIVPHIRecipe>::value || - std::is_same<RecipeTy, VPWidenSelectRecipe>::value || std::is_same<RecipeTy, VPDerivedIVRecipe>::value || std::is_same<RecipeTy, VPWidenGEPRecipe>::value) return DefR; @@ -270,195 +265,128 @@ private: } }; -template <unsigned Opcode, typename... RecipeTys> -using ZeroOpRecipe_match = - Recipe_match<std::tuple<>, Opcode, false, RecipeTys...>; - -template <typename Op0_t, unsigned Opcode, typename... RecipeTys> -using UnaryRecipe_match = - Recipe_match<std::tuple<Op0_t>, Opcode, false, RecipeTys...>; - -template <typename Op0_t, unsigned Opcode> -using UnaryVPInstruction_match = - UnaryRecipe_match<Op0_t, Opcode, VPInstruction>; +template <unsigned Opcode, typename... OpTys> +using AllRecipe_match = + Recipe_match<std::tuple<OpTys...>, Opcode, /*Commutative*/ false, + VPWidenRecipe, VPReplicateRecipe, VPWidenCastRecipe, + VPInstruction, VPWidenSelectRecipe>; -template <unsigned Opcode> -using ZeroOpVPInstruction_match = ZeroOpRecipe_match<Opcode, VPInstruction>; +template <unsigned Opcode, typename... OpTys> +using AllRecipe_commutative_match = + Recipe_match<std::tuple<OpTys...>, Opcode, /*Commutative*/ true, + VPWidenRecipe, VPReplicateRecipe, VPInstruction>; -template <typename Op0_t, unsigned Opcode> -using AllUnaryRecipe_match = - UnaryRecipe_match<Op0_t, Opcode, VPWidenRecipe, VPReplicateRecipe, - VPWidenCastRecipe, VPInstruction>; +template <unsigned Opcode, typename... OpTys> +using VPInstruction_match = Recipe_match<std::tuple<OpTys...>, Opcode, + /*Commutative*/ false, VPInstruction>; -template <typename Op0_t, typename Op1_t, unsigned Opcode, bool Commutative, - typename... RecipeTys> -using BinaryRecipe_match = - Recipe_match<std::tuple<Op0_t, Op1_t>, Opcode, Commutative, RecipeTys...>; - -template <typename Op0_t, typename Op1_t, unsigned Opcode> -using BinaryVPInstruction_match = - BinaryRecipe_match<Op0_t, Op1_t, Opcode, /*Commutative*/ false, - VPInstruction>; - -template <typename Op0_t, typename Op1_t, typename Op2_t, unsigned Opcode, - bool Commutative, typename... 
RecipeTys> -using TernaryRecipe_match = Recipe_match<std::tuple<Op0_t, Op1_t, Op2_t>, - Opcode, Commutative, RecipeTys...>; - -template <typename Op0_t, typename Op1_t, typename Op2_t, unsigned Opcode> -using TernaryVPInstruction_match = - TernaryRecipe_match<Op0_t, Op1_t, Op2_t, Opcode, /*Commutative*/ false, - VPInstruction>; - -template <typename Op0_t, typename Op1_t, unsigned Opcode, - bool Commutative = false> -using AllBinaryRecipe_match = - BinaryRecipe_match<Op0_t, Op1_t, Opcode, Commutative, VPWidenRecipe, - VPReplicateRecipe, VPWidenCastRecipe, VPInstruction>; +template <unsigned Opcode, typename... OpTys> +inline VPInstruction_match<Opcode, OpTys...> +m_VPInstruction(const OpTys &...Ops) { + return VPInstruction_match<Opcode, OpTys...>(Ops...); +} /// BuildVector is matches only its opcode, w/o matching its operands as the /// number of operands is not fixed. -inline ZeroOpVPInstruction_match<VPInstruction::BuildVector> m_BuildVector() { - return ZeroOpVPInstruction_match<VPInstruction::BuildVector>(); -} - -template <unsigned Opcode, typename Op0_t> -inline UnaryVPInstruction_match<Op0_t, Opcode> -m_VPInstruction(const Op0_t &Op0) { - return UnaryVPInstruction_match<Op0_t, Opcode>(Op0); -} - -template <unsigned Opcode, typename Op0_t, typename Op1_t> -inline BinaryVPInstruction_match<Op0_t, Op1_t, Opcode> -m_VPInstruction(const Op0_t &Op0, const Op1_t &Op1) { - return BinaryVPInstruction_match<Op0_t, Op1_t, Opcode>(Op0, Op1); +inline VPInstruction_match<VPInstruction::BuildVector> m_BuildVector() { + return m_VPInstruction<VPInstruction::BuildVector>(); } -template <unsigned Opcode, typename Op0_t, typename Op1_t, typename Op2_t> -inline TernaryVPInstruction_match<Op0_t, Op1_t, Op2_t, Opcode> -m_VPInstruction(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2) { - return TernaryVPInstruction_match<Op0_t, Op1_t, Op2_t, Opcode>( - {Op0, Op1, Op2}); -} - -template <typename Op0_t, typename Op1_t, typename Op2_t, typename Op3_t, - unsigned Opcode, bool Commutative, typename... 
RecipeTys> -using Recipe4Op_match = Recipe_match<std::tuple<Op0_t, Op1_t, Op2_t, Op3_t>, - Opcode, Commutative, RecipeTys...>; - -template <typename Op0_t, typename Op1_t, typename Op2_t, typename Op3_t, - unsigned Opcode> -using VPInstruction4Op_match = - Recipe4Op_match<Op0_t, Op1_t, Op2_t, Op3_t, Opcode, /*Commutative*/ false, - VPInstruction>; - -template <unsigned Opcode, typename Op0_t, typename Op1_t, typename Op2_t, - typename Op3_t> -inline VPInstruction4Op_match<Op0_t, Op1_t, Op2_t, Op3_t, Opcode> -m_VPInstruction(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2, - const Op3_t &Op3) { - return VPInstruction4Op_match<Op0_t, Op1_t, Op2_t, Op3_t, Opcode>( - {Op0, Op1, Op2, Op3}); -} template <typename Op0_t> -inline UnaryVPInstruction_match<Op0_t, Instruction::Freeze> +inline VPInstruction_match<Instruction::Freeze, Op0_t> m_Freeze(const Op0_t &Op0) { return m_VPInstruction<Instruction::Freeze>(Op0); } template <typename Op0_t> -inline UnaryVPInstruction_match<Op0_t, VPInstruction::BranchOnCond> +inline VPInstruction_match<VPInstruction::BranchOnCond, Op0_t> m_BranchOnCond(const Op0_t &Op0) { return m_VPInstruction<VPInstruction::BranchOnCond>(Op0); } template <typename Op0_t> -inline UnaryVPInstruction_match<Op0_t, VPInstruction::Broadcast> +inline VPInstruction_match<VPInstruction::Broadcast, Op0_t> m_Broadcast(const Op0_t &Op0) { return m_VPInstruction<VPInstruction::Broadcast>(Op0); } template <typename Op0_t, typename Op1_t> -inline BinaryVPInstruction_match<Op0_t, Op1_t, VPInstruction::ActiveLaneMask> +inline VPInstruction_match<VPInstruction::ActiveLaneMask, Op0_t, Op1_t> m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1) { return m_VPInstruction<VPInstruction::ActiveLaneMask>(Op0, Op1); } template <typename Op0_t, typename Op1_t> -inline BinaryVPInstruction_match<Op0_t, Op1_t, VPInstruction::BranchOnCount> +inline VPInstruction_match<VPInstruction::BranchOnCount, Op0_t, Op1_t> m_BranchOnCount(const Op0_t &Op0, const Op1_t &Op1) { return m_VPInstruction<VPInstruction::BranchOnCount>(Op0, Op1); } template <unsigned Opcode, typename Op0_t> -inline AllUnaryRecipe_match<Op0_t, Opcode> m_Unary(const Op0_t &Op0) { - return AllUnaryRecipe_match<Op0_t, Opcode>(Op0); +inline AllRecipe_match<Opcode, Op0_t> m_Unary(const Op0_t &Op0) { + return AllRecipe_match<Opcode, Op0_t>(Op0); } template <typename Op0_t> -inline AllUnaryRecipe_match<Op0_t, Instruction::Trunc> -m_Trunc(const Op0_t &Op0) { +inline AllRecipe_match<Instruction::Trunc, Op0_t> m_Trunc(const Op0_t &Op0) { return m_Unary<Instruction::Trunc, Op0_t>(Op0); } template <typename Op0_t> -inline AllUnaryRecipe_match<Op0_t, Instruction::ZExt> m_ZExt(const Op0_t &Op0) { +inline AllRecipe_match<Instruction::ZExt, Op0_t> m_ZExt(const Op0_t &Op0) { return m_Unary<Instruction::ZExt, Op0_t>(Op0); } template <typename Op0_t> -inline AllUnaryRecipe_match<Op0_t, Instruction::SExt> m_SExt(const Op0_t &Op0) { +inline AllRecipe_match<Instruction::SExt, Op0_t> m_SExt(const Op0_t &Op0) { return m_Unary<Instruction::SExt, Op0_t>(Op0); } template <typename Op0_t> -inline match_combine_or<AllUnaryRecipe_match<Op0_t, Instruction::ZExt>, - AllUnaryRecipe_match<Op0_t, Instruction::SExt>> +inline match_combine_or<AllRecipe_match<Instruction::ZExt, Op0_t>, + AllRecipe_match<Instruction::SExt, Op0_t>> m_ZExtOrSExt(const Op0_t &Op0) { return m_CombineOr(m_ZExt(Op0), m_SExt(Op0)); } -template <unsigned Opcode, typename Op0_t, typename Op1_t, - bool Commutative = false> -inline AllBinaryRecipe_match<Op0_t, Op1_t, Opcode, Commutative> -m_Binary(const 
Op0_t &Op0, const Op1_t &Op1) { - return AllBinaryRecipe_match<Op0_t, Op1_t, Opcode, Commutative>(Op0, Op1); +template <unsigned Opcode, typename Op0_t, typename Op1_t> +inline AllRecipe_match<Opcode, Op0_t, Op1_t> m_Binary(const Op0_t &Op0, + const Op1_t &Op1) { + return AllRecipe_match<Opcode, Op0_t, Op1_t>(Op0, Op1); } template <unsigned Opcode, typename Op0_t, typename Op1_t> -inline AllBinaryRecipe_match<Op0_t, Op1_t, Opcode, true> +inline AllRecipe_commutative_match<Opcode, Op0_t, Op1_t> m_c_Binary(const Op0_t &Op0, const Op1_t &Op1) { - return AllBinaryRecipe_match<Op0_t, Op1_t, Opcode, true>(Op0, Op1); + return AllRecipe_commutative_match<Opcode, Op0_t, Op1_t>(Op0, Op1); } template <typename Op0_t, typename Op1_t> -inline AllBinaryRecipe_match<Op0_t, Op1_t, Instruction::Mul> -m_Mul(const Op0_t &Op0, const Op1_t &Op1) { +inline AllRecipe_match<Instruction::Mul, Op0_t, Op1_t> m_Mul(const Op0_t &Op0, + const Op1_t &Op1) { return m_Binary<Instruction::Mul, Op0_t, Op1_t>(Op0, Op1); } template <typename Op0_t, typename Op1_t> -inline AllBinaryRecipe_match<Op0_t, Op1_t, Instruction::Mul, - /* Commutative =*/true> +inline AllRecipe_commutative_match<Instruction::Mul, Op0_t, Op1_t> m_c_Mul(const Op0_t &Op0, const Op1_t &Op1) { - return m_Binary<Instruction::Mul, Op0_t, Op1_t, true>(Op0, Op1); + return m_c_Binary<Instruction::Mul, Op0_t, Op1_t>(Op0, Op1); } /// Match a binary OR operation. Note that while conceptually the operands can /// be matched commutatively, \p Commutative defaults to false in line with the /// IR-based pattern matching infrastructure. Use m_c_BinaryOr for a commutative /// version of the matcher. -template <typename Op0_t, typename Op1_t, bool Commutative = false> -inline AllBinaryRecipe_match<Op0_t, Op1_t, Instruction::Or, Commutative> +template <typename Op0_t, typename Op1_t> +inline AllRecipe_match<Instruction::Or, Op0_t, Op1_t> m_BinaryOr(const Op0_t &Op0, const Op1_t &Op1) { - return m_Binary<Instruction::Or, Op0_t, Op1_t, Commutative>(Op0, Op1); + return m_Binary<Instruction::Or, Op0_t, Op1_t>(Op0, Op1); } template <typename Op0_t, typename Op1_t> -inline AllBinaryRecipe_match<Op0_t, Op1_t, Instruction::Or, - /*Commutative*/ true> +inline AllRecipe_commutative_match<Instruction::Or, Op0_t, Op1_t> m_c_BinaryOr(const Op0_t &Op0, const Op1_t &Op1) { - return m_BinaryOr<Op0_t, Op1_t, /*Commutative*/ true>(Op0, Op1); + return m_c_Binary<Instruction::Or, Op0_t, Op1_t>(Op0, Op1); } /// ICmp_match is a variant of BinaryRecipe_match that also binds the comparison @@ -523,9 +451,9 @@ m_SpecificICmp(CmpPredicate MatchPred, const Op0_t &Op0, const Op1_t &Op1) { template <typename Op0_t, typename Op1_t> using GEPLikeRecipe_match = - BinaryRecipe_match<Op0_t, Op1_t, Instruction::GetElementPtr, false, - VPWidenRecipe, VPReplicateRecipe, VPWidenGEPRecipe, - VPInstruction>; + Recipe_match<std::tuple<Op0_t, Op1_t>, Instruction::GetElementPtr, + /*Commutative*/ false, VPWidenRecipe, VPReplicateRecipe, + VPWidenGEPRecipe, VPInstruction>; template <typename Op0_t, typename Op1_t> inline GEPLikeRecipe_match<Op0_t, Op1_t> m_GetElementPtr(const Op0_t &Op0, @@ -533,22 +461,17 @@ inline GEPLikeRecipe_match<Op0_t, Op1_t> m_GetElementPtr(const Op0_t &Op0, return GEPLikeRecipe_match<Op0_t, Op1_t>(Op0, Op1); } -template <typename Op0_t, typename Op1_t, typename Op2_t, unsigned Opcode> -using AllTernaryRecipe_match = - Recipe_match<std::tuple<Op0_t, Op1_t, Op2_t>, Opcode, false, - VPReplicateRecipe, VPInstruction, VPWidenSelectRecipe>; - template <typename Op0_t, typename Op1_t, typename 
Op2_t> -inline AllTernaryRecipe_match<Op0_t, Op1_t, Op2_t, Instruction::Select> +inline AllRecipe_match<Instruction::Select, Op0_t, Op1_t, Op2_t> m_Select(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2) { - return AllTernaryRecipe_match<Op0_t, Op1_t, Op2_t, Instruction::Select>( + return AllRecipe_match<Instruction::Select, Op0_t, Op1_t, Op2_t>( {Op0, Op1, Op2}); } template <typename Op0_t> -inline match_combine_or<UnaryVPInstruction_match<Op0_t, VPInstruction::Not>, - AllBinaryRecipe_match<int_pred_ty<is_all_ones>, Op0_t, - Instruction::Xor, true>> +inline match_combine_or<VPInstruction_match<VPInstruction::Not, Op0_t>, + AllRecipe_commutative_match< + Instruction::Xor, int_pred_ty<is_all_ones>, Op0_t>> m_Not(const Op0_t &Op0) { return m_CombineOr(m_VPInstruction<VPInstruction::Not>(Op0), m_c_Binary<Instruction::Xor>(m_AllOnes(), Op0)); @@ -556,9 +479,8 @@ m_Not(const Op0_t &Op0) { template <typename Op0_t, typename Op1_t> inline match_combine_or< - BinaryVPInstruction_match<Op0_t, Op1_t, VPInstruction::LogicalAnd>, - AllTernaryRecipe_match<Op0_t, Op1_t, specific_intval<1>, - Instruction::Select>> + VPInstruction_match<VPInstruction::LogicalAnd, Op0_t, Op1_t>, + AllRecipe_match<Instruction::Select, Op0_t, Op1_t, specific_intval<1>>> m_LogicalAnd(const Op0_t &Op0, const Op1_t &Op1) { return m_CombineOr( m_VPInstruction<VPInstruction::LogicalAnd, Op0_t, Op1_t>(Op0, Op1), @@ -566,15 +488,14 @@ m_LogicalAnd(const Op0_t &Op0, const Op1_t &Op1) { } template <typename Op0_t, typename Op1_t> -inline AllTernaryRecipe_match<Op0_t, specific_intval<1>, Op1_t, - Instruction::Select> +inline AllRecipe_match<Instruction::Select, Op0_t, specific_intval<1>, Op1_t> m_LogicalOr(const Op0_t &Op0, const Op1_t &Op1) { return m_Select(Op0, m_True(), Op1); } template <typename Op0_t, typename Op1_t, typename Op2_t> -using VPScalarIVSteps_match = - TernaryRecipe_match<Op0_t, Op1_t, Op2_t, 0, false, VPScalarIVStepsRecipe>; +using VPScalarIVSteps_match = Recipe_match<std::tuple<Op0_t, Op1_t, Op2_t>, 0, + false, VPScalarIVStepsRecipe>; template <typename Op0_t, typename Op1_t, typename Op2_t> inline VPScalarIVSteps_match<Op0_t, Op1_t, Op2_t> diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp index 862b930..cdadc33 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp @@ -238,14 +238,11 @@ void VPPredicator::convertPhisToBlends(VPBasicBlock *VPBB) { // optimizations will clean it up. 
SmallVector<VPValue *, 2> OperandsWithMask; - unsigned NumIncoming = PhiR->getNumIncoming(); - for (unsigned In = 0; In < NumIncoming; In++) { - const VPBasicBlock *Pred = PhiR->getIncomingBlock(In); - OperandsWithMask.push_back(PhiR->getIncomingValue(In)); - VPValue *EdgeMask = getEdgeMask(Pred, VPBB); + for (const auto &[InVPV, InVPBB] : PhiR->incoming_values_and_blocks()) { + OperandsWithMask.push_back(InVPV); + VPValue *EdgeMask = getEdgeMask(InVPBB, VPBB); if (!EdgeMask) { - assert(In == 0 && "Both null and non-null edge masks found"); - assert(all_equal(PhiR->operands()) && + assert(all_equal(PhiR->incoming_values()) && "Distinct incoming values with one having a full mask"); break; } diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index e971ba1..7ca9b23 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -452,6 +452,7 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) { switch (Opcode) { case VPInstruction::StepVector: + case VPInstruction::VScale: return 0; case Instruction::Alloca: case Instruction::ExtractValue: @@ -459,6 +460,8 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) { case Instruction::Load: case VPInstruction::AnyOf: case VPInstruction::BranchOnCond: + case VPInstruction::BuildStructVector: + case VPInstruction::BuildVector: case VPInstruction::CalculateTripCountMinusVF: case VPInstruction::CanonicalIVIncrementForPart: case VPInstruction::ExplicitVectorLength: @@ -517,6 +520,7 @@ bool VPInstruction::canGenerateScalarForFirstLane() const { case VPInstruction::PtrAdd: case VPInstruction::ExplicitVectorLength: case VPInstruction::AnyOf: + case VPInstruction::Not: return true; default: return false; @@ -569,7 +573,8 @@ Value *VPInstruction::generate(VPTransformState &State) { switch (getOpcode()) { case VPInstruction::Not: { - Value *A = State.get(getOperand(0)); + bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this); + Value *A = State.get(getOperand(0), OnlyFirstLaneUsed); return Builder.CreateNot(A, Name); } case Instruction::ExtractElement: { @@ -810,10 +815,18 @@ Value *VPInstruction::generate(VPTransformState &State) { Value *RdxPart = RdxParts[Part]; if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); - else - ReducedPartRdx = Builder.CreateBinOp( - (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(RK), - RdxPart, ReducedPartRdx, "bin.rdx"); + else { + Instruction::BinaryOps Opcode; + // For sub-recurrences, each UF's reduction variable is already + // negative, we need to do: reduce.add(-acc_uf0 + -acc_uf1) + if (RK == RecurKind::Sub) + Opcode = Instruction::Add; + else + Opcode = + (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(RK); + ReducedPartRdx = + Builder.CreateBinOp(Opcode, RdxPart, ReducedPartRdx, "bin.rdx"); + } } } @@ -922,6 +935,8 @@ Value *VPInstruction::generate(VPTransformState &State) { return Res; } + case VPInstruction::ResumeForEpilogue: + return State.get(getOperand(0), true); default: llvm_unreachable("Unsupported opcode for instruction"); } @@ -998,6 +1013,12 @@ InstructionCost VPInstruction::computeCost(ElementCount VF, I32Ty, {Arg0Ty, I32Ty, I1Ty}); return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind); } + case VPInstruction::ExtractLastElement: { + // Add on the cost of extracting the element. 
+ auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF); + return Ctx.TTI.getIndexedVectorInstrCostFromEnd(Instruction::ExtractElement, + VecTy, Ctx.CostKind, 0); + } case VPInstruction::ExtractPenultimateElement: if (VF == ElementCount::getScalable(1)) return InstructionCost::getInvalid(); @@ -1027,6 +1048,8 @@ bool VPInstruction::isSingleScalar() const { switch (getOpcode()) { case Instruction::PHI: case VPInstruction::ExplicitVectorLength: + case VPInstruction::ResumeForEpilogue: + case VPInstruction::VScale: return true; default: return isScalarCast(); @@ -1076,6 +1099,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const { case Instruction::FCmp: case Instruction::ICmp: case Instruction::Select: + case Instruction::PHI: case VPInstruction::AnyOf: case VPInstruction::BuildStructVector: case VPInstruction::BuildVector: @@ -1093,6 +1117,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const { case VPInstruction::WidePtrAdd: case VPInstruction::StepVector: case VPInstruction::ReductionStartVector: + case VPInstruction::VScale: return false; default: return true; @@ -1116,6 +1141,7 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const { case Instruction::Select: case Instruction::Or: case Instruction::Freeze: + case VPInstruction::Not: // TODO: Cover additional opcodes. return vputils::onlyFirstLaneUsed(this); case VPInstruction::ActiveLaneMask: @@ -1251,6 +1277,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, case VPInstruction::ReductionStartVector: O << "reduction-start-vector"; break; + case VPInstruction::ResumeForEpilogue: + O << "resume-for-epilogue"; + break; default: O << Instruction::getOpcodeName(getOpcode()); } @@ -1281,6 +1310,12 @@ void VPInstructionWithType::execute(VPTransformState &State) { State.set(this, StepVector); break; } + case VPInstruction::VScale: { + Value *VScale = State.Builder.CreateVScale(ResultTy); + State.set(this, VScale, true); + break; + } + default: llvm_unreachable("opcode not implemented yet"); } @@ -1301,6 +1336,9 @@ void VPInstructionWithType::print(raw_ostream &O, const Twine &Indent, case VPInstruction::StepVector: O << "step-vector " << *ResultTy; break; + case VPInstruction::VScale: + O << "vscale " << *ResultTy; + break; default: assert(Instruction::isCast(getOpcode()) && "unhandled opcode"); O << Instruction::getOpcodeName(getOpcode()) << " "; @@ -1434,12 +1472,12 @@ void VPIRPhi::print(raw_ostream &O, const Twine &Indent, if (getNumOperands() != 0) { O << " (extra operand" << (getNumOperands() > 1 ? "s" : "") << ": "; - interleaveComma( - enumerate(operands()), O, [this, &O, &SlotTracker](auto Op) { - Op.value()->printAsOperand(O, SlotTracker); - O << " from "; - getParent()->getPredecessors()[Op.index()]->printAsOperand(O); - }); + interleaveComma(incoming_values_and_blocks(), O, + [&O, &SlotTracker](auto Op) { + std::get<0>(Op)->printAsOperand(O, SlotTracker); + O << " from "; + std::get<1>(Op)->printAsOperand(O); + }); O << ")"; } } @@ -2934,7 +2972,6 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, // transform, avoid computing their cost multiple times for now. Ctx.SkipCostComputation.insert(UI); - TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; Type *ResultTy = Ctx.Types.inferScalarType(this); switch (UI->getOpcode()) { case Instruction::GetElementPtr: @@ -2943,6 +2980,24 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, // is scalarized or not. Therefore, we handle GEPs with the memory // instruction cost. 
return 0; + case Instruction::Call: { + if (!isSingleScalar()) { + // TODO: Handle remaining call costs here as well. + if (VF.isScalable()) + return InstructionCost::getInvalid(); + break; + } + + auto *CalledFn = + cast<Function>(getOperand(getNumOperands() - 1)->getLiveInIRValue()); + if (CalledFn->isIntrinsic()) + break; + + SmallVector<Type *, 4> Tys; + for (VPValue *ArgOp : drop_end(operands())) + Tys.push_back(Ctx.Types.inferScalarType(ArgOp)); + return Ctx.TTI.getCallInstrCost(CalledFn, ResultTy, Tys, Ctx.CostKind); + } case Instruction::Add: case Instruction::Sub: case Instruction::FAdd: @@ -2960,7 +3015,7 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, auto Op2Info = Ctx.getOperandInfo(getOperand(1)); SmallVector<const Value *, 4> Operands(UI->operand_values()); return Ctx.TTI.getArithmeticInstrCost( - UI->getOpcode(), ResultTy, CostKind, + UI->getOpcode(), ResultTy, Ctx.CostKind, {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, Op2Info, Operands, UI, &Ctx.TLI) * (isSingleScalar() ? 1 : VF.getFixedValue()); @@ -3097,9 +3152,11 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF, // Currently, ARM will use the underlying IR to calculate gather/scatter // instruction cost. const Value *Ptr = getLoadStorePointerOperand(&Ingredient); + Type *PtrTy = toVectorTy(Ptr->getType(), VF); assert(!Reverse && "Inconsecutive memory access should not have the order."); - return Ctx.TTI.getAddressComputationCost(Ty) + + return Ctx.TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, + Ctx.CostKind) + Ctx.TTI.getGatherScatterOpCost(Opcode, Ty, Ptr, IsMasked, Alignment, Ctx.CostKind, &Ingredient); } @@ -3445,6 +3502,8 @@ static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals, // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B void VPInterleaveRecipe::execute(VPTransformState &State) { assert(!State.Lane && "Interleave group being replicated."); + assert((!NeedsMaskForGaps || !State.VF.isScalable()) && + "Masking gaps for scalable vectors is not yet supported."); const InterleaveGroup<Instruction> *Group = IG; Instruction *Instr = Group->getInsertPos(); @@ -3562,8 +3621,6 @@ void VPInterleaveRecipe::execute(VPTransformState &State) { createBitMaskForGaps(State.Builder, State.VF.getKnownMinValue(), *Group); assert(((MaskForGaps != nullptr) == NeedsMaskForGaps) && "Mismatch between NeedsMaskForGaps and MaskForGaps"); - assert((!MaskForGaps || !State.VF.isScalable()) && - "masking gaps for scalable vectors is not yet supported."); ArrayRef<VPValue *> StoredValues = getStoredValues(); // Collect the stored vector from each member. SmallVector<Value *, 4> StoredVecs; diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 1c8bd6c..cff43c2 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -517,10 +517,7 @@ static void removeRedundantCanonicalIVs(VPlan &Plan) { // everything WidenNewIV's users need. That is, WidenOriginalIV will // generate a vector phi or all users of WidenNewIV demand the first lane // only. 
- if (any_of(WidenOriginalIV->users(), - [WidenOriginalIV](VPUser *U) { - return !U->usesScalars(WidenOriginalIV); - }) || + if (!vputils::onlyScalarValuesUsed(WidenOriginalIV) || vputils::onlyFirstLaneUsed(WidenNewIV)) { WidenNewIV->replaceAllUsesWith(WidenOriginalIV); WidenNewIV->eraseFromParent(); @@ -553,8 +550,22 @@ void VPlanTransforms::removeDeadRecipes(VPlan &Plan) { // The recipes in the block are processed in reverse order, to catch chains // of dead recipes. for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) { - if (isDeadRecipe(R)) + if (isDeadRecipe(R)) { R.eraseFromParent(); + continue; + } + + // Check if R is a dead VPPhi <-> update cycle and remove it. + auto *PhiR = dyn_cast<VPPhi>(&R); + if (!PhiR || PhiR->getNumOperands() != 2 || PhiR->getNumUsers() != 1) + continue; + VPValue *Incoming = PhiR->getOperand(1); + if (*PhiR->user_begin() != Incoming->getDefiningRecipe() || + Incoming->getNumUsers() != 1) + continue; + PhiR->replaceAllUsesWith(PhiR->getOperand(0)); + PhiR->eraseFromParent(); + Incoming->getDefiningRecipe()->eraseFromParent(); } } } @@ -571,8 +582,7 @@ createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind, Kind, FPBinOp, StartV, CanonicalIV, Step, "offset.idx"); // Truncate base induction if needed. - Type *CanonicalIVType = CanonicalIV->getScalarType(); - VPTypeAnalysis TypeInfo(CanonicalIVType); + VPTypeAnalysis TypeInfo(Plan); Type *ResultTy = TypeInfo.inferScalarType(BaseIV); if (TruncI) { Type *TruncTy = TruncI->getType(); @@ -868,7 +878,7 @@ optimizeLatchExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, void VPlanTransforms::optimizeInductionExitUsers( VPlan &Plan, DenseMap<VPValue *, VPValue *> &EndValues) { VPBlockBase *MiddleVPBB = Plan.getMiddleBlock(); - VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType()); + VPTypeAnalysis TypeInfo(Plan); for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) { for (VPRecipeBase &R : ExitVPBB->phis()) { auto *ExitIRI = cast<VPIRPhi>(&R); @@ -970,10 +980,11 @@ static Value *tryToFoldLiveIns(const VPRecipeBase &R, unsigned Opcode, return Folder.FoldGEP(IntegerType::getInt8Ty(TypeInfo.getContext()), Ops[0], Ops[1], cast<VPRecipeWithIRFlags>(R).getGEPNoWrapFlags()); - case Instruction::InsertElement: - return Folder.FoldInsertElement(Ops[0], Ops[1], Ops[2]); + // An extract of a live-in is an extract of a broadcast, so return the + // broadcasted element. case Instruction::ExtractElement: - return Folder.FoldExtractElement(Ops[0], Ops[1]); + assert(!Ops[0]->getType()->isVectorTy() && "Live-ins should be scalar"); + return Ops[0]; } return nullptr; } @@ -1041,7 +1052,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { #ifndef NDEBUG // Verify that the cached type info is for both A and its users is still // accurate by comparing it to freshly computed types. 
- VPTypeAnalysis TypeInfo2(Plan->getCanonicalIV()->getScalarType()); + VPTypeAnalysis TypeInfo2(*Plan); assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A)); for (VPUser *U : A->users()) { auto *R = cast<VPRecipeBase>(U); @@ -1202,9 +1213,8 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { return; } - if (match(Def, - m_VPInstruction<VPInstruction::ExtractLastElement>( - m_VPInstruction<VPInstruction::Broadcast>(m_VPValue(A))))) { + if (match(Def, m_VPInstruction<VPInstruction::ExtractLastElement>( + m_Broadcast(m_VPValue(A))))) { Def->replaceAllUsesWith(A); return; } @@ -1218,10 +1228,10 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { } } -void VPlanTransforms::simplifyRecipes(VPlan &Plan, Type &CanonicalIVTy) { +void VPlanTransforms::simplifyRecipes(VPlan &Plan) { ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT( Plan.getEntry()); - VPTypeAnalysis TypeInfo(&CanonicalIVTy); + VPTypeAnalysis TypeInfo(Plan); for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) { for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { simplifyRecipe(R, TypeInfo); @@ -1251,9 +1261,7 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) { // scalar results used. In the latter case, we would introduce extra // broadcasts. if (!vputils::isSingleScalar(RepOrWidenR) || - any_of(RepOrWidenR->users(), [RepOrWidenR](VPUser *U) { - return !U->usesScalars(RepOrWidenR); - })) + !vputils::onlyScalarValuesUsed(RepOrWidenR)) continue; auto *Clone = new VPReplicateRecipe(RepOrWidenR->getUnderlyingInstr(), @@ -1485,7 +1493,6 @@ static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, // the region, otherwise replace the terminator controlling the latch with // (BranchOnCond true). auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry()); - auto *CanIVTy = Plan.getCanonicalIV()->getScalarType(); if (all_of(Header->phis(), IsaPred<VPCanonicalIVPHIRecipe, VPEVLBasedIVPHIRecipe, VPFirstOrderRecurrencePHIRecipe, VPPhi>)) { @@ -1505,7 +1512,7 @@ static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, VPBlockUtils::connectBlocks(Preheader, Header); VPBlockUtils::connectBlocks(ExitingVPBB, Exit); - VPlanTransforms::simplifyRecipes(Plan, *CanIVTy); + VPlanTransforms::simplifyRecipes(Plan); } else { // The vector region contains header phis for which we cannot remove the // loop region yet. @@ -1748,7 +1755,8 @@ void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) { if (!PhiR) continue; RecurKind RK = PhiR->getRecurrenceKind(); - if (RK != RecurKind::Add && RK != RecurKind::Mul) + if (RK != RecurKind::Add && RK != RecurKind::Mul && RK != RecurKind::Sub && + RK != RecurKind::AddChainWithSubs) continue; for (VPUser *U : collectUsersRecursively(PhiR)) @@ -1799,8 +1807,7 @@ void VPlanTransforms::truncateToMinimalBitwidths( // other uses have different types for their operands, making them invalidly // typed. 
DenseMap<VPValue *, VPWidenCastRecipe *> ProcessedTruncs; - Type *CanonicalIVType = Plan.getCanonicalIV()->getScalarType(); - VPTypeAnalysis TypeInfo(CanonicalIVType); + VPTypeAnalysis TypeInfo(Plan); VPBasicBlock *PH = Plan.getVectorPreheader(); for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>( vp_depth_first_deep(Plan.getVectorLoopRegion()))) { @@ -1828,8 +1835,7 @@ void VPlanTransforms::truncateToMinimalBitwidths( assert(OldResTy->isIntegerTy() && "only integer types supported"); (void)OldResSizeInBits; - LLVMContext &Ctx = CanonicalIVType->getContext(); - auto *NewResTy = IntegerType::get(Ctx, NewResSizeInBits); + auto *NewResTy = IntegerType::get(Plan.getContext(), NewResSizeInBits); // Any wrapping introduced by shrinking this operation shouldn't be // considered undefined behavior. So, we can't unconditionally copy @@ -1920,13 +1926,13 @@ void VPlanTransforms::optimize(VPlan &Plan) { runPass(removeRedundantCanonicalIVs, Plan); runPass(removeRedundantInductionCasts, Plan); - runPass(simplifyRecipes, Plan, *Plan.getCanonicalIV()->getScalarType()); + runPass(simplifyRecipes, Plan); runPass(simplifyBlends, Plan); runPass(removeDeadRecipes, Plan); runPass(narrowToSingleScalarRecipes, Plan); runPass(legalizeAndOptimizeInductions, Plan); runPass(removeRedundantExpandSCEVRecipes, Plan); - runPass(simplifyRecipes, Plan, *Plan.getCanonicalIV()->getScalarType()); + runPass(simplifyRecipes, Plan); runPass(removeBranchOnConst, Plan); runPass(removeDeadRecipes, Plan); @@ -2039,11 +2045,11 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch( return LaneMaskPhi; } -/// Collect all VPValues representing a header mask through the (ICMP_ULE, -/// WideCanonicalIV, backedge-taken-count) pattern. +/// Collect the header mask with the pattern: +/// (ICMP_ULE, WideCanonicalIV, backedge-taken-count) /// TODO: Introduce explicit recipe for header-mask instead of searching /// for the header-mask pattern manually. -static SmallVector<VPValue *> collectAllHeaderMasks(VPlan &Plan) { +static VPSingleDefRecipe *findHeaderMask(VPlan &Plan) { SmallVector<VPValue *> WideCanonicalIVs; auto *FoundWidenCanonicalIVUser = find_if(Plan.getCanonicalIV()->users(), @@ -2067,21 +2073,22 @@ static SmallVector<VPValue *> collectAllHeaderMasks(VPlan &Plan) { WideCanonicalIVs.push_back(WidenOriginalIV); } - // Walk users of wide canonical IVs and collect to all compares of the form + // Walk users of wide canonical IVs and find the single compare of the form // (ICMP_ULE, WideCanonicalIV, backedge-taken-count). 
- SmallVector<VPValue *> HeaderMasks; + VPSingleDefRecipe *HeaderMask = nullptr; for (auto *Wide : WideCanonicalIVs) { for (VPUser *U : SmallVector<VPUser *>(Wide->users())) { - auto *HeaderMask = dyn_cast<VPInstruction>(U); - if (!HeaderMask || !vputils::isHeaderMask(HeaderMask, Plan)) + auto *VPI = dyn_cast<VPInstruction>(U); + if (!VPI || !vputils::isHeaderMask(VPI, Plan)) continue; - assert(HeaderMask->getOperand(0) == Wide && + assert(VPI->getOperand(0) == Wide && "WidenCanonicalIV must be the first operand of the compare"); - HeaderMasks.push_back(HeaderMask); + assert(!HeaderMask && "Multiple header masks found?"); + HeaderMask = VPI; } } - return HeaderMasks; + return HeaderMask; } void VPlanTransforms::addActiveLaneMask( @@ -2097,6 +2104,7 @@ void VPlanTransforms::addActiveLaneMask( [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); }); assert(FoundWidenCanonicalIVUser && "Must have widened canonical IV when tail folding!"); + VPSingleDefRecipe *HeaderMask = findHeaderMask(Plan); auto *WideCanonicalIV = cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser); VPSingleDefRecipe *LaneMask; @@ -2110,11 +2118,11 @@ void VPlanTransforms::addActiveLaneMask( "active.lane.mask"); } - // Walk users of WideCanonicalIV and replace all compares of the form - // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an - // active-lane-mask. - for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) - HeaderMask->replaceAllUsesWith(LaneMask); + // Walk users of WideCanonicalIV and replace the header mask of the form + // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an active-lane-mask, + // removing the old one to ensure there is always only a single header mask. + HeaderMask->replaceAllUsesWith(LaneMask); + HeaderMask->eraseFromParent(); } /// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding @@ -2130,6 +2138,8 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask, VPRecipeBase &CurRecipe, VPTypeAnalysis &TypeInfo, VPValue &AllOneMask, VPValue &EVL) { + // FIXME: Don't transform recipes to EVL recipes if they're not masked by the + // header mask. auto GetNewMask = [&](VPValue *OrigMask) -> VPValue * { assert(OrigMask && "Unmasked recipe when folding tail"); // HeaderMask will be handled using EVL. @@ -2139,14 +2149,35 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask, return HeaderMask == OrigMask ? nullptr : OrigMask; }; + /// Adjust any end pointers so that they point to the end of EVL lanes not VF. 
+ auto GetNewAddr = [&CurRecipe, &EVL](VPValue *Addr) -> VPValue * { + auto *EndPtr = dyn_cast<VPVectorEndPointerRecipe>(Addr); + if (!EndPtr) + return Addr; + assert(EndPtr->getOperand(1) == &EndPtr->getParent()->getPlan()->getVF() && + "VPVectorEndPointerRecipe with non-VF VF operand?"); + assert( + all_of(EndPtr->users(), + [](VPUser *U) { + return cast<VPWidenMemoryRecipe>(U)->isReverse(); + }) && + "VPVectorEndPointerRecipe not used by reversed widened memory recipe?"); + VPVectorEndPointerRecipe *EVLAddr = EndPtr->clone(); + EVLAddr->insertBefore(&CurRecipe); + EVLAddr->setOperand(1, &EVL); + return EVLAddr; + }; + return TypeSwitch<VPRecipeBase *, VPRecipeBase *>(&CurRecipe) .Case<VPWidenLoadRecipe>([&](VPWidenLoadRecipe *L) { VPValue *NewMask = GetNewMask(L->getMask()); - return new VPWidenLoadEVLRecipe(*L, EVL, NewMask); + VPValue *NewAddr = GetNewAddr(L->getAddr()); + return new VPWidenLoadEVLRecipe(*L, NewAddr, EVL, NewMask); }) .Case<VPWidenStoreRecipe>([&](VPWidenStoreRecipe *S) { VPValue *NewMask = GetNewMask(S->getMask()); - return new VPWidenStoreEVLRecipe(*S, EVL, NewMask); + VPValue *NewAddr = GetNewAddr(S->getAddr()); + return new VPWidenStoreEVLRecipe(*S, NewAddr, EVL, NewMask); }) .Case<VPReductionRecipe>([&](VPReductionRecipe *Red) { VPValue *NewMask = GetNewMask(Red->getCondOp()); @@ -2172,9 +2203,7 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask, /// Replace recipes with their EVL variants. static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { - Type *CanonicalIVType = Plan.getCanonicalIV()->getScalarType(); - VPTypeAnalysis TypeInfo(CanonicalIVType); - LLVMContext &Ctx = CanonicalIVType->getContext(); + VPTypeAnalysis TypeInfo(Plan); VPValue *AllOneMask = Plan.getTrue(); VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion(); VPBasicBlock *Header = LoopRegion->getEntryBasicBlock(); @@ -2183,7 +2212,9 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { IsaPred<VPVectorEndPointerRecipe, VPScalarIVStepsRecipe, VPWidenIntOrFpInductionRecipe>) && "User of VF that we can't transform to EVL."); - Plan.getVF().replaceAllUsesWith(&EVL); + Plan.getVF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) { + return isa<VPWidenIntOrFpInductionRecipe, VPScalarIVStepsRecipe>(U); + }); assert(all_of(Plan.getVFxUF().users(), [&Plan](VPUser *U) { @@ -2213,9 +2244,9 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { VPValue *MaxEVL = &Plan.getVF(); // Emit VPScalarCastRecipe in preheader if VF is not a 32 bits integer. 
VPBuilder Builder(LoopRegion->getPreheaderVPBB()); - MaxEVL = Builder.createScalarZExtOrTrunc(MaxEVL, Type::getInt32Ty(Ctx), - TypeInfo.inferScalarType(MaxEVL), - DebugLoc()); + MaxEVL = Builder.createScalarZExtOrTrunc( + MaxEVL, Type::getInt32Ty(Plan.getContext()), + TypeInfo.inferScalarType(MaxEVL), DebugLoc()); Builder.setInsertPoint(Header, Header->getFirstNonPhi()); VPValue *PrevEVL = @@ -2230,7 +2261,7 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { m_VPValue(V1), m_VPValue(V2)))) continue; VPValue *Imm = Plan.getOrAddLiveIn( - ConstantInt::getSigned(Type::getInt32Ty(Ctx), -1)); + ConstantInt::getSigned(Type::getInt32Ty(Plan.getContext()), -1)); VPWidenIntrinsicRecipe *VPSplice = new VPWidenIntrinsicRecipe( Intrinsic::experimental_vp_splice, {V1, V2, Imm, AllOneMask, PrevEVL, &EVL}, @@ -2242,47 +2273,51 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { } } + VPValue *HeaderMask = findHeaderMask(Plan); + if (!HeaderMask) + return; + + // Replace header masks with a mask equivalent to predicating by EVL: + // + // icmp ule widen-canonical-iv backedge-taken-count + // -> + // icmp ult step-vector, EVL + VPRecipeBase *EVLR = EVL.getDefiningRecipe(); + VPBuilder Builder(EVLR->getParent(), std::next(EVLR->getIterator())); + Type *EVLType = TypeInfo.inferScalarType(&EVL); + VPValue *EVLMask = Builder.createICmp( + CmpInst::ICMP_ULT, + Builder.createNaryOp(VPInstruction::StepVector, {}, EVLType), &EVL); + HeaderMask->replaceAllUsesWith(EVLMask); + ToErase.push_back(HeaderMask->getDefiningRecipe()); + // Try to optimize header mask recipes away to their EVL variants. - for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) { - // TODO: Split optimizeMaskToEVL out and move into - // VPlanTransforms::optimize. transformRecipestoEVLRecipes should be run in - // tryToBuildVPlanWithVPRecipes beforehand. - for (VPUser *U : collectUsersRecursively(HeaderMask)) { - auto *CurRecipe = cast<VPRecipeBase>(U); - VPRecipeBase *EVLRecipe = - optimizeMaskToEVL(HeaderMask, *CurRecipe, TypeInfo, *AllOneMask, EVL); - if (!EVLRecipe) - continue; + // TODO: Split optimizeMaskToEVL out and move into + // VPlanTransforms::optimize. transformRecipestoEVLRecipes should be run in + // tryToBuildVPlanWithVPRecipes beforehand. 
+ for (VPUser *U : collectUsersRecursively(EVLMask)) { + auto *CurRecipe = cast<VPRecipeBase>(U); + VPRecipeBase *EVLRecipe = + optimizeMaskToEVL(EVLMask, *CurRecipe, TypeInfo, *AllOneMask, EVL); + if (!EVLRecipe) + continue; - [[maybe_unused]] unsigned NumDefVal = EVLRecipe->getNumDefinedValues(); - assert(NumDefVal == CurRecipe->getNumDefinedValues() && - "New recipe must define the same number of values as the " - "original."); - assert( - NumDefVal <= 1 && - "Only supports recipes with a single definition or without users."); - EVLRecipe->insertBefore(CurRecipe); - if (isa<VPSingleDefRecipe, VPWidenLoadEVLRecipe>(EVLRecipe)) { - VPValue *CurVPV = CurRecipe->getVPSingleValue(); - CurVPV->replaceAllUsesWith(EVLRecipe->getVPSingleValue()); - } - ToErase.push_back(CurRecipe); + [[maybe_unused]] unsigned NumDefVal = EVLRecipe->getNumDefinedValues(); + assert(NumDefVal == CurRecipe->getNumDefinedValues() && + "New recipe must define the same number of values as the " + "original."); + assert(NumDefVal <= 1 && + "Only supports recipes with a single definition or without users."); + EVLRecipe->insertBefore(CurRecipe); + if (isa<VPSingleDefRecipe, VPWidenLoadEVLRecipe>(EVLRecipe)) { + VPValue *CurVPV = CurRecipe->getVPSingleValue(); + CurVPV->replaceAllUsesWith(EVLRecipe->getVPSingleValue()); } - - // Replace header masks with a mask equivalent to predicating by EVL: - // - // icmp ule widen-canonical-iv backedge-taken-count - // -> - // icmp ult step-vector, EVL - VPRecipeBase *EVLR = EVL.getDefiningRecipe(); - VPBuilder Builder(EVLR->getParent(), std::next(EVLR->getIterator())); - Type *EVLType = TypeInfo.inferScalarType(&EVL); - VPValue *EVLMask = Builder.createICmp( - CmpInst::ICMP_ULT, - Builder.createNaryOp(VPInstruction::StepVector, {}, EVLType), &EVL); - HeaderMask->replaceAllUsesWith(EVLMask); - ToErase.push_back(HeaderMask->getDefiningRecipe()); + ToErase.push_back(CurRecipe); } + // Remove dead EVL mask. + if (EVLMask->getNumUsers() == 0) + ToErase.push_back(EVLMask->getDefiningRecipe()); for (VPRecipeBase *R : reverse(ToErase)) { SmallVector<VPValue *> PossiblyDead(R->operands()); @@ -2368,7 +2403,7 @@ void VPlanTransforms::addExplicitVectorLength( Builder.setInsertPoint(CanonicalIVIncrement); VPValue *OpVPEVL = VPEVL; - auto *I32Ty = Type::getInt32Ty(CanIVTy->getContext()); + auto *I32Ty = Type::getInt32Ty(Plan.getContext()); OpVPEVL = Builder.createScalarZExtOrTrunc( OpVPEVL, CanIVTy, I32Ty, CanonicalIVIncrement->getDebugLoc()); @@ -2579,10 +2614,10 @@ void VPlanTransforms::createInterleaveGroups( auto *InsertPos = cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IRInsertPos)); - bool InBounds = false; + GEPNoWrapFlags NW = GEPNoWrapFlags::none(); if (auto *Gep = dyn_cast<GetElementPtrInst>( getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts())) - InBounds = Gep->isInBounds(); + NW = Gep->getNoWrapFlags().withoutNoUnsignedWrap(); // Get or create the start address for the interleave group. auto *Start = @@ -2606,8 +2641,7 @@ void VPlanTransforms::createInterleaveGroups( VPValue *OffsetVPV = Plan.getOrAddLiveIn(ConstantInt::get(Plan.getContext(), -Offset)); VPBuilder B(InsertPos); - Addr = InBounds ? B.createInBoundsPtrAdd(InsertPos->getAddr(), OffsetVPV) - : B.createPtrAdd(InsertPos->getAddr(), OffsetVPV); + Addr = B.createNoWrapPtrAdd(InsertPos->getAddr(), OffsetVPV, NW); } // If the group is reverse, adjust the index to refer to the last vector // lane instead of the first. 
We adjust the index from the first vector @@ -2616,9 +2650,7 @@ void VPlanTransforms::createInterleaveGroups( if (IG->isReverse()) { auto *ReversePtr = new VPVectorEndPointerRecipe( Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos), - -(int64_t)IG->getFactor(), - InBounds ? GEPNoWrapFlags::inBounds() : GEPNoWrapFlags::none(), - InsertPos->getDebugLoc()); + -(int64_t)IG->getFactor(), NW, InsertPos->getDebugLoc()); ReversePtr->insertBefore(InsertPos); Addr = ReversePtr; } @@ -2711,7 +2743,7 @@ expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR, // Construct the initial value of the vector IV in the vector loop preheader. Type *IVIntTy = - IntegerType::get(StepTy->getContext(), StepTy->getScalarSizeInBits()); + IntegerType::get(Plan->getContext(), StepTy->getScalarSizeInBits()); VPValue *Init = Builder.createNaryOp(VPInstruction::StepVector, {}, IVIntTy); if (StepTy->isFloatingPointTy()) Init = Builder.createWidenCast(Instruction::UIToFP, Init, StepTy); @@ -2838,9 +2870,8 @@ void VPlanTransforms::dissolveLoopRegions(VPlan &Plan) { R->dissolveToCFGLoop(); } -void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan, - Type &CanonicalIVTy) { - VPTypeAnalysis TypeInfo(&CanonicalIVTy); +void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan) { + VPTypeAnalysis TypeInfo(Plan); SmallVector<VPRecipeBase *> ToRemove; for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>( vp_depth_first_deep(Plan.getEntry()))) { @@ -3204,8 +3235,7 @@ void VPlanTransforms::materializeBroadcasts(VPlan &Plan) { auto *VectorPreheader = Plan.getVectorPreheader(); for (VPValue *VPV : VPValues) { - if (all_of(VPV->users(), - [VPV](VPUser *U) { return U->usesScalars(VPV); }) || + if (vputils::onlyScalarValuesUsed(VPV) || (VPV->isLiveIn() && VPV->getLiveInIRValue() && isa<Constant>(VPV->getLiveInIRValue()))) continue; @@ -3278,6 +3308,149 @@ void VPlanTransforms::materializeBackedgeTakenCount(VPlan &Plan, BTC->replaceAllUsesWith(TCMO); } +void VPlanTransforms::materializeBuildVectors(VPlan &Plan) { + if (Plan.hasScalarVFOnly()) + return; + + VPTypeAnalysis TypeInfo(Plan); + VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion(); + auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>( + vp_depth_first_shallow(Plan.getEntry())); + auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>( + vp_depth_first_shallow(LoopRegion->getEntry())); + // Materialize Build(Struct)Vector for all replicating VPReplicateRecipes, + // excluding ones in replicate regions. Those are not materialized explicitly + // yet. Those vector users are still handled in VPReplicateRegion::execute(), + // via shouldPack(). + // TODO: materialize build vectors for replicating recipes in replicating + // regions. + // TODO: materialize build vectors for VPInstructions. + for (VPBasicBlock *VPBB : + concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion)) { + for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { + auto *RepR = dyn_cast<VPReplicateRecipe>(&R); + auto UsesVectorOrInsideReplicateRegion = [RepR, LoopRegion](VPUser *U) { + VPRegionBlock *ParentRegion = + cast<VPRecipeBase>(U)->getParent()->getParent(); + return !U->usesScalars(RepR) || ParentRegion != LoopRegion; + }; + if (!RepR || RepR->isSingleScalar() || + none_of(RepR->users(), UsesVectorOrInsideReplicateRegion)) + continue; + + Type *ScalarTy = TypeInfo.inferScalarType(RepR); + unsigned Opcode = ScalarTy->isStructTy() + ? 
VPInstruction::BuildStructVector + : VPInstruction::BuildVector; + auto *BuildVector = new VPInstruction(Opcode, {RepR}); + BuildVector->insertAfter(RepR); + + RepR->replaceUsesWithIf( + BuildVector, [BuildVector, &UsesVectorOrInsideReplicateRegion]( + VPUser &U, unsigned) { + return &U != BuildVector && UsesVectorOrInsideReplicateRegion(&U); + }); + } + } +} + +void VPlanTransforms::materializeVectorTripCount(VPlan &Plan, + VPBasicBlock *VectorPHVPBB, + bool TailByMasking, + bool RequiresScalarEpilogue) { + VPValue &VectorTC = Plan.getVectorTripCount(); + assert(VectorTC.isLiveIn() && "vector-trip-count must be a live-in"); + // There's nothing to do if there are no users of the vector trip count or its + // IR value has already been set. + if (VectorTC.getNumUsers() == 0 || VectorTC.getLiveInIRValue()) + return; + + VPValue *TC = Plan.getTripCount(); + Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(TC); + VPBuilder Builder(VectorPHVPBB, VectorPHVPBB->begin()); + VPValue *Step = &Plan.getVFxUF(); + + // If the tail is to be folded by masking, round the number of iterations N + // up to a multiple of Step instead of rounding down. This is done by first + // adding Step-1 and then rounding down. Note that it's ok if this addition + // overflows: the vector induction variable will eventually wrap to zero given + // that it starts at zero and its Step is a power of two; the loop will then + // exit, with the last early-exit vector comparison also producing all-true. + // For scalable vectors the VF is not guaranteed to be a power of 2, but this + // is accounted for in emitIterationCountCheck that adds an overflow check. + if (TailByMasking) { + TC = Builder.createNaryOp( + Instruction::Add, + {TC, Builder.createNaryOp( + Instruction::Sub, + {Step, Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 1))})}, + DebugLoc::getCompilerGenerated(), "n.rnd.up"); + } + + // Now we need to generate the expression for the part of the loop that the + // vectorized body will execute. This is equal to N - (N % Step) if scalar + // iterations are not required for correctness, or N - Step, otherwise. Step + // is equal to the vectorization factor (number of SIMD elements) times the + // unroll factor (number of SIMD instructions). + VPValue *R = + Builder.createNaryOp(Instruction::URem, {TC, Step}, + DebugLoc::getCompilerGenerated(), "n.mod.vf"); + + // There are cases where we *must* run at least one iteration in the remainder + // loop. See the cost model for when this can happen. If the step evenly + // divides the trip count, we set the remainder to be equal to the step. If + // the step does not evenly divide the trip count, no adjustment is necessary + // since there will already be scalar iterations. Note that the minimum + // iterations check ensures that N >= Step. 
+ if (RequiresScalarEpilogue) { + assert(!TailByMasking && + "requiring scalar epilogue is not supported with tail folding"); + VPValue *IsZero = Builder.createICmp( + CmpInst::ICMP_EQ, R, Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 0))); + R = Builder.createSelect(IsZero, Step, R); + } + + VPValue *Res = Builder.createNaryOp( + Instruction::Sub, {TC, R}, DebugLoc::getCompilerGenerated(), "n.vec"); + VectorTC.replaceAllUsesWith(Res); +} + +void VPlanTransforms::materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH, + ElementCount VFEC) { + VPBuilder Builder(VectorPH, VectorPH->begin()); + Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount()); + VPValue &VF = Plan.getVF(); + VPValue &VFxUF = Plan.getVFxUF(); + // Note that after the transform, Plan.getVF and Plan.getVFxUF should not be + // used. + // TODO: Assert that they aren't used. + + // If there are no users of the runtime VF, compute VFxUF by constant folding + // the multiplication of VF and UF. + if (VF.getNumUsers() == 0) { + VPValue *RuntimeVFxUF = + Builder.createElementCount(TCTy, VFEC * Plan.getUF()); + VFxUF.replaceAllUsesWith(RuntimeVFxUF); + return; + } + + // For users of the runtime VF, compute it as VF * vscale, and VFxUF as (VF * + // vscale) * UF. + VPValue *RuntimeVF = Builder.createElementCount(TCTy, VFEC); + if (!vputils::onlyScalarValuesUsed(&VF)) { + VPValue *BC = Builder.createNaryOp(VPInstruction::Broadcast, RuntimeVF); + VF.replaceUsesWithIf( + BC, [&VF](VPUser &U, unsigned) { return !U.usesScalars(&VF); }); + } + VF.replaceAllUsesWith(RuntimeVF); + + VPValue *UF = Plan.getOrAddLiveIn(ConstantInt::get(TCTy, Plan.getUF())); + VPValue *MulByUF = Plan.getUF() == 1 ? RuntimeVF + : Builder.createNaryOp(Instruction::Mul, + {RuntimeVF, UF}); + VFxUF.replaceAllUsesWith(MulByUF); +} + /// Returns true if \p V is VPWidenLoadRecipe or VPInterleaveRecipe that can be /// converted to a narrower recipe. \p V is used by a wide recipe that feeds a /// store interleave group at index \p Idx, \p WideMember0 is the recipe feeding
- LLVM_ABI_FOR_TEST static void prepareForVectorization( - VPlan &Plan, Type *InductionTy, PredicatedScalarEvolution &PSE, - bool RequiresScalarEpilogueCheck, bool TailFolded, Loop *TheLoop, - DebugLoc IVDL, bool HasUncountableExit, VFRange &Range); + /// Create a base VPlan0, serving as the common starting point for all later + /// candidates. It consists of an initial plain CFG loop with loop blocks from + /// \p TheLoop being directly translated to VPBasicBlocks with VPInstruction + /// corresponding to the input IR. + /// + /// The created loop is wrapped in an initial skeleton to facilitate + /// vectorization, consisting of a vector pre-header, an exit block for the + /// main vector loop (middle.block) and a new block as preheader of the scalar + /// loop (scalar.ph). It also adds a canonical IV and its increment, using \p + /// InductionTy and \p IVDL, and creates a VPValue expression for the original + /// trip count. + LLVM_ABI_FOR_TEST static std::unique_ptr<VPlan> + buildVPlan0(Loop *TheLoop, LoopInfo &LI, Type *InductionTy, DebugLoc IVDL, + PredicatedScalarEvolution &PSE); + + /// Update \p Plan to account for all early exits. + LLVM_ABI_FOR_TEST static void + handleEarlyExits(VPlan &Plan, bool HasUncountableExit, VFRange &Range); + + /// If a check is needed to guard executing the scalar epilogue loop, it will + /// be added to the middle block. + LLVM_ABI_FOR_TEST static void addMiddleCheck(VPlan &Plan, + bool RequiresScalarEpilogueCheck, + bool TailFolded); /// Replace loops in \p Plan's flat CFG with VPRegionBlocks, turning \p Plan's /// flat CFG into a hierarchical CFG. @@ -113,7 +122,7 @@ struct VPlanTransforms { static void clearReductionWrapFlags(VPlan &Plan); /// Explicitly unroll \p Plan by \p UF. - static void unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx); + static void unrollByUF(VPlan &Plan, unsigned UF); /// Replace each VPReplicateRecipe outside on any replicate region in \p Plan /// with \p VF single-scalar recipes. @@ -220,9 +229,8 @@ struct VPlanTransforms { /// EVLIVInc, TripCount). static void canonicalizeEVLLoops(VPlan &Plan); - /// Lower abstract recipes to concrete ones, that can be codegen'd. Use \p - /// CanonicalIVTy as type for all un-typed live-ins in VPTypeAnalysis. - static void convertToConcreteRecipes(VPlan &Plan, Type &CanonicalIVTy); + /// Lower abstract recipes to concrete ones, that can be codegen'd. + static void convertToConcreteRecipes(VPlan &Plan); /// This function converts initial recipes to the abstract recipes and clamps /// \p Range based on cost model for following optimizations and cost @@ -231,9 +239,8 @@ struct VPlanTransforms { static void convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx, VFRange &Range); - /// Perform instcombine-like simplifications on recipes in \p Plan. Use \p - /// CanonicalIVTy as type for all un-typed live-ins in VPTypeAnalysis. - static void simplifyRecipes(VPlan &Plan, Type &CanonicalIVTy); + /// Perform instcombine-like simplifications on recipes in \p Plan. + static void simplifyRecipes(VPlan &Plan); /// Remove BranchOnCond recipes with true or false conditions together with /// removing dead edges to their successors. @@ -256,11 +263,25 @@ struct VPlanTransforms { unsigned BestUF, PredicatedScalarEvolution &PSE); + /// Materialize vector trip count computations to a set of VPInstructions. 
+ static void materializeVectorTripCount(VPlan &Plan, + VPBasicBlock *VectorPHVPBB, + bool TailByMasking, + bool RequiresScalarEpilogue); + /// Materialize the backedge-taken count to be computed explicitly using /// VPInstructions. static void materializeBackedgeTakenCount(VPlan &Plan, VPBasicBlock *VectorPH); + /// Add explicit Build[Struct]Vector recipes that combine multiple scalar + /// values into single vectors. + static void materializeBuildVectors(VPlan &Plan); + + /// Materialize VF and VFxUF to be computed explicitly using VPInstructions. + static void materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH, + ElementCount VF); + /// Try to convert a plan with interleave groups with VF elements to a plan /// with the interleave groups replaced by wide loads and stores processing VF /// elements, if all transformed interleave groups access the full vector diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp index fc072de..62fd83a 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp @@ -74,8 +74,7 @@ class UnrollState { } public: - UnrollState(VPlan &Plan, unsigned UF, LLVMContext &Ctx) - : Plan(Plan), UF(UF), TypeInfo(Plan.getCanonicalIV()->getScalarType()) {} + UnrollState(VPlan &Plan, unsigned UF) : Plan(Plan), UF(UF), TypeInfo(Plan) {} void unrollBlock(VPBlockBase *VPB); @@ -409,7 +408,7 @@ void UnrollState::unrollBlock(VPBlockBase *VPB) { } } -void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx) { +void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF) { assert(UF > 0 && "Unroll factor must be positive"); Plan.setUF(UF); auto Cleanup = make_scope_exit([&Plan]() { @@ -431,7 +430,7 @@ void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx) { return; } - UnrollState Unroller(Plan, UF, Ctx); + UnrollState Unroller(Plan, UF); // Iterate over all blocks in the plan starting from Entry, and unroll // recipes inside them. This includes the vector preheader and middle blocks, @@ -465,10 +464,12 @@ void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx) { VPlanTransforms::removeDeadRecipes(Plan); } -/// Create a single-scalar clone of \p RepR for lane \p Lane. -static VPReplicateRecipe *cloneForLane(VPlan &Plan, VPBuilder &Builder, - Type *IdxTy, VPReplicateRecipe *RepR, - VPLane Lane) { +/// Create a single-scalar clone of \p RepR for lane \p Lane. Use \p +/// Def2LaneDefs to look up scalar definitions for operands of \RepR. +static VPReplicateRecipe * +cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy, + VPReplicateRecipe *RepR, VPLane Lane, + const DenseMap<VPValue *, SmallVector<VPValue *>> &Def2LaneDefs) { // Collect the operands at Lane, creating extracts as needed. SmallVector<VPValue *> NewOps; for (VPValue *Op : RepR->operands()) { @@ -481,6 +482,14 @@ static VPReplicateRecipe *cloneForLane(VPlan &Plan, VPBuilder &Builder, Builder.createNaryOp(VPInstruction::ExtractLastElement, {Op})); continue; } + // If Op is a definition that has been unrolled, directly use the clone for + // the corresponding lane. + auto LaneDefs = Def2LaneDefs.find(Op); + if (LaneDefs != Def2LaneDefs.end()) { + NewOps.push_back(LaneDefs->second[Lane.getKnownLane()]); + continue; + } + // Look through buildvector to avoid unnecessary extracts. 
if (match(Op, m_BuildVector())) { NewOps.push_back( @@ -513,6 +522,13 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) { vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry())); auto VPBBsToUnroll = concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion); + // A mapping of current VPValue definitions to collections of new VPValues + // defined per lane. Serves to hook-up potential users of current VPValue + // definition that are replicated-per-VF later. + DenseMap<VPValue *, SmallVector<VPValue *>> Def2LaneDefs; + // The removal of current recipes being replaced by new ones needs to be + // delayed after Def2LaneDefs is no longer in use. + SmallVector<VPRecipeBase *> ToRemove; for (VPBasicBlock *VPBB : VPBBsToUnroll) { for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { auto *RepR = dyn_cast<VPReplicateRecipe>(&R); @@ -524,12 +540,12 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) { if (isa<StoreInst>(RepR->getUnderlyingInstr()) && vputils::isSingleScalar(RepR->getOperand(1))) { // Stores to invariant addresses need to store the last lane only. - cloneForLane(Plan, Builder, IdxTy, RepR, - VPLane::getLastLaneForVF(VF)); + cloneForLane(Plan, Builder, IdxTy, RepR, VPLane::getLastLaneForVF(VF), + Def2LaneDefs); } else { // Create single-scalar version of RepR for all lanes. for (unsigned I = 0; I != VF.getKnownMinValue(); ++I) - cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I)); + cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I), Def2LaneDefs); } RepR->eraseFromParent(); continue; @@ -537,23 +553,33 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) { /// Create single-scalar version of RepR for all lanes. SmallVector<VPValue *> LaneDefs; for (unsigned I = 0; I != VF.getKnownMinValue(); ++I) - LaneDefs.push_back(cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I))); + LaneDefs.push_back( + cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I), Def2LaneDefs)); + Def2LaneDefs[RepR] = LaneDefs; /// Users that only demand the first lane can use the definition for lane /// 0. RepR->replaceUsesWithIf(LaneDefs[0], [RepR](VPUser &U, unsigned) { return U.onlyFirstLaneUsed(RepR); }); - // If needed, create a Build(Struct)Vector recipe to insert the scalar - // lane values into a vector. - Type *ResTy = RepR->getUnderlyingInstr()->getType(); - VPValue *VecRes = Builder.createNaryOp( - ResTy->isStructTy() ? VPInstruction::BuildStructVector - : VPInstruction::BuildVector, - LaneDefs); - RepR->replaceAllUsesWith(VecRes); - RepR->eraseFromParent(); + // Update each build vector user that currently has RepR as its only + // operand, to have all LaneDefs as its operands. 
+ for (VPUser *U : to_vector(RepR->users())) { + auto *VPI = dyn_cast<VPInstruction>(U); + if (!VPI || (VPI->getOpcode() != VPInstruction::BuildVector && + VPI->getOpcode() != VPInstruction::BuildStructVector)) + continue; + assert(VPI->getNumOperands() == 1 && + "Build(Struct)Vector must have a single operand before " + "replicating by VF"); + VPI->setOperand(0, LaneDefs[0]); + for (VPValue *LaneDef : drop_begin(LaneDefs)) + VPI->addOperand(LaneDef); + } + ToRemove.push_back(RepR); } } + for (auto *R : reverse(ToRemove)) + R->eraseFromParent(); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp index 14f20c6..b2230c4 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp @@ -24,6 +24,11 @@ bool vputils::onlyFirstPartUsed(const VPValue *Def) { [Def](const VPUser *U) { return U->onlyFirstPartUsed(Def); }); } +bool vputils::onlyScalarValuesUsed(const VPValue *Def) { + return all_of(Def->users(), + [Def](const VPUser *U) { return U->usesScalars(Def); }); +} + VPValue *vputils::getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr, ScalarEvolution &SE) { if (auto *Expanded = Plan.getSCEVExpansion(Expr)) diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h index 8dcd57f..3cf02b6 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h @@ -25,6 +25,9 @@ bool onlyFirstLaneUsed(const VPValue *Def); /// Returns true if only the first part of \p Def is used. bool onlyFirstPartUsed(const VPValue *Def); +/// Returns true if only scalar values of \p Def are used by all users. +bool onlyScalarValuesUsed(const VPValue *Def); + /// Get or create a VPValue that corresponds to the expansion of \p Expr. If \p /// Expr is a SCEVConstant or SCEVUnknown, return a VPValue wrapping the live-in /// value. Otherwise return a VPExpandSCEVRecipe to expand \p Expr. If \p Plan's diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index 3417e1c..e25ffe1 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -183,6 +183,7 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const { case Instruction::ZExt: case Instruction::Mul: case Instruction::FMul: + case VPInstruction::Broadcast: // Opcodes above can only use EVL after wide inductions have been // expanded. 
if (!VerifyLate) { @@ -250,17 +251,15 @@ bool VPlanVerifier::verifyVPBasicBlock(const VPBasicBlock *VPBB) { for (const VPUser *U : V->users()) { auto *UI = cast<VPRecipeBase>(U); if (auto *Phi = dyn_cast<VPPhiAccessors>(UI)) { - for (unsigned Idx = 0; Idx != Phi->getNumIncoming(); ++Idx) { - VPValue *IncomingVPV = Phi->getIncomingValue(Idx); + for (const auto &[IncomingVPV, IncomingVPBB] : + Phi->incoming_values_and_blocks()) { if (IncomingVPV != V) continue; - const VPBasicBlock *IncomingVPBB = Phi->getIncomingBlock(Idx); if (VPDT.dominates(VPBB, IncomingVPBB)) continue; - errs() << "Incoming def at index " << Idx - << " does not dominate incoming block!\n"; + errs() << "Incoming def does not dominate incoming block!\n"; #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) VPSlotTracker Tracker(VPBB->getPlan()); IncomingVPV->getDefiningRecipe()->print(errs(), " ", Tracker); diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 6345b18..1275d53 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -16,6 +16,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/ScopeExit.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/BasicAliasAnalysis.h" @@ -29,11 +30,13 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include <numeric> +#include <optional> #include <queue> #include <set> @@ -74,7 +77,7 @@ public: const DataLayout *DL, TTI::TargetCostKind CostKind, bool TryEarlyFoldsOnly) : F(F), Builder(F.getContext(), InstSimplifyFolder(*DL)), TTI(TTI), - DT(DT), AA(AA), AC(AC), DL(DL), CostKind(CostKind), + DT(DT), AA(AA), AC(AC), DL(DL), CostKind(CostKind), SQ(*DL), TryEarlyFoldsOnly(TryEarlyFoldsOnly) {} bool run(); @@ -88,6 +91,7 @@ private: AssumptionCache &AC; const DataLayout *DL; TTI::TargetCostKind CostKind; + const SimplifyQuery SQ; /// If true, only perform beneficial early IR transforms. Do not introduce new /// vector operations. 
@@ -107,10 +111,8 @@ private: const Instruction &I, ExtractElementInst *&ConvertToShuffle, unsigned PreferredExtractIndex); - void foldExtExtCmp(ExtractElementInst *Ext0, ExtractElementInst *Ext1, - Instruction &I); - void foldExtExtBinop(ExtractElementInst *Ext0, ExtractElementInst *Ext1, - Instruction &I); + Value *foldExtExtCmp(Value *V0, Value *V1, Value *ExtIndex, Instruction &I); + Value *foldExtExtBinop(Value *V0, Value *V1, Value *ExtIndex, Instruction &I); bool foldExtractExtract(Instruction &I); bool foldInsExtFNeg(Instruction &I); bool foldInsExtBinop(Instruction &I); @@ -137,8 +139,10 @@ private: bool foldSelectShuffle(Instruction &I, bool FromReduction = false); bool foldInterleaveIntrinsics(Instruction &I); bool shrinkType(Instruction &I); + bool shrinkLoadForShuffles(Instruction &I); + bool shrinkPhiOfShuffles(Instruction &I); - void replaceValue(Value &Old, Value &New) { + void replaceValue(Instruction &Old, Value &New, bool Erase = true) { LLVM_DEBUG(dbgs() << "VC: Replacing: " << Old << '\n'); LLVM_DEBUG(dbgs() << " With: " << New << '\n'); Old.replaceAllUsesWith(&New); @@ -147,7 +151,11 @@ private: Worklist.pushUsersToWorkList(*NewI); Worklist.pushValue(NewI); } - Worklist.pushValue(&Old); + if (Erase && isInstructionTriviallyDead(&Old)) { + eraseInstruction(Old); + } else { + Worklist.push(&Old); + } } void eraseInstruction(Instruction &I) { @@ -158,11 +166,23 @@ private: // Push remaining users of the operands and then the operand itself - allows // further folds that were hindered by OneUse limits. - for (Value *Op : Ops) - if (auto *OpI = dyn_cast<Instruction>(Op)) { - Worklist.pushUsersToWorkList(*OpI); - Worklist.pushValue(OpI); + SmallPtrSet<Value *, 4> Visited; + for (Value *Op : Ops) { + if (Visited.insert(Op).second) { + if (auto *OpI = dyn_cast<Instruction>(Op)) { + if (RecursivelyDeleteTriviallyDeadInstructions( + OpI, nullptr, nullptr, [this](Value *V) { + if (auto I = dyn_cast<Instruction>(V)) { + LLVM_DEBUG(dbgs() << "VC: Erased: " << *I << '\n'); + Worklist.remove(I); + } + })) + continue; + Worklist.pushUsersToWorkList(*OpI); + Worklist.pushValue(OpI); + } } + } } }; } // namespace @@ -546,9 +566,8 @@ static Value *createShiftShuffle(Value *Vec, unsigned OldIndex, /// the source vector (shift the scalar element) to a NewIndex for extraction. /// Return null if the input can be constant folded, so that we are not creating /// unnecessary instructions. -static ExtractElementInst *translateExtract(ExtractElementInst *ExtElt, - unsigned NewIndex, - IRBuilderBase &Builder) { +static Value *translateExtract(ExtractElementInst *ExtElt, unsigned NewIndex, + IRBuilderBase &Builder) { // Shufflevectors can only be created for fixed-width vectors. Value *X = ExtElt->getVectorOperand(); if (!isa<FixedVectorType>(X->getType())) @@ -563,52 +582,43 @@ static ExtractElementInst *translateExtract(ExtractElementInst *ExtElt, Value *Shuf = createShiftShuffle(X, cast<ConstantInt>(C)->getZExtValue(), NewIndex, Builder); - return dyn_cast<ExtractElementInst>( - Builder.CreateExtractElement(Shuf, NewIndex)); + return Shuf; } /// Try to reduce extract element costs by converting scalar compares to vector /// compares followed by extract. 
-/// cmp (ext0 V0, C), (ext1 V1, C) -void VectorCombine::foldExtExtCmp(ExtractElementInst *Ext0, - ExtractElementInst *Ext1, Instruction &I) { +/// cmp (ext0 V0, ExtIndex), (ext1 V1, ExtIndex) +Value *VectorCombine::foldExtExtCmp(Value *V0, Value *V1, Value *ExtIndex, + Instruction &I) { assert(isa<CmpInst>(&I) && "Expected a compare"); - assert(cast<ConstantInt>(Ext0->getIndexOperand())->getZExtValue() == - cast<ConstantInt>(Ext1->getIndexOperand())->getZExtValue() && - "Expected matching constant extract indexes"); - // cmp Pred (extelt V0, C), (extelt V1, C) --> extelt (cmp Pred V0, V1), C + // cmp Pred (extelt V0, ExtIndex), (extelt V1, ExtIndex) + // --> extelt (cmp Pred V0, V1), ExtIndex ++NumVecCmp; CmpInst::Predicate Pred = cast<CmpInst>(&I)->getPredicate(); - Value *V0 = Ext0->getVectorOperand(), *V1 = Ext1->getVectorOperand(); Value *VecCmp = Builder.CreateCmp(Pred, V0, V1); - Value *NewExt = Builder.CreateExtractElement(VecCmp, Ext0->getIndexOperand()); - replaceValue(I, *NewExt); + return Builder.CreateExtractElement(VecCmp, ExtIndex, "foldExtExtCmp"); } /// Try to reduce extract element costs by converting scalar binops to vector /// binops followed by extract. -/// bo (ext0 V0, C), (ext1 V1, C) -void VectorCombine::foldExtExtBinop(ExtractElementInst *Ext0, - ExtractElementInst *Ext1, Instruction &I) { +/// bo (ext0 V0, ExtIndex), (ext1 V1, ExtIndex) +Value *VectorCombine::foldExtExtBinop(Value *V0, Value *V1, Value *ExtIndex, + Instruction &I) { assert(isa<BinaryOperator>(&I) && "Expected a binary operator"); - assert(cast<ConstantInt>(Ext0->getIndexOperand())->getZExtValue() == - cast<ConstantInt>(Ext1->getIndexOperand())->getZExtValue() && - "Expected matching constant extract indexes"); - // bo (extelt V0, C), (extelt V1, C) --> extelt (bo V0, V1), C + // bo (extelt V0, ExtIndex), (extelt V1, ExtIndex) + // --> extelt (bo V0, V1), ExtIndex ++NumVecBO; - Value *V0 = Ext0->getVectorOperand(), *V1 = Ext1->getVectorOperand(); - Value *VecBO = - Builder.CreateBinOp(cast<BinaryOperator>(&I)->getOpcode(), V0, V1); + Value *VecBO = Builder.CreateBinOp(cast<BinaryOperator>(&I)->getOpcode(), V0, + V1, "foldExtExtBinop"); // All IR flags are safe to back-propagate because any potential poison // created in unused vector elements is discarded by the extract. if (auto *VecBOInst = dyn_cast<Instruction>(VecBO)) VecBOInst->copyIRFlags(&I); - Value *NewExt = Builder.CreateExtractElement(VecBO, Ext0->getIndexOperand()); - replaceValue(I, *NewExt); + return Builder.CreateExtractElement(VecBO, ExtIndex, "foldExtExtBinop"); } /// Match an instruction with extracted vector operands. @@ -647,25 +657,29 @@ bool VectorCombine::foldExtractExtract(Instruction &I) { if (isExtractExtractCheap(Ext0, Ext1, I, ExtractToChange, InsertIndex)) return false; + Value *ExtOp0 = Ext0->getVectorOperand(); + Value *ExtOp1 = Ext1->getVectorOperand(); + if (ExtractToChange) { unsigned CheapExtractIdx = ExtractToChange == Ext0 ? C1 : C0; - ExtractElementInst *NewExtract = + Value *NewExtOp = translateExtract(ExtractToChange, CheapExtractIdx, Builder); - if (!NewExtract) + if (!NewExtOp) return false; if (ExtractToChange == Ext0) - Ext0 = NewExtract; + ExtOp0 = NewExtOp; else - Ext1 = NewExtract; + ExtOp1 = NewExtOp; } - if (Pred != CmpInst::BAD_ICMP_PREDICATE) - foldExtExtCmp(Ext0, Ext1, I); - else - foldExtExtBinop(Ext0, Ext1, I); - + Value *ExtIndex = ExtractToChange == Ext0 ? Ext1->getIndexOperand() + : Ext0->getIndexOperand(); + Value *NewExt = Pred != CmpInst::BAD_ICMP_PREDICATE + ? 
foldExtExtCmp(ExtOp0, ExtOp1, ExtIndex, I) + : foldExtExtBinop(ExtOp0, ExtOp1, ExtIndex, I); Worklist.push(Ext0); Worklist.push(Ext1); + replaceValue(I, *NewExt); return true; } @@ -1232,17 +1246,18 @@ bool VectorCombine::scalarizeOpOrCmp(Instruction &I) { // Fold the vector constants in the original vectors into a new base vector to // get more accurate cost modelling. Value *NewVecC = nullptr; - TargetFolder Folder(*DL); if (CI) - NewVecC = Folder.FoldCmp(CI->getPredicate(), VecCs[0], VecCs[1]); + NewVecC = simplifyCmpInst(CI->getPredicate(), VecCs[0], VecCs[1], SQ); else if (UO) NewVecC = - Folder.FoldUnOpFMF(UO->getOpcode(), VecCs[0], UO->getFastMathFlags()); + simplifyUnOp(UO->getOpcode(), VecCs[0], UO->getFastMathFlags(), SQ); else if (BO) - NewVecC = Folder.FoldBinOp(BO->getOpcode(), VecCs[0], VecCs[1]); - else if (II->arg_size() == 2) - NewVecC = Folder.FoldBinaryIntrinsic(II->getIntrinsicID(), VecCs[0], - VecCs[1], II->getType(), &I); + NewVecC = simplifyBinOp(BO->getOpcode(), VecCs[0], VecCs[1], SQ); + else if (II) + NewVecC = simplifyCall(II, II->getCalledOperand(), VecCs, SQ); + + if (!NewVecC) + return false; // Get cost estimate for the insert element. This cost will factor into // both sequences. @@ -1250,6 +1265,7 @@ bool VectorCombine::scalarizeOpOrCmp(Instruction &I) { InstructionCost NewCost = ScalarOpCost + TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, *Index, NewVecC); + for (auto [Idx, Op, VecC, Scalar] : enumerate(Ops, VecCs, ScalarOps)) { if (!Scalar || (II && isVectorIntrinsicWithScalarOpAtArg( II->getIntrinsicID(), Idx, &TTI))) @@ -1294,15 +1310,6 @@ bool VectorCombine::scalarizeOpOrCmp(Instruction &I) { if (auto *ScalarInst = dyn_cast<Instruction>(Scalar)) ScalarInst->copyIRFlags(&I); - // Create a new base vector if the constant folding failed. - if (!NewVecC) { - if (CI) - NewVecC = Builder.CreateCmp(CI->getPredicate(), VecCs[0], VecCs[1]); - else if (UO || BO) - NewVecC = Builder.CreateNAryOp(Opcode, VecCs); - else - NewVecC = Builder.CreateIntrinsic(VecTy, II->getIntrinsicID(), VecCs); - } Value *Insert = Builder.CreateInsertElement(NewVecC, Scalar, *Index); replaceValue(I, *Insert); return true; @@ -1790,7 +1797,8 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) { ScalarizedCost += TTI.getMemoryOpCost(Instruction::Load, VecTy->getElementType(), Align(1), LI->getPointerAddressSpace(), CostKind); - ScalarizedCost += TTI.getAddressComputationCost(VecTy->getElementType()); + ScalarizedCost += TTI.getAddressComputationCost(LI->getPointerOperandType(), + nullptr, nullptr, CostKind); } LLVM_DEBUG(dbgs() << "Found all extractions of a vector load: " << I @@ -1804,6 +1812,8 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) { // erased in the correct order. Worklist.push(LI); + Type *ElemType = VecTy->getElementType(); + // Replace extracts with narrow scalar loads. 
for (User *U : LI->users()) { auto *EI = cast<ExtractElementInst>(U); @@ -1817,14 +1827,20 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) { Builder.SetInsertPoint(EI); Value *GEP = Builder.CreateInBoundsGEP(VecTy, Ptr, {Builder.getInt32(0), Idx}); - auto *NewLoad = cast<LoadInst>(Builder.CreateLoad( - VecTy->getElementType(), GEP, EI->getName() + ".scalar")); + auto *NewLoad = cast<LoadInst>( + Builder.CreateLoad(ElemType, GEP, EI->getName() + ".scalar")); - Align ScalarOpAlignment = computeAlignmentAfterScalarization( - LI->getAlign(), VecTy->getElementType(), Idx, *DL); + Align ScalarOpAlignment = + computeAlignmentAfterScalarization(LI->getAlign(), ElemType, Idx, *DL); NewLoad->setAlignment(ScalarOpAlignment); - replaceValue(*EI, *NewLoad); + if (auto *ConstIdx = dyn_cast<ConstantInt>(Idx)) { + size_t Offset = ConstIdx->getZExtValue() * DL->getTypeStoreSize(ElemType); + AAMDNodes OldAAMD = LI->getAAMetadata(); + NewLoad->setAAMetadata(OldAAMD.adjustForAccess(Offset, ElemType, *DL)); + } + + replaceValue(*EI, *NewLoad, false); } FailureGuard.release(); @@ -1856,15 +1872,15 @@ bool VectorCombine::scalarizeExtExtract(Instruction &I) { unsigned ExtCnt = 0; bool ExtLane0 = false; for (User *U : Ext->users()) { - const APInt *Idx; - if (!match(U, m_ExtractElt(m_Value(), m_APInt(Idx)))) + uint64_t Idx; + if (!match(U, m_ExtractElt(m_Value(), m_ConstantInt(Idx)))) return false; if (cast<Instruction>(U)->use_empty()) continue; ExtCnt += 1; - ExtLane0 |= Idx->isZero(); + ExtLane0 |= !Idx; VectorCost += TTI.getVectorInstrCost(Instruction::ExtractElement, DstTy, - CostKind, Idx->getZExtValue(), U); + CostKind, Idx, U); } InstructionCost ScalarCost = @@ -2910,7 +2926,7 @@ bool VectorCombine::foldShuffleToIdentity(Instruction &I) { if (!IL.first) return true; Value *V = IL.first->get(); - if (auto *I = dyn_cast<Instruction>(V); I && !I->hasOneUse()) + if (auto *I = dyn_cast<Instruction>(V); I && !I->hasOneUser()) return false; if (V->getValueID() != FrontV->getValueID()) return false; @@ -3112,7 +3128,7 @@ bool VectorCombine::foldShuffleFromReductions(Instruction &I) { Shuffle->getOperand(0), Shuffle->getOperand(1), ConcatMask); LLVM_DEBUG(dbgs() << "Created new shuffle: " << *NewShuffle << "\n"); replaceValue(*Shuffle, *NewShuffle); - MadeChanges = true; + return true; } // See if we can re-use foldSelectShuffle, getting it to reduce the size of @@ -3608,7 +3624,7 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { for (int S = 0, E = ReconstructMasks.size(); S != E; S++) { Builder.SetInsertPoint(Shuffles[S]); Value *NSV = Builder.CreateShuffleVector(NOp0, NOp1, ReconstructMasks[S]); - replaceValue(*Shuffles[S], *NSV); + replaceValue(*Shuffles[S], *NSV, false); } Worklist.pushValue(NSV0A); @@ -3861,6 +3877,228 @@ bool VectorCombine::foldInterleaveIntrinsics(Instruction &I) { return true; } +// Attempt to shrink loads that are only used by shufflevector instructions. +bool VectorCombine::shrinkLoadForShuffles(Instruction &I) { + auto *OldLoad = dyn_cast<LoadInst>(&I); + if (!OldLoad || !OldLoad->isSimple()) + return false; + + auto *OldLoadTy = dyn_cast<FixedVectorType>(OldLoad->getType()); + if (!OldLoadTy) + return false; + + unsigned const OldNumElements = OldLoadTy->getNumElements(); + + // Search all uses of load. If all uses are shufflevector instructions, and + // the second operands are all poison values, find the minimum and maximum + // indices of the vector elements referenced by all shuffle masks. + // Otherwise return `std::nullopt`. 
+ using IndexRange = std::pair<int, int>; + auto GetIndexRangeInShuffles = [&]() -> std::optional<IndexRange> { + IndexRange OutputRange = IndexRange(OldNumElements, -1); + for (llvm::Use &Use : I.uses()) { + // Ensure all uses match the required pattern. + User *Shuffle = Use.getUser(); + ArrayRef<int> Mask; + + if (!match(Shuffle, + m_Shuffle(m_Specific(OldLoad), m_Undef(), m_Mask(Mask)))) + return std::nullopt; + + // Ignore shufflevector instructions that have no uses. + if (Shuffle->use_empty()) + continue; + + // Find the min and max indices used by the shufflevector instruction. + for (int Index : Mask) { + if (Index >= 0 && Index < static_cast<int>(OldNumElements)) { + OutputRange.first = std::min(Index, OutputRange.first); + OutputRange.second = std::max(Index, OutputRange.second); + } + } + } + + if (OutputRange.second < OutputRange.first) + return std::nullopt; + + return OutputRange; + }; + + // Get the range of vector elements used by shufflevector instructions. + if (std::optional<IndexRange> Indices = GetIndexRangeInShuffles()) { + unsigned const NewNumElements = Indices->second + 1u; + + // If the range of vector elements is smaller than the full load, attempt + // to create a smaller load. + if (NewNumElements < OldNumElements) { + IRBuilder Builder(&I); + Builder.SetCurrentDebugLocation(I.getDebugLoc()); + + // Calculate costs of old and new ops. + Type *ElemTy = OldLoadTy->getElementType(); + FixedVectorType *NewLoadTy = FixedVectorType::get(ElemTy, NewNumElements); + Value *PtrOp = OldLoad->getPointerOperand(); + + InstructionCost OldCost = TTI.getMemoryOpCost( + Instruction::Load, OldLoad->getType(), OldLoad->getAlign(), + OldLoad->getPointerAddressSpace(), CostKind); + InstructionCost NewCost = + TTI.getMemoryOpCost(Instruction::Load, NewLoadTy, OldLoad->getAlign(), + OldLoad->getPointerAddressSpace(), CostKind); + + using UseEntry = std::pair<ShuffleVectorInst *, std::vector<int>>; + SmallVector<UseEntry, 4u> NewUses; + unsigned const MaxIndex = NewNumElements * 2u; + + for (llvm::Use &Use : I.uses()) { + auto *Shuffle = cast<ShuffleVectorInst>(Use.getUser()); + ArrayRef<int> OldMask = Shuffle->getShuffleMask(); + + // Create entry for new use. + NewUses.push_back({Shuffle, OldMask}); + + // Validate mask indices. + for (int Index : OldMask) { + if (Index >= static_cast<int>(MaxIndex)) + return false; + } + + // Update costs. + OldCost += + TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Shuffle->getType(), + OldLoadTy, OldMask, CostKind); + NewCost += + TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Shuffle->getType(), + NewLoadTy, OldMask, CostKind); + } + + LLVM_DEBUG( + dbgs() << "Found a load used only by shufflevector instructions: " + << I << "\n OldCost: " << OldCost + << " vs NewCost: " << NewCost << "\n"); + + if (OldCost < NewCost || !NewCost.isValid()) + return false; + + // Create new load of smaller vector. + auto *NewLoad = cast<LoadInst>( + Builder.CreateAlignedLoad(NewLoadTy, PtrOp, OldLoad->getAlign())); + NewLoad->copyMetadata(I); + + // Replace all uses. 
+ for (UseEntry &Use : NewUses) { + ShuffleVectorInst *Shuffle = Use.first; + std::vector<int> &NewMask = Use.second; + + Builder.SetInsertPoint(Shuffle); + Builder.SetCurrentDebugLocation(Shuffle->getDebugLoc()); + Value *NewShuffle = Builder.CreateShuffleVector( + NewLoad, PoisonValue::get(NewLoadTy), NewMask); + + replaceValue(*Shuffle, *NewShuffle, false); + } + + return true; + } + } + return false; +} + +// Attempt to narrow a phi of shufflevector instructions where the two incoming +// values have the same operands but different masks. If the two shuffle masks +// are offsets of one another we can use one branch to rotate the incoming +// vector and perform one larger shuffle after the phi. +bool VectorCombine::shrinkPhiOfShuffles(Instruction &I) { + auto *Phi = dyn_cast<PHINode>(&I); + if (!Phi || Phi->getNumIncomingValues() != 2u) + return false; + + Value *Op = nullptr; + ArrayRef<int> Mask0; + ArrayRef<int> Mask1; + + if (!match(Phi->getOperand(0u), + m_OneUse(m_Shuffle(m_Value(Op), m_Poison(), m_Mask(Mask0)))) || + !match(Phi->getOperand(1u), + m_OneUse(m_Shuffle(m_Specific(Op), m_Poison(), m_Mask(Mask1))))) + return false; + + auto *Shuf = cast<ShuffleVectorInst>(Phi->getOperand(0u)); + + // Ensure result vectors are wider than the argument vector. + auto *InputVT = cast<FixedVectorType>(Op->getType()); + auto *ResultVT = cast<FixedVectorType>(Shuf->getType()); + auto const InputNumElements = InputVT->getNumElements(); + + if (InputNumElements >= ResultVT->getNumElements()) + return false; + + // Take the difference of the two shuffle masks at each index. Ignore poison + // values at the same index in both masks. + SmallVector<int, 16> NewMask; + NewMask.reserve(Mask0.size()); + + for (auto [M0, M1] : zip(Mask0, Mask1)) { + if (M0 >= 0 && M1 >= 0) + NewMask.push_back(M0 - M1); + else if (M0 == -1 && M1 == -1) + continue; + else + return false; + } + + // Ensure all elements of the new mask are equal. If the difference between + // the incoming mask elements is the same, the two must be constant offsets + // of one another. + if (NewMask.empty() || !all_equal(NewMask)) + return false; + + // Create new mask using difference of the two incoming masks. + int MaskOffset = NewMask[0u]; + unsigned Index = (InputNumElements - MaskOffset) % InputNumElements; + NewMask.clear(); + + for (unsigned I = 0u; I < InputNumElements; ++I) { + NewMask.push_back(Index); + Index = (Index + 1u) % InputNumElements; + } + + // Calculate costs for worst cases and compare. + auto const Kind = TTI::SK_PermuteSingleSrc; + auto OldCost = + std::max(TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask0, CostKind), + TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask1, CostKind)); + auto NewCost = TTI.getShuffleCost(Kind, InputVT, InputVT, NewMask, CostKind) + + TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask1, CostKind); + + LLVM_DEBUG(dbgs() << "Found a phi of mergeable shuffles: " << I + << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost + << "\n"); + + if (NewCost > OldCost) + return false; + + // Create new shuffles and narrowed phi. 
+ auto Builder = IRBuilder(Shuf); + Builder.SetCurrentDebugLocation(Shuf->getDebugLoc()); + auto *PoisonVal = PoisonValue::get(InputVT); + auto *NewShuf0 = Builder.CreateShuffleVector(Op, PoisonVal, NewMask); + Worklist.push(cast<Instruction>(NewShuf0)); + + Builder.SetInsertPoint(Phi); + Builder.SetCurrentDebugLocation(Phi->getDebugLoc()); + auto *NewPhi = Builder.CreatePHI(NewShuf0->getType(), 2u); + NewPhi->addIncoming(NewShuf0, Phi->getIncomingBlock(0u)); + NewPhi->addIncoming(Op, Phi->getIncomingBlock(1u)); + + Builder.SetInsertPoint(*NewPhi->getInsertionPointAfterDef()); + PoisonVal = PoisonValue::get(NewPhi->getType()); + auto *NewShuf1 = Builder.CreateShuffleVector(NewPhi, PoisonVal, Mask1); + + replaceValue(*Phi, *NewShuf1); + return true; +} + /// This is the entry point for all transforms. Pass manager differences are /// handled in the callers of this function. bool VectorCombine::run() { @@ -3873,8 +4111,7 @@ bool VectorCombine::run() { LLVM_DEBUG(dbgs() << "\n\nVECTORCOMBINE on " << F.getName() << "\n"); - bool MadeChange = false; - auto FoldInst = [this, &MadeChange](Instruction &I) { + auto FoldInst = [this](Instruction &I) { Builder.SetInsertPoint(&I); bool IsVectorType = isa<VectorType>(I.getType()); bool IsFixedVectorType = isa<FixedVectorType>(I.getType()); @@ -3889,10 +4126,12 @@ bool VectorCombine::run() { if (IsFixedVectorType) { switch (Opcode) { case Instruction::InsertElement: - MadeChange |= vectorizeLoadInsert(I); + if (vectorizeLoadInsert(I)) + return true; break; case Instruction::ShuffleVector: - MadeChange |= widenSubvectorLoad(I); + if (widenSubvectorLoad(I)) + return true; break; default: break; @@ -3902,19 +4141,25 @@ bool VectorCombine::run() { // This transform works with scalable and fixed vectors // TODO: Identify and allow other scalable transforms if (IsVectorType) { - MadeChange |= scalarizeOpOrCmp(I); - MadeChange |= scalarizeLoadExtract(I); - MadeChange |= scalarizeExtExtract(I); - MadeChange |= scalarizeVPIntrinsic(I); - MadeChange |= foldInterleaveIntrinsics(I); + if (scalarizeOpOrCmp(I)) + return true; + if (scalarizeLoadExtract(I)) + return true; + if (scalarizeExtExtract(I)) + return true; + if (scalarizeVPIntrinsic(I)) + return true; + if (foldInterleaveIntrinsics(I)) + return true; } if (Opcode == Instruction::Store) - MadeChange |= foldSingleElementStore(I); + if (foldSingleElementStore(I)) + return true; // If this is an early pipeline invocation of this pass, we are done. if (TryEarlyFoldsOnly) - return; + return false; // Otherwise, try folds that improve codegen but may interfere with // early IR canonicalizations. 
@@ -3923,56 +4168,87 @@ bool VectorCombine::run() { if (IsFixedVectorType) { switch (Opcode) { case Instruction::InsertElement: - MadeChange |= foldInsExtFNeg(I); - MadeChange |= foldInsExtBinop(I); - MadeChange |= foldInsExtVectorToShuffle(I); + if (foldInsExtFNeg(I)) + return true; + if (foldInsExtBinop(I)) + return true; + if (foldInsExtVectorToShuffle(I)) + return true; break; case Instruction::ShuffleVector: - MadeChange |= foldPermuteOfBinops(I); - MadeChange |= foldShuffleOfBinops(I); - MadeChange |= foldShuffleOfSelects(I); - MadeChange |= foldShuffleOfCastops(I); - MadeChange |= foldShuffleOfShuffles(I); - MadeChange |= foldShuffleOfIntrinsics(I); - MadeChange |= foldSelectShuffle(I); - MadeChange |= foldShuffleToIdentity(I); + if (foldPermuteOfBinops(I)) + return true; + if (foldShuffleOfBinops(I)) + return true; + if (foldShuffleOfSelects(I)) + return true; + if (foldShuffleOfCastops(I)) + return true; + if (foldShuffleOfShuffles(I)) + return true; + if (foldShuffleOfIntrinsics(I)) + return true; + if (foldSelectShuffle(I)) + return true; + if (foldShuffleToIdentity(I)) + return true; + break; + case Instruction::Load: + if (shrinkLoadForShuffles(I)) + return true; break; case Instruction::BitCast: - MadeChange |= foldBitcastShuffle(I); + if (foldBitcastShuffle(I)) + return true; break; case Instruction::And: case Instruction::Or: case Instruction::Xor: - MadeChange |= foldBitOpOfCastops(I); + if (foldBitOpOfCastops(I)) + return true; + break; + case Instruction::PHI: + if (shrinkPhiOfShuffles(I)) + return true; break; default: - MadeChange |= shrinkType(I); + if (shrinkType(I)) + return true; break; } } else { switch (Opcode) { case Instruction::Call: - MadeChange |= foldShuffleFromReductions(I); - MadeChange |= foldCastFromReductions(I); + if (foldShuffleFromReductions(I)) + return true; + if (foldCastFromReductions(I)) + return true; break; case Instruction::ICmp: case Instruction::FCmp: - MadeChange |= foldExtractExtract(I); + if (foldExtractExtract(I)) + return true; break; case Instruction::Or: - MadeChange |= foldConcatOfBoolMasks(I); + if (foldConcatOfBoolMasks(I)) + return true; [[fallthrough]]; default: if (Instruction::isBinaryOp(Opcode)) { - MadeChange |= foldExtractExtract(I); - MadeChange |= foldExtractedCmps(I); - MadeChange |= foldBinopOfReductions(I); + if (foldExtractExtract(I)) + return true; + if (foldExtractedCmps(I)) + return true; + if (foldBinopOfReductions(I)) + return true; } break; } } + return false; }; + bool MadeChange = false; for (BasicBlock &BB : F) { // Ignore unreachable basic blocks. if (!DT.isReachableFromEntry(&BB)) @@ -3981,7 +4257,7 @@ bool VectorCombine::run() { for (Instruction &I : make_early_inc_range(BB)) { if (I.isDebugOrPseudoInst()) continue; - FoldInst(I); + MadeChange |= FoldInst(I); } } @@ -3995,7 +4271,7 @@ bool VectorCombine::run() { continue; } - FoldInst(*I); + MadeChange |= FoldInst(*I); } return MadeChange; |
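The shrinkLoadForShuffles hunk above narrows a vector load to the highest lane that its shufflevector users actually read. A minimal standalone sketch of that index-range computation, written in plain C++ rather than LLVM's own types (the name shrunkenElementCount and its signature are illustrative, not part of the patch):

#include <algorithm>
#include <vector>

// Given the masks of every shufflevector user of a load with OldNumElements
// lanes (-1 models a poison lane; entries >= OldNumElements refer to the
// undef/poison second shuffle operand and are ignored, as in the pass),
// return the element count the load could be shrunk to. Returns
// OldNumElements when no shrinking is possible.
unsigned shrunkenElementCount(const std::vector<std::vector<int>> &Masks,
                              unsigned OldNumElements) {
  int MaxIndex = -1;
  for (const std::vector<int> &Mask : Masks)
    for (int Index : Mask)
      if (Index >= 0 && Index < static_cast<int>(OldNumElements))
        MaxIndex = std::max(MaxIndex, Index);
  if (MaxIndex < 0)
    return OldNumElements; // No lane of the load is used at all.
  unsigned NewNumElements = static_cast<unsigned>(MaxIndex) + 1u;
  return NewNumElements < OldNumElements ? NewNumElements : OldNumElements;
}

For example, masks {0,2,1,3} and {3,3} over an 8-lane load give 4, so the pass can emit a 4-lane load and rewrite the shuffles against it, provided the TTI cost comparison in the hunk favors the narrower load.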
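The shrinkPhiOfShuffles hunk above only fires when the two incoming shuffle masks differ by a single constant offset. A standalone sketch of that offset check, again in plain C++ (maskOffset is an illustrative name; it assumes both masks index the same single operand, which the pass enforces by matching a poison second shuffle operand):

#include <cassert>
#include <cstddef>
#include <optional>
#include <vector>

// Returns the constant offset D such that Mask0[i] == Mask1[i] + D for every
// lane that is non-poison in both masks, or std::nullopt if no such offset
// exists. -1 models a poison mask element.
std::optional<int> maskOffset(const std::vector<int> &Mask0,
                              const std::vector<int> &Mask1) {
  assert(Mask0.size() == Mask1.size() && "phi operands have the same type");
  std::optional<int> Offset;
  for (std::size_t I = 0; I != Mask0.size(); ++I) {
    int M0 = Mask0[I], M1 = Mask1[I];
    if (M0 == -1 && M1 == -1)
      continue; // Poison in both masks: this lane constrains nothing.
    if (M0 < 0 || M1 < 0)
      return std::nullopt; // Poison in only one mask: give up.
    if (Offset && *Offset != M0 - M1)
      return std::nullopt; // Every lane must agree on the same offset.
    Offset = M0 - M1;
  }
  return Offset; // std::nullopt if both masks were entirely poison.
}

When such an offset exists, the pass replaces the first incoming value with a cyclic rotation of the common input vector, keeps the raw input on the second edge, and applies Mask1 once after the narrowed phi, so only one result-width shuffle remains.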
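The scalarizeLoadExtract hunk above also narrows the AA metadata (e.g. TBAA) of each new scalar load to the element actually accessed, using a byte offset of index times the element store size; the alignment the scalar load can keep comes from computeAlignmentAfterScalarization, which is assumed here to follow the usual common-alignment rule. A standalone sketch of that arithmetic (plain C++; laneByteOffset and alignAfterScalarization are illustrative names):

#include <cstdint>

// Byte offset of lane LaneIndex within the loaded vector; the same product the
// hunk feeds to adjustForAccess.
uint64_t laneByteOffset(uint64_t LaneIndex, uint64_t ElemStoreSize) {
  return LaneIndex * ElemStoreSize;
}

// Largest power-of-two alignment guaranteed by both the original vector
// alignment and the lane's byte offset (VectorAlign must be a power of two).
uint64_t alignAfterScalarization(uint64_t VectorAlign, uint64_t ByteOffset) {
  uint64_t A = VectorAlign;
  while (A > 1 && ByteOffset % A != 0)
    A /= 2;
  return A;
}

For instance, a <4 x float> load aligned to 16 bytes: lane 3 sits at byte offset 12, so the scalar float load emitted for it is only known to be 4-byte aligned.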
