Diffstat (limited to 'llvm/lib')
75 files changed, 1032 insertions, 531 deletions
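The bulk of the IR2Vec.cpp changes below replace the flat vocabulary vector with a section-based VocabStorage (Opcodes, CanonicalTypes, Operands, Predicates) and add compare-predicate embeddings. As a minimal, standalone sketch of the resulting index math, using made-up section sizes and toy predicate ranges rather than the real CmpInst values, the following mirrors how a predicate's local index (FCMP block first, then ICMP block) is combined with the per-section base offsets:

// Illustrative sketch only: the constants below are stand-ins, not the real
// LLVM opcode/type/predicate counts.
#include <cassert>
#include <cstdio>

namespace toy {
// Hypothetical section sizes, mirroring the four-section layout
// (Opcodes, CanonicalTypes, Operands, Predicates) used by VocabStorage.
constexpr unsigned MaxOpcodes = 68;
constexpr unsigned MaxCanonicalTypeIDs = 13;
constexpr unsigned MaxOperandKinds = 4;

// Toy ranges standing in for CmpInst::Predicate; FCMP predicates come first.
constexpr unsigned FirstFCmp = 0, LastFCmp = 15;  // 16 FCMP predicates
constexpr unsigned FirstICmp = 32, LastICmp = 41; // 10 ICMP predicates
constexpr unsigned MaxPredicateKinds =
    (LastFCmp - FirstFCmp + 1) + (LastICmp - FirstICmp + 1);

// Flattened base offsets for each section, as used by getStringKey().
constexpr unsigned OperandBaseOffset = MaxOpcodes + MaxCanonicalTypeIDs;
constexpr unsigned PredicateBaseOffset = OperandBaseOffset + MaxOperandKinds;

// FCMP predicates occupy the first local slots, ICMP predicates the rest.
unsigned getPredicateLocalIndex(unsigned P) {
  if (P <= LastFCmp) // FirstFCmp == 0 in this sketch
    return P - FirstFCmp;
  return P - FirstICmp + (LastFCmp - FirstFCmp + 1);
}

// Global (flattened) position of a predicate in the vocabulary.
unsigned getPredicateSlot(unsigned P) {
  return PredicateBaseOffset + getPredicateLocalIndex(P);
}
} // namespace toy

int main() {
  using namespace toy;
  // An ICMP predicate lands after all FCMP predicates within the
  // Predicates section, which itself follows the Operands section.
  unsigned Slot = getPredicateSlot(FirstICmp);
  assert(Slot == PredicateBaseOffset + 16);
  std::printf("ICMP base slot = %u (total entries = %u)\n", Slot,
              PredicateBaseOffset + MaxPredicateKinds);
  return 0;
}

In the actual patch, Vocabulary::getStringKey() performs the inverse mapping, walking a flattened position back to a human-readable key using the same OperandBaseOffset and PredicateBaseOffset boundaries.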
diff --git a/llvm/lib/Analysis/CtxProfAnalysis.cpp b/llvm/lib/Analysis/CtxProfAnalysis.cpp index a363bce..c4abec0 100644 --- a/llvm/lib/Analysis/CtxProfAnalysis.cpp +++ b/llvm/lib/Analysis/CtxProfAnalysis.cpp @@ -30,6 +30,9 @@ #define DEBUG_TYPE "ctx_prof" using namespace llvm; + +namespace llvm { + cl::opt<std::string> UseCtxProfile("use-ctx-profile", cl::init(""), cl::Hidden, cl::desc("Use the specified contextual profile file")); @@ -50,7 +53,6 @@ static cl::opt<bool> ForceIsInSpecializedModule( const char *AssignGUIDPass::GUIDMetadataName = "guid"; -namespace llvm { class ProfileAnnotatorImpl final { friend class ProfileAnnotator; class BBInfo; diff --git a/llvm/lib/Analysis/IR2Vec.cpp b/llvm/lib/Analysis/IR2Vec.cpp index 99afc06..af30422 100644 --- a/llvm/lib/Analysis/IR2Vec.cpp +++ b/llvm/lib/Analysis/IR2Vec.cpp @@ -15,6 +15,7 @@ #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/Sequence.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Module.h" @@ -216,6 +217,8 @@ void SymbolicEmbedder::computeEmbeddings(const BasicBlock &BB) const { ArgEmb += Vocab[*Op]; auto InstVector = Vocab[I.getOpcode()] + Vocab[I.getType()->getTypeID()] + ArgEmb; + if (const auto *IC = dyn_cast<CmpInst>(&I)) + InstVector += Vocab[IC->getPredicate()]; InstVecMap[&I] = InstVector; BBVector += InstVector; } @@ -250,6 +253,9 @@ void FlowAwareEmbedder::computeEmbeddings(const BasicBlock &BB) const { // embeddings auto InstVector = Vocab[I.getOpcode()] + Vocab[I.getType()->getTypeID()] + ArgEmb; + // Add compare predicate embedding as an additional operand if applicable + if (const auto *IC = dyn_cast<CmpInst>(&I)) + InstVector += Vocab[IC->getPredicate()]; InstVecMap[&I] = InstVector; BBVector += InstVector; } @@ -257,41 +263,76 @@ void FlowAwareEmbedder::computeEmbeddings(const BasicBlock &BB) const { } // ==----------------------------------------------------------------------===// -// Vocabulary +// VocabStorage //===----------------------------------------------------------------------===// -unsigned Vocabulary::getDimension() const { - assert(isValid() && "IR2Vec Vocabulary is invalid"); - return Vocab[0].size(); -} - -unsigned Vocabulary::getSlotIndex(unsigned Opcode) { - assert(Opcode >= 1 && Opcode <= MaxOpcodes && "Invalid opcode"); - return Opcode - 1; // Convert to zero-based index -} - -unsigned Vocabulary::getSlotIndex(Type::TypeID TypeID) { - assert(static_cast<unsigned>(TypeID) < MaxTypeIDs && "Invalid type ID"); - return MaxOpcodes + static_cast<unsigned>(getCanonicalTypeID(TypeID)); -} - -unsigned Vocabulary::getSlotIndex(const Value &Op) { - unsigned Index = static_cast<unsigned>(getOperandKind(&Op)); - assert(Index < MaxOperandKinds && "Invalid OperandKind"); - return MaxOpcodes + MaxCanonicalTypeIDs + Index; +VocabStorage::VocabStorage(std::vector<std::vector<Embedding>> &&SectionData) + : Sections(std::move(SectionData)), TotalSize([&] { + assert(!Sections.empty() && "Vocabulary has no sections"); + // Compute total size across all sections + size_t Size = 0; + for (const auto &Section : Sections) { + assert(!Section.empty() && "Vocabulary section is empty"); + Size += Section.size(); + } + return Size; + }()), + Dimension([&] { + // Get dimension from the first embedding in the first section - all + // embeddings must have the same dimension + assert(!Sections.empty() && "Vocabulary has no sections"); + assert(!Sections[0].empty() && "First section of vocabulary is empty"); + unsigned ExpectedDim = 
static_cast<unsigned>(Sections[0][0].size()); + + // Verify that all embeddings across all sections have the same + // dimension + [[maybe_unused]] auto allSameDim = + [ExpectedDim](const std::vector<Embedding> &Section) { + return std::all_of(Section.begin(), Section.end(), + [ExpectedDim](const Embedding &Emb) { + return Emb.size() == ExpectedDim; + }); + }; + assert(std::all_of(Sections.begin(), Sections.end(), allSameDim) && + "All embeddings must have the same dimension"); + + return ExpectedDim; + }()) {} + +const Embedding &VocabStorage::const_iterator::operator*() const { + assert(SectionId < Storage->Sections.size() && "Invalid section ID"); + assert(LocalIndex < Storage->Sections[SectionId].size() && + "Local index out of range"); + return Storage->Sections[SectionId][LocalIndex]; +} + +VocabStorage::const_iterator &VocabStorage::const_iterator::operator++() { + ++LocalIndex; + // Check if we need to move to the next section + if (SectionId < Storage->getNumSections() && + LocalIndex >= Storage->Sections[SectionId].size()) { + assert(LocalIndex == Storage->Sections[SectionId].size() && + "Local index should be at the end of the current section"); + LocalIndex = 0; + ++SectionId; + } + return *this; } -const Embedding &Vocabulary::operator[](unsigned Opcode) const { - return Vocab[getSlotIndex(Opcode)]; +bool VocabStorage::const_iterator::operator==( + const const_iterator &Other) const { + return Storage == Other.Storage && SectionId == Other.SectionId && + LocalIndex == Other.LocalIndex; } -const Embedding &Vocabulary::operator[](Type::TypeID TypeID) const { - return Vocab[getSlotIndex(TypeID)]; +bool VocabStorage::const_iterator::operator!=( + const const_iterator &Other) const { + return !(*this == Other); } -const ir2vec::Embedding &Vocabulary::operator[](const Value &Arg) const { - return Vocab[getSlotIndex(Arg)]; -} +// ==----------------------------------------------------------------------===// +// Vocabulary +//===----------------------------------------------------------------------===// StringRef Vocabulary::getVocabKeyForOpcode(unsigned Opcode) { assert(Opcode >= 1 && Opcode <= MaxOpcodes && "Invalid opcode"); @@ -304,29 +345,6 @@ StringRef Vocabulary::getVocabKeyForOpcode(unsigned Opcode) { return "UnknownOpcode"; } -StringRef Vocabulary::getVocabKeyForCanonicalTypeID(CanonicalTypeID CType) { - unsigned Index = static_cast<unsigned>(CType); - assert(Index < MaxCanonicalTypeIDs && "Invalid CanonicalTypeID"); - return CanonicalTypeNames[Index]; -} - -Vocabulary::CanonicalTypeID -Vocabulary::getCanonicalTypeID(Type::TypeID TypeID) { - unsigned Index = static_cast<unsigned>(TypeID); - assert(Index < MaxTypeIDs && "Invalid TypeID"); - return TypeIDMapping[Index]; -} - -StringRef Vocabulary::getVocabKeyForTypeID(Type::TypeID TypeID) { - return getVocabKeyForCanonicalTypeID(getCanonicalTypeID(TypeID)); -} - -StringRef Vocabulary::getVocabKeyForOperandKind(Vocabulary::OperandKind Kind) { - unsigned Index = static_cast<unsigned>(Kind); - assert(Index < MaxOperandKinds && "Invalid OperandKind"); - return OperandKindNames[Index]; -} - // Helper function to classify an operand into OperandKind Vocabulary::OperandKind Vocabulary::getOperandKind(const Value *Op) { if (isa<Function>(Op)) @@ -338,18 +356,50 @@ Vocabulary::OperandKind Vocabulary::getOperandKind(const Value *Op) { return OperandKind::VariableID; } +unsigned Vocabulary::getPredicateLocalIndex(CmpInst::Predicate P) { + if (P >= CmpInst::FIRST_FCMP_PREDICATE && P <= CmpInst::LAST_FCMP_PREDICATE) + return P - 
CmpInst::FIRST_FCMP_PREDICATE; + else + return P - CmpInst::FIRST_ICMP_PREDICATE + + (CmpInst::LAST_FCMP_PREDICATE - CmpInst::FIRST_FCMP_PREDICATE + 1); +} + +CmpInst::Predicate Vocabulary::getPredicateFromLocalIndex(unsigned LocalIndex) { + unsigned fcmpRange = + CmpInst::LAST_FCMP_PREDICATE - CmpInst::FIRST_FCMP_PREDICATE + 1; + if (LocalIndex < fcmpRange) + return static_cast<CmpInst::Predicate>(CmpInst::FIRST_FCMP_PREDICATE + + LocalIndex); + else + return static_cast<CmpInst::Predicate>(CmpInst::FIRST_ICMP_PREDICATE + + LocalIndex - fcmpRange); +} + +StringRef Vocabulary::getVocabKeyForPredicate(CmpInst::Predicate Pred) { + static SmallString<16> PredNameBuffer; + if (Pred < CmpInst::FIRST_ICMP_PREDICATE) + PredNameBuffer = "FCMP_"; + else + PredNameBuffer = "ICMP_"; + PredNameBuffer += CmpInst::getPredicateName(Pred); + return PredNameBuffer; +} + StringRef Vocabulary::getStringKey(unsigned Pos) { assert(Pos < NumCanonicalEntries && "Position out of bounds in vocabulary"); // Opcode if (Pos < MaxOpcodes) return getVocabKeyForOpcode(Pos + 1); // Type - if (Pos < MaxOpcodes + MaxCanonicalTypeIDs) + if (Pos < OperandBaseOffset) return getVocabKeyForCanonicalTypeID( static_cast<CanonicalTypeID>(Pos - MaxOpcodes)); // Operand - return getVocabKeyForOperandKind( - static_cast<OperandKind>(Pos - MaxOpcodes - MaxCanonicalTypeIDs)); + if (Pos < PredicateBaseOffset) + return getVocabKeyForOperandKind( + static_cast<OperandKind>(Pos - OperandBaseOffset)); + // Predicates + return getVocabKeyForPredicate(getPredicate(Pos - PredicateBaseOffset)); } // For now, assume vocabulary is stable unless explicitly invalidated. @@ -359,19 +409,51 @@ bool Vocabulary::invalidate(Module &M, const PreservedAnalyses &PA, return !(PAC.preservedWhenStateless()); } -Vocabulary::VocabVector Vocabulary::createDummyVocabForTest(unsigned Dim) { - VocabVector DummyVocab; - DummyVocab.reserve(NumCanonicalEntries); +VocabStorage Vocabulary::createDummyVocabForTest(unsigned Dim) { float DummyVal = 0.1f; - // Create a dummy vocabulary with entries for all opcodes, types, and - // operands - for ([[maybe_unused]] unsigned _ : - seq(0u, Vocabulary::MaxOpcodes + Vocabulary::MaxCanonicalTypeIDs + - Vocabulary::MaxOperandKinds)) { - DummyVocab.push_back(Embedding(Dim, DummyVal)); + + // Create sections for opcodes, types, operands, and predicates + // Order must match Vocabulary::Section enum + std::vector<std::vector<Embedding>> Sections; + Sections.reserve(4); + + // Opcodes section + std::vector<Embedding> OpcodeSec; + OpcodeSec.reserve(MaxOpcodes); + for (unsigned I = 0; I < MaxOpcodes; ++I) { + OpcodeSec.emplace_back(Dim, DummyVal); DummyVal += 0.1f; } - return DummyVocab; + Sections.push_back(std::move(OpcodeSec)); + + // Types section + std::vector<Embedding> TypeSec; + TypeSec.reserve(MaxCanonicalTypeIDs); + for (unsigned I = 0; I < MaxCanonicalTypeIDs; ++I) { + TypeSec.emplace_back(Dim, DummyVal); + DummyVal += 0.1f; + } + Sections.push_back(std::move(TypeSec)); + + // Operands section + std::vector<Embedding> OperandSec; + OperandSec.reserve(MaxOperandKinds); + for (unsigned I = 0; I < MaxOperandKinds; ++I) { + OperandSec.emplace_back(Dim, DummyVal); + DummyVal += 0.1f; + } + Sections.push_back(std::move(OperandSec)); + + // Predicates section + std::vector<Embedding> PredicateSec; + PredicateSec.reserve(MaxPredicateKinds); + for (unsigned I = 0; I < MaxPredicateKinds; ++I) { + PredicateSec.emplace_back(Dim, DummyVal); + DummyVal += 0.1f; + } + Sections.push_back(std::move(PredicateSec)); + + return 
VocabStorage(std::move(Sections)); } // ==----------------------------------------------------------------------===// @@ -417,7 +499,9 @@ Error IR2VecVocabAnalysis::parseVocabSection( // FIXME: Make this optional. We can avoid file reads // by auto-generating a default vocabulary during the build time. -Error IR2VecVocabAnalysis::readVocabulary() { +Error IR2VecVocabAnalysis::readVocabulary(VocabMap &OpcVocab, + VocabMap &TypeVocab, + VocabMap &ArgVocab) { auto BufOrError = MemoryBuffer::getFileOrSTDIN(VocabFile, /*IsText=*/true); if (!BufOrError) return createFileError(VocabFile, BufOrError.getError()); @@ -448,7 +532,9 @@ Error IR2VecVocabAnalysis::readVocabulary() { return Error::success(); } -void IR2VecVocabAnalysis::generateNumMappedVocab() { +void IR2VecVocabAnalysis::generateVocabStorage(VocabMap &OpcVocab, + VocabMap &TypeVocab, + VocabMap &ArgVocab) { // Helper for handling missing entities in the vocabulary. // Currently, we use a zero vector. In the future, we will throw an error to @@ -466,7 +552,6 @@ void IR2VecVocabAnalysis::generateNumMappedVocab() { // Handle Opcodes std::vector<Embedding> NumericOpcodeEmbeddings(Vocabulary::MaxOpcodes, Embedding(Dim)); - NumericOpcodeEmbeddings.reserve(Vocabulary::MaxOpcodes); for (unsigned Opcode : seq(0u, Vocabulary::MaxOpcodes)) { StringRef VocabKey = Vocabulary::getVocabKeyForOpcode(Opcode + 1); auto It = OpcVocab.find(VocabKey.str()); @@ -475,13 +560,10 @@ void IR2VecVocabAnalysis::generateNumMappedVocab() { else handleMissingEntity(VocabKey.str()); } - Vocab.insert(Vocab.end(), NumericOpcodeEmbeddings.begin(), - NumericOpcodeEmbeddings.end()); // Handle Types - only canonical types are present in vocabulary std::vector<Embedding> NumericTypeEmbeddings(Vocabulary::MaxCanonicalTypeIDs, Embedding(Dim)); - NumericTypeEmbeddings.reserve(Vocabulary::MaxCanonicalTypeIDs); for (unsigned CTypeID : seq(0u, Vocabulary::MaxCanonicalTypeIDs)) { StringRef VocabKey = Vocabulary::getVocabKeyForCanonicalTypeID( static_cast<Vocabulary::CanonicalTypeID>(CTypeID)); @@ -491,13 +573,10 @@ void IR2VecVocabAnalysis::generateNumMappedVocab() { } handleMissingEntity(VocabKey.str()); } - Vocab.insert(Vocab.end(), NumericTypeEmbeddings.begin(), - NumericTypeEmbeddings.end()); // Handle Arguments/Operands std::vector<Embedding> NumericArgEmbeddings(Vocabulary::MaxOperandKinds, Embedding(Dim)); - NumericArgEmbeddings.reserve(Vocabulary::MaxOperandKinds); for (unsigned OpKind : seq(0u, Vocabulary::MaxOperandKinds)) { Vocabulary::OperandKind Kind = static_cast<Vocabulary::OperandKind>(OpKind); StringRef VocabKey = Vocabulary::getVocabKeyForOperandKind(Kind); @@ -508,15 +587,37 @@ void IR2VecVocabAnalysis::generateNumMappedVocab() { } handleMissingEntity(VocabKey.str()); } - Vocab.insert(Vocab.end(), NumericArgEmbeddings.begin(), - NumericArgEmbeddings.end()); -} -IR2VecVocabAnalysis::IR2VecVocabAnalysis(const VocabVector &Vocab) - : Vocab(Vocab) {} + // Handle Predicates: part of Operands section. We look up predicate keys + // in ArgVocab. 
+ std::vector<Embedding> NumericPredEmbeddings(Vocabulary::MaxPredicateKinds, + Embedding(Dim, 0)); + for (unsigned PK : seq(0u, Vocabulary::MaxPredicateKinds)) { + StringRef VocabKey = + Vocabulary::getVocabKeyForPredicate(Vocabulary::getPredicate(PK)); + auto It = ArgVocab.find(VocabKey.str()); + if (It != ArgVocab.end()) { + NumericPredEmbeddings[PK] = It->second; + continue; + } + handleMissingEntity(VocabKey.str()); + } + + // Create section-based storage instead of flat vocabulary + // Order must match Vocabulary::Section enum + std::vector<std::vector<Embedding>> Sections(4); + Sections[static_cast<unsigned>(Vocabulary::Section::Opcodes)] = + std::move(NumericOpcodeEmbeddings); // Section::Opcodes + Sections[static_cast<unsigned>(Vocabulary::Section::CanonicalTypes)] = + std::move(NumericTypeEmbeddings); // Section::CanonicalTypes + Sections[static_cast<unsigned>(Vocabulary::Section::Operands)] = + std::move(NumericArgEmbeddings); // Section::Operands + Sections[static_cast<unsigned>(Vocabulary::Section::Predicates)] = + std::move(NumericPredEmbeddings); // Section::Predicates -IR2VecVocabAnalysis::IR2VecVocabAnalysis(VocabVector &&Vocab) - : Vocab(std::move(Vocab)) {} + // Create VocabStorage from organized sections + Vocab.emplace(std::move(Sections)); +} void IR2VecVocabAnalysis::emitError(Error Err, LLVMContext &Ctx) { handleAllErrors(std::move(Err), [&](const ErrorInfoBase &EI) { @@ -528,8 +629,8 @@ IR2VecVocabAnalysis::Result IR2VecVocabAnalysis::run(Module &M, ModuleAnalysisManager &AM) { auto Ctx = &M.getContext(); // If vocabulary is already populated by the constructor, use it. - if (!Vocab.empty()) - return Vocabulary(std::move(Vocab)); + if (Vocab.has_value()) + return Vocabulary(std::move(Vocab.value())); // Otherwise, try to read from the vocabulary file. 
if (VocabFile.empty()) { @@ -538,7 +639,9 @@ IR2VecVocabAnalysis::run(Module &M, ModuleAnalysisManager &AM) { "set it using --ir2vec-vocab-path"); return Vocabulary(); // Return invalid result } - if (auto Err = readVocabulary()) { + + VocabMap OpcVocab, TypeVocab, ArgVocab; + if (auto Err = readVocabulary(OpcVocab, TypeVocab, ArgVocab)) { emitError(std::move(Err), *Ctx); return Vocabulary(); } @@ -553,9 +656,9 @@ IR2VecVocabAnalysis::run(Module &M, ModuleAnalysisManager &AM) { scaleVocabSection(ArgVocab, ArgWeight); // Generate the numeric lookup vocabulary - generateNumMappedVocab(); + generateVocabStorage(OpcVocab, TypeVocab, ArgVocab); - return Vocabulary(std::move(Vocab)); + return Vocabulary(std::move(Vocab.value())); } // ==----------------------------------------------------------------------===// @@ -564,7 +667,7 @@ IR2VecVocabAnalysis::run(Module &M, ModuleAnalysisManager &AM) { PreservedAnalyses IR2VecPrinterPass::run(Module &M, ModuleAnalysisManager &MAM) { - auto Vocabulary = MAM.getResult<IR2VecVocabAnalysis>(M); + auto &Vocabulary = MAM.getResult<IR2VecVocabAnalysis>(M); assert(Vocabulary.isValid() && "IR2Vec Vocabulary is invalid"); for (Function &F : M) { @@ -606,7 +709,7 @@ PreservedAnalyses IR2VecPrinterPass::run(Module &M, PreservedAnalyses IR2VecVocabPrinterPass::run(Module &M, ModuleAnalysisManager &MAM) { - auto IR2VecVocabulary = MAM.getResult<IR2VecVocabAnalysis>(M); + auto &IR2VecVocabulary = MAM.getResult<IR2VecVocabAnalysis>(M); assert(IR2VecVocabulary.isValid() && "IR2Vec Vocabulary is invalid"); // Print each entry diff --git a/llvm/lib/Analysis/IndirectCallPromotionAnalysis.cpp b/llvm/lib/Analysis/IndirectCallPromotionAnalysis.cpp index 7b93474..25e7a97 100644 --- a/llvm/lib/Analysis/IndirectCallPromotionAnalysis.cpp +++ b/llvm/lib/Analysis/IndirectCallPromotionAnalysis.cpp @@ -22,6 +22,8 @@ using namespace llvm; #define DEBUG_TYPE "pgo-icall-prom-analysis" +namespace llvm { + // The percent threshold for the direct-call target (this call site vs the // remaining call count) for it to be considered as the promotion target. 
static cl::opt<unsigned> ICPRemainingPercentThreshold( @@ -54,6 +56,8 @@ cl::opt<unsigned> MaxNumVTableAnnotations( "icp-max-num-vtables", cl::init(6), cl::Hidden, cl::desc("Max number of vtables annotated for a vtable load instruction.")); +} // end namespace llvm + bool ICallPromotionAnalysis::isPromotionProfitable(uint64_t Count, uint64_t TotalCount, uint64_t RemainingCount) { diff --git a/llvm/lib/Analysis/InlineAdvisor.cpp b/llvm/lib/Analysis/InlineAdvisor.cpp index 28b14c2..0fa804f 100644 --- a/llvm/lib/Analysis/InlineAdvisor.cpp +++ b/llvm/lib/Analysis/InlineAdvisor.cpp @@ -217,7 +217,7 @@ AnalysisKey PluginInlineAdvisorAnalysis::Key; bool InlineAdvisorAnalysis::initializeIR2VecVocabIfRequested( Module &M, ModuleAnalysisManager &MAM) { if (!IR2VecVocabFile.empty()) { - auto IR2VecVocabResult = MAM.getResult<IR2VecVocabAnalysis>(M); + auto &IR2VecVocabResult = MAM.getResult<IR2VecVocabAnalysis>(M); if (!IR2VecVocabResult.isValid()) { M.getContext().emitError("Failed to load IR2Vec vocabulary"); return false; diff --git a/llvm/lib/Analysis/MemoryProfileInfo.cpp b/llvm/lib/Analysis/MemoryProfileInfo.cpp index b5ca6b1..11602d2 100644 --- a/llvm/lib/Analysis/MemoryProfileInfo.cpp +++ b/llvm/lib/Analysis/MemoryProfileInfo.cpp @@ -22,6 +22,8 @@ using namespace llvm::memprof; #define DEBUG_TYPE "memory-profile-info" +namespace llvm { + cl::opt<bool> MemProfReportHintedSizes( "memprof-report-hinted-sizes", cl::init(false), cl::Hidden, cl::desc("Report total allocation sizes of hinted allocations")); @@ -52,6 +54,8 @@ cl::opt<unsigned> MinPercentMaxColdSize( "memprof-min-percent-max-cold-size", cl::init(100), cl::Hidden, cl::desc("Min percent of max cold bytes for critical cold context")); +} // end namespace llvm + bool llvm::memprof::metadataIncludesAllContextSizeInfo() { return MemProfReportHintedSizes || MinClonedColdBytePercent < 100; } diff --git a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp index a317ac4..a60a4bb 100644 --- a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp +++ b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp @@ -67,7 +67,6 @@ using namespace llvm::memprof; namespace llvm { FunctionSummary::ForceSummaryHotnessType ForceSummaryEdgesCold = FunctionSummary::FSHT_None; -} // namespace llvm static cl::opt<FunctionSummary::ForceSummaryHotnessType, true> FSEC( "force-summary-edges-cold", cl::Hidden, cl::location(ForceSummaryEdgesCold), @@ -91,6 +90,7 @@ LLVM_ABI extern cl::opt<bool> ScalePartialSampleProfileWorkingSetSize; extern cl::opt<unsigned> MaxNumVTableAnnotations; extern cl::opt<bool> MemProfReportHintedSizes; +} // namespace llvm // Walk through the operands of a given User via worklist iteration and populate // the set of GlobalValue references encountered. 
Invoked either on an diff --git a/llvm/lib/Analysis/ProfileSummaryInfo.cpp b/llvm/lib/Analysis/ProfileSummaryInfo.cpp index f1c3155..44d7a17 100644 --- a/llvm/lib/Analysis/ProfileSummaryInfo.cpp +++ b/llvm/lib/Analysis/ProfileSummaryInfo.cpp @@ -24,6 +24,8 @@ #include <optional> using namespace llvm; +namespace llvm { + static cl::opt<bool> PartialProfile( "partial-profile", cl::Hidden, cl::init(false), cl::desc("Specify the current profile is used as a partial profile.")); @@ -44,6 +46,8 @@ static cl::opt<double> PartialSampleProfileWorkingSetSizeScaleFactor( "and the factor to scale the working set size to use the same " "shared thresholds as PGO.")); +} // end namespace llvm + // The profile summary metadata may be attached either by the frontend or by // any backend passes (IR level instrumentation, for example). This method // checks if the Summary is null and if so checks if the summary metadata is now diff --git a/llvm/lib/CGData/CodeGenData.cpp b/llvm/lib/CGData/CodeGenData.cpp index b4f08c3..7900dc7 100644 --- a/llvm/lib/CGData/CodeGenData.cpp +++ b/llvm/lib/CGData/CodeGenData.cpp @@ -31,11 +31,14 @@ static cl::opt<bool> static cl::opt<std::string> CodeGenDataUsePath("codegen-data-use-path", cl::init(""), cl::Hidden, cl::desc("File path to where .cgdata file is read")); + +namespace llvm { cl::opt<bool> CodeGenDataThinLTOTwoRounds( "codegen-data-thinlto-two-rounds", cl::init(false), cl::Hidden, cl::desc("Enable two-round ThinLTO code generation. The first round " "emits codegen data, while the second round uses the emitted " "codegen data for further optimizations.")); +} // end namespace llvm static std::string getCGDataErrString(cgdata_error Err, const std::string &ErrMsg = "") { diff --git a/llvm/lib/CGData/CodeGenDataReader.cpp b/llvm/lib/CGData/CodeGenDataReader.cpp index 3fd8cfe..b1cd939 100644 --- a/llvm/lib/CGData/CodeGenDataReader.cpp +++ b/llvm/lib/CGData/CodeGenDataReader.cpp @@ -26,14 +26,14 @@ static cl::opt<bool> IndexedCodeGenDataReadFunctionMapNames( "disabled to save memory and time for final consumption of the " "indexed CodeGenData in production.")); +namespace llvm { + cl::opt<bool> IndexedCodeGenDataLazyLoading( "indexed-codegen-data-lazy-loading", cl::init(false), cl::Hidden, cl::desc( "Lazily load indexed CodeGenData. Enable to save memory and time " "for final consumption of the indexed CodeGenData in production.")); -namespace llvm { - static Expected<std::unique_ptr<MemoryBuffer>> setupMemoryBuffer(const Twine &Filename, vfs::FileSystem &FS) { auto BufferOrErr = Filename.str() == "-" ? 
MemoryBuffer::getSTDIN() diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index 56e13f0..884c3f1 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -2362,6 +2362,13 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, MachineInstr::copyFlagsFromInstruction(CI)); return true; } + case Intrinsic::modf: { + ArrayRef<Register> VRegs = getOrCreateVRegs(CI); + MIRBuilder.buildModf(VRegs[0], VRegs[1], + getOrCreateVReg(*CI.getArgOperand(0)), + MachineInstr::copyFlagsFromInstruction(CI)); + return true; + } case Intrinsic::sincos: { ArrayRef<Register> VRegs = getOrCreateVRegs(CI); MIRBuilder.buildFSincos(VRegs[0], VRegs[1], diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index 03dfa6f..cffaf7c 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -471,6 +471,8 @@ static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) { RTLIBCASE(TANH_F); case TargetOpcode::G_FSINCOS: RTLIBCASE(SINCOS_F); + case TargetOpcode::G_FMODF: + RTLIBCASE(MODF_F); case TargetOpcode::G_FLOG10: RTLIBCASE(LOG10_F); case TargetOpcode::G_FLOG: @@ -703,6 +705,46 @@ LegalizerHelper::LegalizeResult LegalizerHelper::emitSincosLibcall( } LegalizerHelper::LegalizeResult +LegalizerHelper::emitModfLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder, + unsigned Size, Type *OpType, + LostDebugLocObserver &LocObserver) { + MachineFunction &MF = MIRBuilder.getMF(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + + Register DstFrac = MI.getOperand(0).getReg(); + Register DstInt = MI.getOperand(1).getReg(); + Register Src = MI.getOperand(2).getReg(); + LLT DstTy = MRI.getType(DstFrac); + + int MemSize = DstTy.getSizeInBytes(); + Align Alignment = getStackTemporaryAlignment(DstTy); + const DataLayout &DL = MIRBuilder.getDataLayout(); + unsigned AddrSpace = DL.getAllocaAddrSpace(); + MachinePointerInfo PtrInfo; + + Register StackPtrInt = + createStackTemporary(TypeSize::getFixed(MemSize), Alignment, PtrInfo) + .getReg(0); + + auto &Ctx = MF.getFunction().getContext(); + auto LibcallResult = createLibcall( + MIRBuilder, getRTLibDesc(MI.getOpcode(), Size), {DstFrac, OpType, 0}, + {{Src, OpType, 0}, {StackPtrInt, PointerType::get(Ctx, AddrSpace), 1}}, + LocObserver, &MI); + + if (LibcallResult != LegalizeResult::Legalized) + return LegalizerHelper::UnableToLegalize; + + MachineMemOperand *LoadMMOInt = MF.getMachineMemOperand( + PtrInfo, MachineMemOperand::MOLoad, MemSize, Alignment); + + MIRBuilder.buildLoad(DstInt, StackPtrInt, *LoadMMOInt); + MI.eraseFromParent(); + + return LegalizerHelper::Legalized; +} + +LegalizerHelper::LegalizeResult llvm::createMemLibcall(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI, MachineInstr &MI, LostDebugLocObserver &LocObserver) { auto &Ctx = MIRBuilder.getMF().getFunction().getContext(); @@ -1341,6 +1383,16 @@ LegalizerHelper::libcall(MachineInstr &MI, LostDebugLocObserver &LocObserver) { } return emitSincosLibcall(MI, MIRBuilder, Size, HLTy, LocObserver); } + case TargetOpcode::G_FMODF: { + LLT LLTy = MRI.getType(MI.getOperand(0).getReg()); + unsigned Size = LLTy.getSizeInBits(); + Type *HLTy = getFloatTypeForLLT(Ctx, LLTy); + if (!HLTy || (Size != 32 && Size != 64 && Size != 80 && Size != 128)) { + LLVM_DEBUG(dbgs() << "No libcall available for type " << LLTy << ".\n"); + return UnableToLegalize; + } + return 
emitModfLibcall(MI, MIRBuilder, Size, HLTy, LocObserver); + } case TargetOpcode::G_LROUND: case TargetOpcode::G_LLROUND: case TargetOpcode::G_INTRINSIC_LRINT: @@ -3333,6 +3385,16 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC); Observer.changedInstr(MI); return Legalized; + case TargetOpcode::G_FMODF: { + Observer.changingInstr(MI); + widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_FPEXT); + + widenScalarDst(MI, WideTy, 1, TargetOpcode::G_FPTRUNC); + MIRBuilder.setInsertPt(MIRBuilder.getMBB(), --MIRBuilder.getInsertPt()); + widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC); + Observer.changedInstr(MI); + return Legalized; + } case TargetOpcode::G_FPOWI: case TargetOpcode::G_FLDEXP: case TargetOpcode::G_STRICT_FLDEXP: { @@ -5472,6 +5534,7 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx, case G_LROUND: case G_LLROUND: case G_INTRINSIC_TRUNC: + case G_FMODF: case G_FCOS: case G_FSIN: case G_FTAN: diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp index 477e5c1..c2d474f 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp @@ -34,7 +34,7 @@ cl::opt<bool> llvm::DisableGISelLegalityCheck( cl::desc("Don't verify that MIR is fully legal between GlobalISel passes"), cl::Hidden); -cl::opt<bool> VerboseVerifyLegalizerInfo( +static cl::opt<bool> VerboseVerifyLegalizerInfo( "verbose-gisel-verify-legalizer-info", cl::desc("Print more information to dbgs about GlobalISel legalizer rules " "being verified"), diff --git a/llvm/lib/CodeGen/MachineRegionInfo.cpp b/llvm/lib/CodeGen/MachineRegionInfo.cpp index f8268b8..366755a 100644 --- a/llvm/lib/CodeGen/MachineRegionInfo.cpp +++ b/llvm/lib/CodeGen/MachineRegionInfo.cpp @@ -10,6 +10,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/RegionInfoImpl.h" #include "llvm/CodeGen/MachinePostDominators.h" +#include "llvm/CodeGen/Passes.h" #include "llvm/Config/llvm-config.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" @@ -127,7 +128,7 @@ LLVM_DUMP_METHOD void MachineRegionInfoPass::dump() const { #endif char MachineRegionInfoPass::ID = 0; -char &MachineRegionInfoPassID = MachineRegionInfoPass::ID; +char &llvm::MachineRegionInfoPassID = MachineRegionInfoPass::ID; INITIALIZE_PASS_BEGIN(MachineRegionInfoPass, DEBUG_TYPE, "Detect single entry single exit regions", true, true) diff --git a/llvm/lib/CodeGen/PeepholeOptimizer.cpp b/llvm/lib/CodeGen/PeepholeOptimizer.cpp index 729a57e..e1d39d6 100644 --- a/llvm/lib/CodeGen/PeepholeOptimizer.cpp +++ b/llvm/lib/CodeGen/PeepholeOptimizer.cpp @@ -1929,7 +1929,27 @@ ValueTrackerResult ValueTracker::getNextSourceFromCopy() { const MachineOperand &Src = Def->getOperand(1); if (Src.isUndef()) return ValueTrackerResult(); - return ValueTrackerResult(Src.getReg(), Src.getSubReg()); + + Register SrcReg = Src.getReg(); + unsigned SubReg = Src.getSubReg(); + if (DefSubReg) { + const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo(); + SubReg = TRI->composeSubRegIndices(SubReg, DefSubReg); + + if (SrcReg.isVirtual()) { + // TODO: Try constraining on rewrite if we can + const TargetRegisterClass *RegRC = MRI.getRegClass(SrcReg); + const TargetRegisterClass *SrcWithSubRC = + TRI->getSubClassWithSubReg(RegRC, SubReg); + if (RegRC != SrcWithSubRC) + return ValueTrackerResult(); + } else { + if (!TRI->getSubReg(SrcReg, SubReg)) + return ValueTrackerResult(); + } + } + + return 
ValueTrackerResult(SrcReg, SubReg); } ValueTrackerResult ValueTracker::getNextSourceFromBitcast() { diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp index 8e6cf3e..7fe13a3 100644 --- a/llvm/lib/CodeGen/RegAllocGreedy.cpp +++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp @@ -1406,8 +1406,24 @@ bool RAGreedy::trySplitAroundHintReg(MCPhysReg Hint, continue; // Check if VirtReg interferes with OtherReg after this COPY instruction. - if (!IsDef && VirtReg.liveAt(LIS->getInstructionIndex(Instr).getRegSlot())) - continue; + if (Opnd.readsReg()) { + SlotIndex Index = LIS->getInstructionIndex(Instr).getRegSlot(); + + if (SubReg) { + LaneBitmask Mask = TRI->getSubRegIndexLaneMask(SubReg); + if (IsDef) + Mask = ~Mask; + + if (any_of(VirtReg.subranges(), [=](const LiveInterval::SubRange &S) { + return (S.LaneMask & Mask).any() && S.liveAt(Index); + })) { + continue; + } + } else { + if (VirtReg.liveAt(Index)) + continue; + } + } MCRegister OtherPhysReg = OtherReg.isPhysical() ? OtherReg.asMCReg() : VRM->getPhys(OtherReg); @@ -2419,25 +2435,28 @@ void RAGreedy::collectHintInfo(Register Reg, HintsInfo &Out) { unsigned SubReg = Opnd.getSubReg(); // Get the current assignment. - MCRegister OtherPhysReg = - OtherReg.isPhysical() ? OtherReg.asMCReg() : VRM->getPhys(OtherReg); - if (OtherSubReg) { - if (OtherReg.isPhysical()) { - MCRegister Tuple = - TRI->getMatchingSuperReg(OtherPhysReg, OtherSubReg, RC); - if (!Tuple) - continue; - OtherPhysReg = Tuple; - } else { - // TODO: There should be a hinting mechanism for subregisters - if (SubReg != OtherSubReg) - continue; - } + MCRegister OtherPhysReg; + if (OtherReg.isPhysical()) { + if (OtherSubReg) + OtherPhysReg = TRI->getMatchingSuperReg(OtherReg, OtherSubReg, RC); + else if (SubReg) + OtherPhysReg = TRI->getMatchingSuperReg(OtherReg, SubReg, RC); + else + OtherPhysReg = OtherReg; + } else { + OtherPhysReg = VRM->getPhys(OtherReg); + // TODO: Should find matching superregister, but applying this in the + // non-hint case currently causes regressions + + if (SubReg && OtherSubReg && SubReg != OtherSubReg) + continue; } // Push the collected information. - Out.push_back(HintInfo(MBFI->getBlockFreq(Instr.getParent()), OtherReg, - OtherPhysReg)); + if (OtherPhysReg) { + Out.push_back(HintInfo(MBFI->getBlockFreq(Instr.getParent()), OtherReg, + OtherPhysReg)); + } } } @@ -2466,15 +2485,13 @@ void RAGreedy::tryHintRecoloring(const LiveInterval &VirtReg) { // We have a broken hint, check if it is possible to fix it by // reusing PhysReg for the copy-related live-ranges. Indeed, we evicted // some register and PhysReg may be available for the other live-ranges. - SmallSet<Register, 4> Visited; - SmallVector<Register, 2> RecoloringCandidates; HintsInfo Info; Register Reg = VirtReg.reg(); MCRegister PhysReg = VRM->getPhys(Reg); // Start the recoloring algorithm from the input live-interval, then // it will propagate to the ones that are copy-related with it. - Visited.insert(Reg); - RecoloringCandidates.push_back(Reg); + SmallSet<Register, 4> Visited = {Reg}; + SmallVector<Register, 2> RecoloringCandidates = {Reg}; LLVM_DEBUG(dbgs() << "Trying to reconcile hints for: " << printReg(Reg, TRI) << '(' << printReg(PhysReg, TRI) << ")\n"); @@ -2482,12 +2499,10 @@ void RAGreedy::tryHintRecoloring(const LiveInterval &VirtReg) { do { Reg = RecoloringCandidates.pop_back_val(); - // We cannot recolor physical register. - if (Reg.isPhysical()) - continue; + MCRegister CurrPhys = VRM->getPhys(Reg); // This may be a skipped register. 
- if (!VRM->hasPhys(Reg)) { + if (!CurrPhys) { assert(!shouldAllocateRegister(Reg) && "We have an unallocated variable which should have been handled"); continue; @@ -2496,7 +2511,6 @@ void RAGreedy::tryHintRecoloring(const LiveInterval &VirtReg) { // Get the live interval mapped with this virtual register to be able // to check for the interference with the new color. LiveInterval &LI = LIS->getInterval(Reg); - MCRegister CurrPhys = VRM->getPhys(Reg); // Check that the new color matches the register class constraints and // that it is free for this live range. if (CurrPhys != PhysReg && (!MRI->getRegClass(Reg)->contains(PhysReg) || @@ -2533,7 +2547,8 @@ void RAGreedy::tryHintRecoloring(const LiveInterval &VirtReg) { // Push all copy-related live-ranges to keep reconciling the broken // hints. for (const HintInfo &HI : Info) { - if (Visited.insert(HI.Reg).second) + // We cannot recolor physical register. + if (HI.Reg.isVirtual() && Visited.insert(HI.Reg).second) RecoloringCandidates.push_back(HI.Reg); } } while (!RecoloringCandidates.empty()); diff --git a/llvm/lib/CodeGen/RegAllocScore.cpp b/llvm/lib/CodeGen/RegAllocScore.cpp index 9c9cc1f..280946b 100644 --- a/llvm/lib/CodeGen/RegAllocScore.cpp +++ b/llvm/lib/CodeGen/RegAllocScore.cpp @@ -23,6 +23,8 @@ #include "llvm/Support/CommandLine.h" using namespace llvm; + +namespace llvm { LLVM_ABI cl::opt<double> CopyWeight("regalloc-copy-weight", cl::init(0.2), cl::Hidden); LLVM_ABI cl::opt<double> LoadWeight("regalloc-load-weight", cl::init(4.0), @@ -33,6 +35,8 @@ LLVM_ABI cl::opt<double> CheapRematWeight("regalloc-cheap-remat-weight", cl::init(0.2), cl::Hidden); LLVM_ABI cl::opt<double> ExpensiveRematWeight("regalloc-expensive-remat-weight", cl::init(1.0), cl::Hidden); +} // end namespace llvm + #define DEBUG_TYPE "regalloc-score" RegAllocScore &RegAllocScore::operator+=(const RegAllocScore &Other) { diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp index 7ac1aef..ebfea8e 100644 --- a/llvm/lib/CodeGen/RegisterCoalescer.cpp +++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp @@ -584,14 +584,14 @@ bool CoalescerPair::isCoalescable(const MachineInstr *MI) const { return DstReg == Dst; // This is a partial register copy. Check that the parts match. return Register(TRI.getSubReg(DstReg, SrcSub)) == Dst; - } else { - // DstReg is virtual. - if (DstReg != Dst) - return false; - // Registers match, do the subregisters line up? - return TRI.composeSubRegIndices(SrcIdx, SrcSub) == - TRI.composeSubRegIndices(DstIdx, DstSub); } + + // DstReg is virtual. + if (DstReg != Dst) + return false; + // Registers match, do the subregisters line up? + return TRI.composeSubRegIndices(SrcIdx, SrcSub) == + TRI.composeSubRegIndices(DstIdx, DstSub); } void RegisterCoalescerLegacy::getAnalysisUsage(AnalysisUsage &AU) const { @@ -2914,8 +2914,7 @@ JoinVals::ConflictResolution JoinVals::analyzeValue(unsigned ValNo, if ((V.ValidLanes & OtherV.ValidLanes).any()) // Overlapping lanes can't be resolved. return CR_Impossible; - else - return CR_Merge; + return CR_Merge; } // No simultaneous def. Is Other live at the def? diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index ff7cd66..87d5453 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -6256,17 +6256,17 @@ SDValue DAGTypeLegalizer::WidenVecRes_LOAD(SDNode *N) { // FIXME: Not all targets may support EVL in VP_LOAD. 
These will have been // removed from the IR by the ExpandVectorPredication pass but we're // reintroducing them here. - EVT LdVT = LD->getMemoryVT(); - EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), LdVT); - EVT WideMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, - WideVT.getVectorElementCount()); + EVT VT = LD->getValueType(0); + EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); + EVT WideMaskVT = getSetCCResultType(WideVT); + if (ExtType == ISD::NON_EXTLOAD && TLI.isOperationLegalOrCustom(ISD::VP_LOAD, WideVT) && TLI.isTypeLegal(WideMaskVT)) { SDLoc DL(N); SDValue Mask = DAG.getAllOnesConstant(DL, WideMaskVT); SDValue EVL = DAG.getElementCount(DL, TLI.getVPExplicitVectorLengthTy(), - LdVT.getVectorElementCount()); + VT.getVectorElementCount()); SDValue NewLoad = DAG.getLoadVP(LD->getAddressingMode(), ISD::NON_EXTLOAD, WideVT, DL, LD->getChain(), LD->getBasePtr(), LD->getOffset(), Mask, @@ -6303,6 +6303,24 @@ SDValue DAGTypeLegalizer::WidenVecRes_LOAD(SDNode *N) { return Result; } + if (VT.isVector()) { + // If all else fails replace the load with a wide masked load. + SDLoc DL(N); + EVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout()); + + SDValue Len = DAG.getElementCount(DL, IdxVT, VT.getVectorElementCount()); + SDValue Mask = DAG.getNode(ISD::GET_ACTIVE_LANE_MASK, DL, WideMaskVT, + DAG.getConstant(0, DL, IdxVT), Len); + + SDValue NewLoad = DAG.getMaskedLoad( + WideVT, DL, LD->getChain(), LD->getBasePtr(), LD->getOffset(), Mask, + DAG.getPOISON(WideVT), LD->getMemoryVT(), LD->getMemOperand(), + LD->getAddressingMode(), LD->getExtensionType()); + + ReplaceValueWith(SDValue(N, 1), NewLoad.getValue(1)); + return NewLoad; + } + report_fatal_error("Unable to widen vector load"); } @@ -7516,8 +7534,7 @@ SDValue DAGTypeLegalizer::WidenVecOp_STORE(SDNode *N) { SDValue StVal = ST->getValue(); EVT StVT = StVal.getValueType(); EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StVT); - EVT WideMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, - WideVT.getVectorElementCount()); + EVT WideMaskVT = getSetCCResultType(WideVT); if (TLI.isOperationLegalOrCustom(ISD::VP_STORE, WideVT) && TLI.isTypeLegal(WideMaskVT)) { @@ -7540,6 +7557,22 @@ SDValue DAGTypeLegalizer::WidenVecOp_STORE(SDNode *N) { return DAG.getNode(ISD::TokenFactor, SDLoc(ST), MVT::Other, StChain); } + if (StVT.isVector()) { + // If all else fails replace the store with a wide masked store. 
+ SDLoc DL(N); + EVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout()); + + SDValue WideStVal = GetWidenedVector(StVal); + SDValue Len = DAG.getElementCount(DL, IdxVT, StVT.getVectorElementCount()); + SDValue Mask = DAG.getNode(ISD::GET_ACTIVE_LANE_MASK, DL, WideMaskVT, + DAG.getConstant(0, DL, IdxVT), Len); + + return DAG.getMaskedStore(ST->getChain(), DL, WideStVal, ST->getBasePtr(), + ST->getOffset(), Mask, ST->getMemoryVT(), + ST->getMemOperand(), ST->getAddressingMode(), + ST->isTruncatingStore()); + } + report_fatal_error("Unable to widen vector store"); } @@ -8298,8 +8331,7 @@ DAGTypeLegalizer::GenWidenVectorExtLoads(SmallVectorImpl<SDValue> &LdChain, AAMDNodes AAInfo = LD->getAAInfo(); if (LdVT.isScalableVector()) - report_fatal_error("Generating widen scalable extending vector loads is " - "not yet supported"); + return SDValue(); EVT EltVT = WidenVT.getVectorElementType(); EVT LdEltVT = LdVT.getVectorElementType(); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 8fc7eab..95f53fe 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -4762,6 +4762,11 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts, case ISD::AssertZext: Tmp = cast<VTSDNode>(Op.getOperand(1))->getVT().getSizeInBits(); return VTBits-Tmp; + case ISD::FREEZE: + if (isGuaranteedNotToBeUndefOrPoison(Op.getOperand(0), DemandedElts, + /*PoisonOnly=*/false)) + return ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1); + break; case ISD::MERGE_VALUES: return ComputeNumSignBits(Op.getOperand(Op.getResNo()), DemandedElts, Depth + 1); diff --git a/llvm/lib/CodeGen/StackFrameLayoutAnalysisPass.cpp b/llvm/lib/CodeGen/StackFrameLayoutAnalysisPass.cpp index 096a33c..ec75dc3 100644 --- a/llvm/lib/CodeGen/StackFrameLayoutAnalysisPass.cpp +++ b/llvm/lib/CodeGen/StackFrameLayoutAnalysisPass.cpp @@ -72,7 +72,7 @@ struct StackFrameLayoutAnalysis { : Slot(Idx), Size(MFI.getObjectSize(Idx)), Align(MFI.getObjectAlign(Idx).value()), Offset(Offset), SlotTy(Invalid), Scalable(false) { - Scalable = MFI.getStackID(Idx) == TargetStackID::ScalableVector; + Scalable = MFI.isScalableStackID(Idx); if (MFI.isSpillSlotObjectIndex(Idx)) SlotTy = SlotType::Spill; else if (MFI.isFixedObjectIndex(Idx)) diff --git a/llvm/lib/DebugInfo/LogicalView/Core/LVReader.cpp b/llvm/lib/DebugInfo/LogicalView/Core/LVReader.cpp index c1017d8..d973a47 100644 --- a/llvm/lib/DebugInfo/LogicalView/Core/LVReader.cpp +++ b/llvm/lib/DebugInfo/LogicalView/Core/LVReader.cpp @@ -148,7 +148,7 @@ std::error_code LVSplitContext::open(std::string ContextName, return std::error_code(); } -LVReader *CurrentReader = nullptr; +static LVReader *CurrentReader = nullptr; LVReader &LVReader::getInstance() { if (CurrentReader) return *CurrentReader; diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp index a8bb34f..33ca46c 100644 --- a/llvm/lib/IR/Instruction.cpp +++ b/llvm/lib/IR/Instruction.cpp @@ -30,6 +30,8 @@ #include "llvm/Support/Compiler.h" using namespace llvm; +namespace llvm { + // FIXME: Flag used for an ablation performance test, Issue #147390. Placing it // here because referencing IR should be feasible from anywhere. Will be // removed after the ablation test. 
@@ -38,6 +40,8 @@ cl::opt<bool> ProfcheckDisableMetadataFixes( cl::desc( "Disable metadata propagation fixes discovered through Issue #147390")); +} // end namespace llvm + InsertPosition::InsertPosition(Instruction *InsertBefore) : InsertAt(InsertBefore ? InsertBefore->getIterator() : InstListType::iterator()) {} diff --git a/llvm/lib/IR/Value.cpp b/llvm/lib/IR/Value.cpp index e5e062d..a347609 100644 --- a/llvm/lib/IR/Value.cpp +++ b/llvm/lib/IR/Value.cpp @@ -36,7 +36,7 @@ using namespace llvm; -cl::opt<bool> UseDerefAtPointSemantics( +static cl::opt<bool> UseDerefAtPointSemantics( "use-dereferenceable-at-point-semantics", cl::Hidden, cl::init(false), cl::desc("Deref attributes and metadata infer facts at definition only")); diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index 7b25262..e6544f3 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -75,9 +75,10 @@ static cl::opt<bool> DumpThinCGSCCs("dump-thin-cg-sccs", cl::init(false), cl::Hidden, cl::desc("Dump the SCCs in the ThinLTO index's callgraph")); +namespace llvm { extern cl::opt<bool> CodeGenDataThinLTOTwoRounds; - extern cl::opt<bool> ForceImportAll; +} // end namespace llvm namespace llvm { /// Enable global value internalization in LTO. diff --git a/llvm/lib/Object/OffloadBundle.cpp b/llvm/lib/Object/OffloadBundle.cpp index 0dd378e..329dcbf 100644 --- a/llvm/lib/Object/OffloadBundle.cpp +++ b/llvm/lib/Object/OffloadBundle.cpp @@ -120,14 +120,15 @@ OffloadBundleFatBin::create(MemoryBufferRef Buf, uint64_t SectionOffset, if (identify_magic(Buf.getBuffer()) != file_magic::offload_bundle) return errorCodeToError(object_error::parse_failed); - OffloadBundleFatBin *TheBundle = new OffloadBundleFatBin(Buf, FileName); + std::unique_ptr<OffloadBundleFatBin> TheBundle( + new OffloadBundleFatBin(Buf, FileName)); // Read the Bundle Entries Error Err = TheBundle->readEntries(Buf.getBuffer(), SectionOffset); if (Err) return Err; - return std::unique_ptr<OffloadBundleFatBin>(TheBundle); + return std::move(TheBundle); } Error OffloadBundleFatBin::extractBundle(const ObjectFile &Source) { diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index 256cf9d..7069e8d 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -150,6 +150,8 @@ using namespace llvm; +namespace llvm { + static cl::opt<InliningAdvisorMode> UseInlineAdvisor( "enable-ml-inliner", cl::init(InliningAdvisorMode::Default), cl::Hidden, cl::desc("Enable ML policy for inliner. Currently trained for -Oz only"), @@ -305,7 +307,6 @@ static cl::opt<std::string> InstrumentColdFuncOnlyPath( extern cl::opt<std::string> UseCtxProfile; extern cl::opt<bool> PGOInstrumentColdFunctionOnly; -namespace llvm { extern cl::opt<bool> EnableMemProfContextDisambiguation; } // namespace llvm @@ -610,7 +611,9 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, // Jump table to switch conversion. 
if (EnableJumpTableToSwitch) - FPM.addPass(JumpTableToSwitchPass()); + FPM.addPass(JumpTableToSwitchPass( + /*InLTO=*/Phase == ThinOrFullLTOPhase::ThinLTOPostLink || + Phase == ThinOrFullLTOPhase::FullLTOPostLink)); FPM.addPass( SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); diff --git a/llvm/lib/ProfileData/MemProfCommon.cpp b/llvm/lib/ProfileData/MemProfCommon.cpp index a13a291..cfd2efd 100644 --- a/llvm/lib/ProfileData/MemProfCommon.cpp +++ b/llvm/lib/ProfileData/MemProfCommon.cpp @@ -20,6 +20,8 @@ using namespace llvm; using namespace llvm::memprof; +namespace llvm { + // Upper bound on lifetime access density (accesses per byte per lifetime sec) // for marking an allocation cold. LLVM_ABI cl::opt<float> MemProfLifetimeAccessDensityColdThreshold( @@ -48,6 +50,8 @@ LLVM_ABI cl::opt<bool> cl::desc("Enable use of hot hints (only supported for " "unambigously hot allocations)")); +} // end namespace llvm + AllocationType llvm::memprof::getAllocType(uint64_t TotalLifetimeAccessDensity, uint64_t AllocCount, uint64_t TotalLifetime) { diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index ab5c6f3..f5f7b65 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -324,7 +324,37 @@ AArch64FrameLowering::getArgumentStackToRestore(MachineFunction &MF, static bool produceCompactUnwindFrame(const AArch64FrameLowering &, MachineFunction &MF); -// Conservatively, returns true if the function is likely to have an SVE vectors +enum class AssignObjectOffsets { No, Yes }; +/// Process all the SVE stack objects and the SVE stack size and offsets for +/// each object. If AssignOffsets is "Yes", the offsets get assigned (and SVE +/// stack sizes set). Returns the size of the SVE stack. +static SVEStackSizes determineSVEStackSizes(MachineFunction &MF, + AssignObjectOffsets AssignOffsets, + bool SplitSVEObjects = false); + +static unsigned getStackHazardSize(const MachineFunction &MF) { + return MF.getSubtarget<AArch64Subtarget>().getStreamingHazardSize(); +} + +/// Returns true if PPRs are spilled as ZPRs. +static bool arePPRsSpilledAsZPR(const MachineFunction &MF) { + return MF.getSubtarget().getRegisterInfo()->getSpillSize( + AArch64::PPRRegClass) == 16; +} + +StackOffset +AArch64FrameLowering::getZPRStackSize(const MachineFunction &MF) const { + const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); + return StackOffset::getScalable(AFI->getStackSizeZPR()); +} + +StackOffset +AArch64FrameLowering::getPPRStackSize(const MachineFunction &MF) const { + const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); + return StackOffset::getScalable(AFI->getStackSizePPR()); +} + +// Conservatively, returns true if the function is likely to have SVE vectors // on the stack. This function is safe to be called before callee-saves or // object offsets have been determined. static bool isLikelyToHaveSVEStack(const AArch64FrameLowering &AFL, @@ -338,7 +368,7 @@ static bool isLikelyToHaveSVEStack(const AArch64FrameLowering &AFL, const MachineFrameInfo &MFI = MF.getFrameInfo(); for (int FI = MFI.getObjectIndexBegin(); FI < MFI.getObjectIndexEnd(); FI++) { - if (MFI.getStackID(FI) == TargetStackID::ScalableVector) + if (MFI.isScalableStackID(FI)) return true; } @@ -482,13 +512,6 @@ AArch64FrameLowering::getFixedObjectSize(const MachineFunction &MF, } } -/// Returns the size of the entire SVE stackframe (calleesaves + spills). 
-StackOffset -AArch64FrameLowering::getSVEStackSize(const MachineFunction &MF) const { - const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); - return StackOffset::getScalable((int64_t)AFI->getStackSizeSVE()); -} - bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const { if (!EnableRedZone) return false; @@ -514,7 +537,7 @@ bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const { !Subtarget.hasSVE(); return !(MFI.hasCalls() || hasFP(MF) || NumBytes > RedZoneSize || - getSVEStackSize(MF) || LowerQRegCopyThroughMem); + AFI->hasSVEStackSize() || LowerQRegCopyThroughMem); } /// hasFPImpl - Return true if the specified function should have a dedicated @@ -557,7 +580,7 @@ bool AArch64FrameLowering::hasFPImpl(const MachineFunction &MF) const { // CFA in either of these cases. if (AFI.needsDwarfUnwindInfo(MF) && ((requiresSaveVG(MF) || AFI.getSMEFnAttrs().hasStreamingBody()) && - (!AFI.hasCalculatedStackSizeSVE() || AFI.getStackSizeSVE() > 0))) + (!AFI.hasCalculatedStackSizeSVE() || AFI.hasSVEStackSize()))) return true; // With large callframes around we may need to use FP to access the scavenging // emergency spillslot. @@ -1126,10 +1149,6 @@ static bool isTargetWindows(const MachineFunction &MF) { return MF.getSubtarget<AArch64Subtarget>().isTargetWindows(); } -static unsigned getStackHazardSize(const MachineFunction &MF) { - return MF.getSubtarget<AArch64Subtarget>().getStreamingHazardSize(); -} - void AArch64FrameLowering::emitPacRetPlusLeafHardening( MachineFunction &MF) const { const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); @@ -1212,7 +1231,9 @@ AArch64FrameLowering::getFrameIndexReferenceFromSP(const MachineFunction &MF, const auto &MFI = MF.getFrameInfo(); int64_t ObjectOffset = MFI.getObjectOffset(FI); - StackOffset SVEStackSize = getSVEStackSize(MF); + StackOffset ZPRStackSize = getZPRStackSize(MF); + StackOffset PPRStackSize = getPPRStackSize(MF); + StackOffset SVEStackSize = ZPRStackSize + PPRStackSize; // For VLA-area objects, just emit an offset at the end of the stack frame. // Whilst not quite correct, these objects do live at the end of the frame and @@ -1228,7 +1249,7 @@ AArch64FrameLowering::getFrameIndexReferenceFromSP(const MachineFunction &MF, const auto *AFI = MF.getInfo<AArch64FunctionInfo>(); bool FPAfterSVECalleeSaves = isTargetWindows(MF) && AFI->getSVECalleeSavedStackSize(); - if (MFI.getStackID(FI) == TargetStackID::ScalableVector) { + if (MFI.isScalableStackID(FI)) { if (FPAfterSVECalleeSaves && -ObjectOffset <= (int64_t)AFI->getSVECalleeSavedStackSize()) return StackOffset::getScalable(ObjectOffset); @@ -1294,7 +1315,7 @@ StackOffset AArch64FrameLowering::resolveFrameIndexReference( const auto &MFI = MF.getFrameInfo(); int64_t ObjectOffset = MFI.getObjectOffset(FI); bool isFixed = MFI.isFixedObjectIndex(FI); - bool isSVE = MFI.getStackID(FI) == TargetStackID::ScalableVector; + bool isSVE = MFI.isScalableStackID(FI); return resolveFrameOffsetReference(MF, ObjectOffset, isFixed, isSVE, FrameReg, PreferFP, ForSimm); } @@ -1313,7 +1334,7 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference( bool isCSR = !isFixed && ObjectOffset >= -((int)AFI->getCalleeSavedStackSize(MFI)); - const StackOffset &SVEStackSize = getSVEStackSize(MF); + const StackOffset SVEStackSize = getSVEStackSize(MF); // Use frame pointer to reference fixed objects. 
Use it for locals if // there are VLAs or a dynamically realigned SP (and thus the SP isn't @@ -1615,10 +1636,13 @@ void computeCalleeSaveRegisterPairs(const AArch64FrameLowering &AFL, FirstReg = Count - 1; } bool FPAfterSVECalleeSaves = IsWindows && AFI->getSVECalleeSavedStackSize(); - int ScalableByteOffset = - FPAfterSVECalleeSaves ? 0 : AFI->getSVECalleeSavedStackSize(); + int ScalableByteOffset = FPAfterSVECalleeSaves + ? 0 + : AFI->getZPRCalleeSavedStackSize() + + AFI->getPPRCalleeSavedStackSize(); bool NeedGapToAlignStack = AFI->hasCalleeSaveStackFreeSpace(); Register LastReg = 0; + bool HasCSHazardPadding = AFI->hasStackHazardSlotIndex(); // When iterating backwards, the loop condition relies on unsigned wraparound. for (unsigned i = FirstReg; i < Count; i += RegInc) { @@ -1648,7 +1672,7 @@ void computeCalleeSaveRegisterPairs(const AArch64FrameLowering &AFL, } // Add the stack hazard size as we transition from GPR->FPR CSRs. - if (AFI->hasStackHazardSlotIndex() && + if (HasCSHazardPadding && (!LastReg || !AArch64InstrInfo::isFpOrNEON(LastReg)) && AArch64InstrInfo::isFpOrNEON(RPI.Reg1)) ByteOffset += StackFillDir * StackHazardSize; @@ -1656,7 +1680,7 @@ void computeCalleeSaveRegisterPairs(const AArch64FrameLowering &AFL, int Scale = TRI->getSpillSize(*RPI.RC); // Add the next reg to the pair if it is in the same register class. - if (unsigned(i + RegInc) < Count && !AFI->hasStackHazardSlotIndex()) { + if (unsigned(i + RegInc) < Count && !HasCSHazardPadding) { MCRegister NextReg = CSI[i + RegInc].getReg(); bool IsFirst = i == FirstReg; switch (RPI.Type) { @@ -2021,10 +2045,14 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( } // Update the StackIDs of the SVE stack slots. MachineFrameInfo &MFI = MF.getFrameInfo(); - if (RPI.Type == RegPairInfo::ZPR || RPI.Type == RegPairInfo::PPR) { + if (RPI.Type == RegPairInfo::ZPR) { MFI.setStackID(FrameIdxReg1, TargetStackID::ScalableVector); if (RPI.isPaired()) MFI.setStackID(FrameIdxReg2, TargetStackID::ScalableVector); + } else if (RPI.Type == RegPairInfo::PPR) { + MFI.setStackID(FrameIdxReg1, TargetStackID::ScalablePredicateVector); + if (RPI.isPaired()) + MFI.setStackID(FrameIdxReg2, TargetStackID::ScalablePredicateVector); } } return true; @@ -2232,8 +2260,7 @@ void AArch64FrameLowering::determineStackHazardSlot( for (auto &MI : MBB) { std::optional<int> FI = getLdStFrameID(MI, MFI); if (FI && *FI >= 0 && *FI < (int)FrameObjects.size()) { - if (MFI.getStackID(*FI) == TargetStackID::ScalableVector || - AArch64InstrInfo::isFpOrNEON(MI)) + if (MFI.isScalableStackID(*FI) || AArch64InstrInfo::isFpOrNEON(MI)) FrameObjects[*FI] |= 2; else FrameObjects[*FI] |= 1; @@ -2260,10 +2287,11 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, if (MF.getFunction().getCallingConv() == CallingConv::GHC) return; + const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); + TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS); const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>( MF.getSubtarget().getRegisterInfo()); - const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); unsigned UnspilledCSGPR = AArch64::NoRegister; unsigned UnspilledCSGPRPaired = AArch64::NoRegister; @@ -2384,15 +2412,19 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, // Calculates the callee saved stack size. 
unsigned CSStackSize = 0; - unsigned SVECSStackSize = 0; + unsigned ZPRCSStackSize = 0; + unsigned PPRCSStackSize = 0; const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); for (unsigned Reg : SavedRegs.set_bits()) { auto *RC = TRI->getMinimalPhysRegClass(Reg); assert(RC && "expected register class!"); auto SpillSize = TRI->getSpillSize(*RC); - if (AArch64::PPRRegClass.contains(Reg) || - AArch64::ZPRRegClass.contains(Reg)) - SVECSStackSize += SpillSize; + bool IsZPR = AArch64::ZPRRegClass.contains(Reg); + bool IsPPR = !IsZPR && AArch64::PPRRegClass.contains(Reg); + if (IsZPR || (IsPPR && arePPRsSpilledAsZPR(MF))) + ZPRCSStackSize += SpillSize; + else if (IsPPR) + PPRCSStackSize += SpillSize; else CSStackSize += SpillSize; } @@ -2402,17 +2434,17 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, // only 64-bit GPRs can be added to SavedRegs. unsigned NumSavedRegs = SavedRegs.count(); - // Increase the callee-saved stack size if the function has streaming mode - // changes, as we will need to spill the value of the VG register. - if (requiresSaveVG(MF)) - CSStackSize += 8; - // Determine if a Hazard slot should be used, and increase the CSStackSize by // StackHazardSize if so. determineStackHazardSlot(MF, SavedRegs); if (AFI->hasStackHazardSlotIndex()) CSStackSize += getStackHazardSize(MF); + // Increase the callee-saved stack size if the function has streaming mode + // changes, as we will need to spill the value of the VG register. + if (requiresSaveVG(MF)) + CSStackSize += 8; + // If we must call __arm_get_current_vg in the prologue preserve the LR. if (requiresSaveVG(MF) && !Subtarget.hasSVE()) SavedRegs.set(AArch64::LR); @@ -2433,8 +2465,11 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, }); // If any callee-saved registers are used, the frame cannot be eliminated. - int64_t SVEStackSize = - alignTo(SVECSStackSize + estimateSVEStackObjectOffsets(MFI), 16); + auto [ZPRLocalStackSize, PPRLocalStackSize] = + determineSVEStackSizes(MF, AssignObjectOffsets::No); + uint64_t SVELocals = ZPRLocalStackSize + PPRLocalStackSize; + uint64_t SVEStackSize = + alignTo(ZPRCSStackSize + PPRCSStackSize + SVELocals, 16); bool CanEliminateFrame = (SavedRegs.count() == 0) && !SVEStackSize; // The CSR spill slots have not been allocated yet, so estimateStackSize @@ -2519,7 +2554,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, // instructions. AFI->setCalleeSavedStackSize(AlignedCSStackSize); AFI->setCalleeSaveStackHasFreeSpace(AlignedCSStackSize != CSStackSize); - AFI->setSVECalleeSavedStackSize(alignTo(SVECSStackSize, 16)); + AFI->setSVECalleeSavedStackSize(ZPRCSStackSize, alignTo(PPRCSStackSize, 16)); } bool AArch64FrameLowering::assignCalleeSavedSpillSlots( @@ -2658,7 +2693,6 @@ static bool getSVECalleeSaveSlotRange(const MachineFrameInfo &MFI, assert((Max == std::numeric_limits<int>::min() || Max + 1 == CS.getFrameIdx()) && "SVE CalleeSaves are not consecutive"); - Min = std::min(Min, CS.getFrameIdx()); Max = std::max(Max, CS.getFrameIdx()); } @@ -2666,43 +2700,65 @@ static bool getSVECalleeSaveSlotRange(const MachineFrameInfo &MFI, return Min != std::numeric_limits<int>::max(); } -// Process all the SVE stack objects and determine offsets for each -// object. If AssignOffsets is true, the offsets get assigned. -// Fills in the first and last callee-saved frame indices into -// Min/MaxCSFrameIndex, respectively. -// Returns the size of the stack. 
-static int64_t determineSVEStackObjectOffsets(MachineFrameInfo &MFI, - int &MinCSFrameIndex, - int &MaxCSFrameIndex, - bool AssignOffsets) { +static SVEStackSizes determineSVEStackSizes(MachineFunction &MF, + AssignObjectOffsets AssignOffsets, + bool SplitSVEObjects) { + MachineFrameInfo &MFI = MF.getFrameInfo(); + auto *AFI = MF.getInfo<AArch64FunctionInfo>(); + + SVEStackSizes SVEStack{}; + + // With SplitSVEObjects we maintain separate stack offsets for predicates + // (PPRs) and SVE vectors (ZPRs). When SplitSVEObjects is disabled predicates + // are included in the SVE vector area. + uint64_t &ZPRStackTop = SVEStack.ZPRStackSize; + uint64_t &PPRStackTop = + SplitSVEObjects ? SVEStack.PPRStackSize : SVEStack.ZPRStackSize; + #ifndef NDEBUG // First process all fixed stack objects. for (int I = MFI.getObjectIndexBegin(); I != 0; ++I) - assert(MFI.getStackID(I) != TargetStackID::ScalableVector && + assert(!MFI.isScalableStackID(I) && "SVE vectors should never be passed on the stack by value, only by " "reference."); #endif - auto Assign = [&MFI](int FI, int64_t Offset) { + auto AllocateObject = [&](int FI) { + uint64_t &StackTop = MFI.getStackID(FI) == TargetStackID::ScalableVector + ? ZPRStackTop + : PPRStackTop; + + // FIXME: Given that the length of SVE vectors is not necessarily a power of + // two, we'd need to align every object dynamically at runtime if the + // alignment is larger than 16. This is not yet supported. + Align Alignment = MFI.getObjectAlign(FI); + if (Alignment > Align(16)) + report_fatal_error( + "Alignment of scalable vectors > 16 bytes is not yet supported"); + + StackTop += MFI.getObjectSize(FI); + StackTop = alignTo(StackTop, Alignment); + + assert(StackTop < std::numeric_limits<int64_t>::max() && + "SVE StackTop far too large?!"); + + int64_t Offset = -int64_t(StackTop); + if (AssignOffsets == AssignObjectOffsets::Yes) + MFI.setObjectOffset(FI, Offset); + LLVM_DEBUG(dbgs() << "alloc FI(" << FI << ") at SP[" << Offset << "]\n"); - MFI.setObjectOffset(FI, Offset); }; - int64_t Offset = 0; - // Then process all callee saved slots. + int MinCSFrameIndex, MaxCSFrameIndex; if (getSVECalleeSaveSlotRange(MFI, MinCSFrameIndex, MaxCSFrameIndex)) { - // Assign offsets to the callee save slots. - for (int I = MinCSFrameIndex; I <= MaxCSFrameIndex; ++I) { - Offset += MFI.getObjectSize(I); - Offset = alignTo(Offset, MFI.getObjectAlign(I)); - if (AssignOffsets) - Assign(I, -Offset); - } + for (int FI = MinCSFrameIndex; FI <= MaxCSFrameIndex; ++FI) + AllocateObject(FI); } - // Ensure that the Callee-save area is aligned to 16bytes. - Offset = alignTo(Offset, Align(16U)); + // Ensure the CS area is 16-byte aligned. + PPRStackTop = alignTo(PPRStackTop, Align(16U)); + ZPRStackTop = alignTo(ZPRStackTop, Align(16U)); // Create a buffer of SVE objects to allocate and sort it. 
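The AllocateObject lambda above follows a simple bump-and-align pattern: grow the relevant area, realign its running total, and assign the object a negative offset from the top of that area. A minimal standalone sketch of that pattern (plain integers standing in for scalable bytes, not the patch itself):

#include <cassert>
#include <cstdint>

// Round Value up to a power-of-two alignment.
static uint64_t alignToPow2(uint64_t Value, uint64_t Align) {
  assert(Align != 0 && (Align & (Align - 1)) == 0 && "power-of-two alignment");
  return (Value + Align - 1) & ~(Align - 1);
}

// Grow the area, realign its running total, and hand back the object's offset
// as a negative displacement from the top of that area.
int64_t allocateScalableObject(uint64_t &StackTop, uint64_t Size,
                               uint64_t Align) {
  StackTop += Size;
  StackTop = alignToPow2(StackTop, Align);
  return -static_cast<int64_t>(StackTop);
}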
SmallVector<int, 8> ObjectsToAllocate; @@ -2715,48 +2771,31 @@ static int64_t determineSVEStackObjectOffsets(MachineFrameInfo &MFI, if (MFI.getStackID(StackProtectorFI) == TargetStackID::ScalableVector) ObjectsToAllocate.push_back(StackProtectorFI); } - for (int I = 0, E = MFI.getObjectIndexEnd(); I != E; ++I) { - unsigned StackID = MFI.getStackID(I); - if (StackID != TargetStackID::ScalableVector) - continue; - if (I == StackProtectorFI) + + for (int FI = 0, E = MFI.getObjectIndexEnd(); FI != E; ++FI) { + if (FI == StackProtectorFI || MFI.isDeadObjectIndex(FI)) continue; - if (MaxCSFrameIndex >= I && I >= MinCSFrameIndex) + if (MaxCSFrameIndex >= FI && FI >= MinCSFrameIndex) continue; - if (MFI.isDeadObjectIndex(I)) + + if (MFI.getStackID(FI) != TargetStackID::ScalableVector && + MFI.getStackID(FI) != TargetStackID::ScalablePredicateVector) continue; - ObjectsToAllocate.push_back(I); + ObjectsToAllocate.push_back(FI); } // Allocate all SVE locals and spills - for (unsigned FI : ObjectsToAllocate) { - Align Alignment = MFI.getObjectAlign(FI); - // FIXME: Given that the length of SVE vectors is not necessarily a power of - // two, we'd need to align every object dynamically at runtime if the - // alignment is larger than 16. This is not yet supported. - if (Alignment > Align(16)) - report_fatal_error( - "Alignment of scalable vectors > 16 bytes is not yet supported"); + for (unsigned FI : ObjectsToAllocate) + AllocateObject(FI); - Offset = alignTo(Offset + MFI.getObjectSize(FI), Alignment); - if (AssignOffsets) - Assign(FI, -Offset); - } + PPRStackTop = alignTo(PPRStackTop, Align(16U)); + ZPRStackTop = alignTo(ZPRStackTop, Align(16U)); - return Offset; -} + if (AssignOffsets == AssignObjectOffsets::Yes) + AFI->setStackSizeSVE(SVEStack.ZPRStackSize, SVEStack.PPRStackSize); -int64_t AArch64FrameLowering::estimateSVEStackObjectOffsets( - MachineFrameInfo &MFI) const { - int MinCSFrameIndex, MaxCSFrameIndex; - return determineSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex, false); -} - -int64_t AArch64FrameLowering::assignSVEStackObjectOffsets( - MachineFrameInfo &MFI, int &MinCSFrameIndex, int &MaxCSFrameIndex) const { - return determineSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex, - true); + return SVEStack; } /// Attempts to scavenge a register from \p ScavengeableRegs given the used @@ -3070,12 +3109,7 @@ void AArch64FrameLowering::processFunctionBeforeFrameFinalized( assert(getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown && "Upwards growing stack unsupported"); - int MinCSFrameIndex, MaxCSFrameIndex; - int64_t SVEStackSize = - assignSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex); - - AFI->setStackSizeSVE(alignTo(SVEStackSize, 16U)); - AFI->setMinMaxSVECSFrameIndex(MinCSFrameIndex, MaxCSFrameIndex); + (void)determineSVEStackSizes(MF, AssignObjectOffsets::Yes); // If this function isn't doing Win64-style C++ EH, we don't need to do // anything. @@ -3597,7 +3631,7 @@ StackOffset AArch64FrameLowering::getFrameIndexReferencePreferSP( // Go to common code if we cannot provide sp + offset. 
if (MFI.hasVarSizedObjects() || - MF.getInfo<AArch64FunctionInfo>()->getStackSizeSVE() || + MF.getInfo<AArch64FunctionInfo>()->hasSVEStackSize() || MF.getSubtarget().getRegisterInfo()->hasStackRealignment(MF)) return getFrameIndexReference(MF, FI, FrameReg); @@ -3721,8 +3755,7 @@ void AArch64FrameLowering::orderFrameObjects( if (AFI.hasStackHazardSlotIndex()) { std::optional<int> FI = getLdStFrameID(MI, MFI); if (FI && *FI >= 0 && *FI < (int)FrameObjects.size()) { - if (MFI.getStackID(*FI) == TargetStackID::ScalableVector || - AArch64InstrInfo::isFpOrNEON(MI)) + if (MFI.isScalableStackID(*FI) || AArch64InstrInfo::isFpOrNEON(MI)) FrameObjects[*FI].Accesses |= FrameObject::AccessFPR; else FrameObjects[*FI].Accesses |= FrameObject::AccessGPR; @@ -4080,7 +4113,7 @@ void AArch64FrameLowering::emitRemarks( } unsigned RegTy = StackAccess::AccessType::GPR; - if (MFI.getStackID(FrameIdx) == TargetStackID::ScalableVector) { + if (MFI.isScalableStackID(FrameIdx)) { // SPILL_PPR_TO_ZPR_SLOT_PSEUDO and FILL_PPR_FROM_ZPR_SLOT_PSEUDO // spill/fill the predicate as a data vector (so are an FPR access). if (MI.getOpcode() != AArch64::SPILL_PPR_TO_ZPR_SLOT_PSEUDO && diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h index 7bba053..38aa28b1 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h @@ -24,6 +24,11 @@ class AArch64FunctionInfo; class AArch64PrologueEmitter; class AArch64EpilogueEmitter; +struct SVEStackSizes { + uint64_t ZPRStackSize{0}; + uint64_t PPRStackSize{0}; +}; + class AArch64FrameLowering : public TargetFrameLowering { public: explicit AArch64FrameLowering() @@ -124,6 +129,7 @@ public: return false; case TargetStackID::Default: case TargetStackID::ScalableVector: + case TargetStackID::ScalablePredicateVector: case TargetStackID::NoAlloc: return true; } @@ -132,7 +138,8 @@ public: bool isStackIdSafeForLocalArea(unsigned StackId) const override { // We don't support putting SVE objects into the pre-allocated local // frame block at the moment. - return StackId != TargetStackID::ScalableVector; + return (StackId != TargetStackID::ScalableVector && + StackId != TargetStackID::ScalablePredicateVector); } void @@ -145,7 +152,16 @@ public: bool requiresSaveVG(const MachineFunction &MF) const; - StackOffset getSVEStackSize(const MachineFunction &MF) const; + /// Returns the size of the entire ZPR stackframe (calleesaves + spills). + StackOffset getZPRStackSize(const MachineFunction &MF) const; + + /// Returns the size of the entire PPR stackframe (calleesaves + spills). + StackOffset getPPRStackSize(const MachineFunction &MF) const; + + /// Returns the size of the entire SVE stackframe (PPRs + ZPRs). + StackOffset getSVEStackSize(const MachineFunction &MF) const { + return getZPRStackSize(MF) + getPPRStackSize(MF); + } friend class AArch64PrologueEpilogueCommon; friend class AArch64PrologueEmitter; @@ -165,10 +181,6 @@ private: /// Returns true if CSRs should be paired. bool producePairRegisters(MachineFunction &MF) const; - int64_t estimateSVEStackObjectOffsets(MachineFrameInfo &MF) const; - int64_t assignSVEStackObjectOffsets(MachineFrameInfo &MF, - int &MinCSFrameIndex, - int &MaxCSFrameIndex) const; /// Make a determination whether a Hazard slot is used and create it if /// needed. 
void determineStackHazardSlot(MachineFunction &MF, diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index 6a1b06e..35bbb0c0 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -2089,7 +2089,8 @@ void AArch64DAGToDAGISel::SelectMultiVectorLutiLane(SDNode *Node, if (!ImmToReg<AArch64::ZT0, 0>(Node->getOperand(2), ZtValue)) return; - SDValue Ops[] = {ZtValue, Node->getOperand(3), Node->getOperand(4)}; + SDValue Chain = Node->getOperand(0); + SDValue Ops[] = {ZtValue, Node->getOperand(3), Node->getOperand(4), Chain}; SDLoc DL(Node); EVT VT = Node->getValueType(0); @@ -2110,14 +2111,15 @@ void AArch64DAGToDAGISel::SelectMultiVectorLutiLane(SDNode *Node, void AArch64DAGToDAGISel::SelectMultiVectorLuti(SDNode *Node, unsigned NumOutVecs, unsigned Opc) { - SDValue ZtValue; - SmallVector<SDValue, 4> Ops; if (!ImmToReg<AArch64::ZT0, 0>(Node->getOperand(2), ZtValue)) return; - Ops.push_back(ZtValue); - Ops.push_back(createZMulTuple({Node->getOperand(3), Node->getOperand(4)})); + SDValue Chain = Node->getOperand(0); + SDValue Ops[] = {ZtValue, + createZMulTuple({Node->getOperand(3), Node->getOperand(4)}), + Chain}; + SDLoc DL(Node); EVT VT = Node->getValueType(0); @@ -7495,7 +7497,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeIndexedSVE(SDNode *Root, SDValue N, int FI = cast<FrameIndexSDNode>(N)->getIndex(); // We can only encode VL scaled offsets, so only fold in frame indexes // referencing SVE objects. - if (MFI.getStackID(FI) == TargetStackID::ScalableVector) { + if (MFI.isScalableStackID(FI)) { Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL)); OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i64); return true; @@ -7541,7 +7543,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeIndexedSVE(SDNode *Root, SDValue N, int FI = cast<FrameIndexSDNode>(Base)->getIndex(); // We can only encode VL scaled offsets, so only fold in frame indexes // referencing SVE objects. 
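The repeated isScalableStackID() checks in this patch replace direct comparisons against TargetStackID::ScalableVector. A minimal sketch of the intended predicate, with stand-in enumerators rather than the real TargetStackID values:

enum class StackID { Default, ScalableVector, ScalablePredicateVector };

// A scalable slot is now either an SVE data-vector slot or the new
// predicate-vector slot; both are addressed with VL-scaled offsets.
bool isScalableStackID(StackID ID) {
  return ID == StackID::ScalableVector ||
         ID == StackID::ScalablePredicateVector;
}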
- if (MFI.getStackID(FI) == TargetStackID::ScalableVector) + if (MFI.isScalableStackID(FI)) Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL)); } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 45f5235..c2a482a 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1537,6 +1537,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::FP_TO_UINT, VT, Custom); setOperationAction(ISD::FP_TO_SINT, VT, Custom); setOperationAction(ISD::MLOAD, VT, Custom); + setOperationAction(ISD::MSTORE, VT, Legal); setOperationAction(ISD::MUL, VT, Custom); setOperationAction(ISD::MULHS, VT, Custom); setOperationAction(ISD::MULHU, VT, Custom); @@ -6617,7 +6618,6 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, "llvm.eh.recoverfp must take a function as the first argument"); return IncomingFPOp; } - case Intrinsic::aarch64_neon_vsri: case Intrinsic::aarch64_neon_vsli: case Intrinsic::aarch64_sve_sri: @@ -9256,8 +9256,7 @@ void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, (MI.getOpcode() == AArch64::ADDXri || MI.getOpcode() == AArch64::SUBXri)) { const MachineOperand &MO = MI.getOperand(1); - if (MO.isFI() && MF.getFrameInfo().getStackID(MO.getIndex()) == - TargetStackID::ScalableVector) + if (MO.isFI() && MF.getFrameInfo().isScalableStackID(MO.getIndex())) MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false, /*IsImplicit=*/true)); } @@ -9704,8 +9703,12 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty); MachineFrameInfo &MFI = MF.getFrameInfo(); int FI = MFI.CreateStackObject(StoreSize, Alignment, false); - if (isScalable) - MFI.setStackID(FI, TargetStackID::ScalableVector); + if (isScalable) { + bool IsPred = VA.getValVT() == MVT::aarch64svcount || + VA.getValVT().getVectorElementType() == MVT::i1; + MFI.setStackID(FI, IsPred ? TargetStackID::ScalablePredicateVector + : TargetStackID::ScalableVector); + } MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, FI); SDValue Ptr = DAG.getFrameIndex( @@ -15154,9 +15157,7 @@ static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) { : Shift.getOperand(1); unsigned Inst = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI; - SDValue ResultSLI = DAG.getNode(Inst, DL, VT, X, Y, Imm); - - return ResultSLI; + return DAG.getNode(Inst, DL, VT, X, Y, Imm); } static SDValue tryLowerToBSL(SDValue N, SelectionDAG &DAG) { @@ -27234,6 +27235,21 @@ static bool isLanes1toNKnownZero(SDValue Op) { } } +// Return true if the vector operation can guarantee that the first lane of its +// result is active. 
+static bool isLane0KnownActive(SDValue Op) { + switch (Op.getOpcode()) { + default: + return false; + case AArch64ISD::REINTERPRET_CAST: + return isLane0KnownActive(Op->getOperand(0)); + case ISD::SPLAT_VECTOR: + return isOneConstant(Op.getOperand(0)); + case AArch64ISD::PTRUE: + return Op.getConstantOperandVal(0) == AArch64SVEPredPattern::all; + }; +} + static SDValue removeRedundantInsertVectorElt(SDNode *N) { assert(N->getOpcode() == ISD::INSERT_VECTOR_ELT && "Unexpected node!"); SDValue InsertVec = N->getOperand(0); @@ -27519,6 +27535,32 @@ static SDValue performMULLCombine(SDNode *N, return SDValue(); } +static SDValue performPTestFirstCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG) { + if (DCI.isBeforeLegalize()) + return SDValue(); + + SDLoc DL(N); + auto Mask = N->getOperand(0); + auto Pred = N->getOperand(1); + + if (!isLane0KnownActive(Mask)) + return SDValue(); + + if (Pred->getOpcode() == AArch64ISD::REINTERPRET_CAST) + Pred = Pred->getOperand(0); + + if (Pred->getOpcode() == ISD::CONCAT_VECTORS) { + Pred = Pred->getOperand(0); + Pred = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Pred); + return DAG.getNode(AArch64ISD::PTEST_FIRST, DL, N->getValueType(0), Mask, + Pred); + } + + return SDValue(); +} + static SDValue performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { @@ -27875,6 +27917,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, case AArch64ISD::UMULL: case AArch64ISD::PMULL: return performMULLCombine(N, DCI, DAG); + case AArch64ISD::PTEST_FIRST: + return performPTestFirstCombine(N, DCI, DAG); case ISD::INTRINSIC_VOID: case ISD::INTRINSIC_W_CHAIN: switch (N->getConstantOperandVal(1)) { @@ -29564,7 +29608,7 @@ void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const { // than doing it here in finalizeLowering. 
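As a rough standalone model of the isLane0KnownActive / PTEST_FIRST reasoning added above (the node kinds are stand-ins for the SelectionDAG opcodes, not real DAG types):

enum class Kind { ReinterpretCast, SplatVector, PTrueAll, PTruePartial, Other };

struct Node {
  Kind K;
  const Node *Operand = nullptr; // only meaningful for ReinterpretCast
  bool SplatIsOne = false;       // only meaningful for SplatVector
};

bool isLane0KnownActive(const Node &N) {
  switch (N.K) {
  case Kind::ReinterpretCast:
    // Reinterpreting a predicate does not change which lanes are set.
    return N.Operand && isLane0KnownActive(*N.Operand);
  case Kind::SplatVector:
    // splat(1) makes every lane, including lane 0, active.
    return N.SplatIsOne;
  case Kind::PTrueAll:
    // ptrue with the "all" pattern activates every lane.
    return true;
  default:
    return false;
  }
}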
if (MFI.hasStackProtectorIndex()) { for (unsigned int i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) { - if (MFI.getStackID(i) == TargetStackID::ScalableVector && + if (MFI.isScalableStackID(i) && MFI.getObjectSSPLayout(i) != MachineFrameInfo::SSPLK_None) { MFI.setStackID(MFI.getStackProtectorIndex(), TargetStackID::ScalableVector); diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td index f07d351..6ef0a95 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -10176,28 +10176,6 @@ multiclass SIMDScalarLShiftBHSD<bit U, bits<5> opc, string asm, (!cast<Instruction>(NAME # "d") FPR64:$Rn, vecshiftL64:$imm)>; } -multiclass SIMDScalarRShiftBHSD<bit U, bits<5> opc, string asm> { - def b : BaseSIMDScalarShift<U, opc, {0,0,0,1,?,?,?}, - FPR8, FPR8, vecshiftR8, asm, []> { - let Inst{18-16} = imm{2-0}; - } - - def h : BaseSIMDScalarShift<U, opc, {0,0,1,?,?,?,?}, - FPR16, FPR16, vecshiftR16, asm, []> { - let Inst{19-16} = imm{3-0}; - } - - def s : BaseSIMDScalarShift<U, opc, {0,1,?,?,?,?,?}, - FPR32, FPR32, vecshiftR32, asm, []> { - let Inst{20-16} = imm{4-0}; - } - - def d : BaseSIMDScalarShift<U, opc, {1,?,?,?,?,?,?}, - FPR64, FPR64, vecshiftR64, asm, []> { - let Inst{21-16} = imm{5-0}; - } -} - //---------------------------------------------------------------------------- // AdvSIMD vector x indexed element //---------------------------------------------------------------------------- diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 5a51c81..5a90da1 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -1503,6 +1503,13 @@ AArch64InstrInfo::canRemovePTestInstr(MachineInstr *PTest, MachineInstr *Mask, getElementSizeForOpcode(PredOpcode)) return PredOpcode; + // For PTEST_FIRST(PTRUE_ALL, WHILE), the PTEST_FIRST is redundant since + // WHILEcc performs an implicit PTEST with an all active mask, setting + // the N flag as the PTEST_FIRST would. 
+ if (PTest->getOpcode() == AArch64::PTEST_PP_FIRST && + isPTrueOpcode(MaskOpcode) && Mask->getOperand(1).getImm() == 31) + return PredOpcode; + return {}; } @@ -5592,7 +5599,7 @@ void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, assert(Subtarget.isSVEorStreamingSVEAvailable() && "Unexpected register store without SVE store instructions"); Opc = AArch64::STR_PXI; - StackID = TargetStackID::ScalableVector; + StackID = TargetStackID::ScalablePredicateVector; } break; } @@ -5607,7 +5614,7 @@ void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, Opc = AArch64::STRSui; else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) { Opc = AArch64::STR_PPXI; - StackID = TargetStackID::ScalableVector; + StackID = TargetStackID::ScalablePredicateVector; } break; case 8: @@ -5777,7 +5784,7 @@ void AArch64InstrInfo::loadRegFromStackSlot( if (IsPNR) PNRReg = DestReg; Opc = AArch64::LDR_PXI; - StackID = TargetStackID::ScalableVector; + StackID = TargetStackID::ScalablePredicateVector; } break; } @@ -5792,7 +5799,7 @@ void AArch64InstrInfo::loadRegFromStackSlot( Opc = AArch64::LDRSui; else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) { Opc = AArch64::LDR_PPXI; - StackID = TargetStackID::ScalableVector; + StackID = TargetStackID::ScalablePredicateVector; } break; case 8: diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp index a81f5b3..b3c9656 100644 --- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp @@ -23,12 +23,21 @@ using namespace llvm; +static std::optional<uint64_t> +getSVEStackSize(const AArch64FunctionInfo &MFI, + uint64_t (AArch64FunctionInfo::*GetStackSize)() const) { + if (!MFI.hasCalculatedStackSizeSVE()) + return std::nullopt; + return (MFI.*GetStackSize)(); +} + yaml::AArch64FunctionInfo::AArch64FunctionInfo( const llvm::AArch64FunctionInfo &MFI) : HasRedZone(MFI.hasRedZone()), - StackSizeSVE(MFI.hasCalculatedStackSizeSVE() - ? std::optional<uint64_t>(MFI.getStackSizeSVE()) - : std::nullopt), + StackSizeZPR( + getSVEStackSize(MFI, &llvm::AArch64FunctionInfo::getStackSizeZPR)), + StackSizePPR( + getSVEStackSize(MFI, &llvm::AArch64FunctionInfo::getStackSizePPR)), HasStackFrame(MFI.hasStackFrame() ? std::optional<bool>(MFI.hasStackFrame()) : std::nullopt) {} @@ -41,8 +50,9 @@ void AArch64FunctionInfo::initializeBaseYamlFields( const yaml::AArch64FunctionInfo &YamlMFI) { if (YamlMFI.HasRedZone) HasRedZone = YamlMFI.HasRedZone; - if (YamlMFI.StackSizeSVE) - setStackSizeSVE(*YamlMFI.StackSizeSVE); + if (YamlMFI.StackSizeZPR || YamlMFI.StackSizePPR) + setStackSizeSVE(YamlMFI.StackSizeZPR.value_or(0), + YamlMFI.StackSizePPR.value_or(0)); if (YamlMFI.HasStackFrame) setHasStackFrame(*YamlMFI.HasStackFrame); } diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h index 897c7e8..4a79d9c 100644 --- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h +++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h @@ -74,13 +74,10 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { /// Amount of stack frame size, not including callee-saved registers. uint64_t LocalStackSize = 0; - /// The start and end frame indices for the SVE callee saves. - int MinSVECSFrameIndex = 0; - int MaxSVECSFrameIndex = 0; - /// Amount of stack frame size used for saving callee-saved registers. 
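The AArch64MachineFunctionInfo changes below cache separate ZPR and PPR stack sizes. A simplified model of that state (not the real class), preserving the invariant that the getters are only valid once the sizes have been calculated:

#include <cassert>
#include <cstdint>

class SVEFrameSizes {
  uint64_t StackSizeZPR = 0;
  uint64_t StackSizePPR = 0;
  bool Calculated = false;

public:
  void set(uint64_t ZPR, uint64_t PPR) {
    StackSizeZPR = ZPR;
    StackSizePPR = PPR;
    Calculated = true;
  }
  uint64_t zpr() const { assert(Calculated); return StackSizeZPR; }
  uint64_t ppr() const { assert(Calculated); return StackSizePPR; }
  bool hasSVEStackSize() const { return zpr() > 0 || ppr() > 0; }
};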
unsigned CalleeSavedStackSize = 0; - unsigned SVECalleeSavedStackSize = 0; + unsigned ZPRCalleeSavedStackSize = 0; + unsigned PPRCalleeSavedStackSize = 0; bool HasCalleeSavedStackSize = false; bool HasSVECalleeSavedStackSize = false; @@ -137,9 +134,10 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { /// SVE stack size (for predicates and data vectors) are maintained here /// rather than in FrameInfo, as the placement and Stack IDs are target /// specific. - uint64_t StackSizeSVE = 0; + uint64_t StackSizeZPR = 0; + uint64_t StackSizePPR = 0; - /// HasCalculatedStackSizeSVE indicates whether StackSizeSVE is valid. + /// HasCalculatedStackSizeSVE indicates whether StackSizeZPR/PPR is valid. bool HasCalculatedStackSizeSVE = false; /// Has a value when it is known whether or not the function uses a @@ -312,16 +310,25 @@ public: TailCallReservedStack = bytes; } - bool hasCalculatedStackSizeSVE() const { return HasCalculatedStackSizeSVE; } - - void setStackSizeSVE(uint64_t S) { + void setStackSizeSVE(uint64_t ZPR, uint64_t PPR) { + StackSizeZPR = ZPR; + StackSizePPR = PPR; HasCalculatedStackSizeSVE = true; - StackSizeSVE = S; } - uint64_t getStackSizeSVE() const { + uint64_t getStackSizeZPR() const { assert(hasCalculatedStackSizeSVE()); - return StackSizeSVE; + return StackSizeZPR; + } + uint64_t getStackSizePPR() const { + assert(hasCalculatedStackSizeSVE()); + return StackSizePPR; + } + + bool hasCalculatedStackSizeSVE() const { return HasCalculatedStackSizeSVE; } + + bool hasSVEStackSize() const { + return getStackSizeZPR() > 0 || getStackSizePPR() > 0; } bool hasStackFrame() const { return HasStackFrame; } @@ -414,23 +421,25 @@ public: } // Saves the CalleeSavedStackSize for SVE vectors in 'scalable bytes' - void setSVECalleeSavedStackSize(unsigned Size) { - SVECalleeSavedStackSize = Size; + void setSVECalleeSavedStackSize(unsigned ZPR, unsigned PPR) { + ZPRCalleeSavedStackSize = ZPR; + PPRCalleeSavedStackSize = PPR; HasSVECalleeSavedStackSize = true; } - unsigned getSVECalleeSavedStackSize() const { + unsigned getZPRCalleeSavedStackSize() const { assert(HasSVECalleeSavedStackSize && - "SVECalleeSavedStackSize has not been calculated"); - return SVECalleeSavedStackSize; + "ZPRCalleeSavedStackSize has not been calculated"); + return ZPRCalleeSavedStackSize; } - - void setMinMaxSVECSFrameIndex(int Min, int Max) { - MinSVECSFrameIndex = Min; - MaxSVECSFrameIndex = Max; + unsigned getPPRCalleeSavedStackSize() const { + assert(HasSVECalleeSavedStackSize && + "PPRCalleeSavedStackSize has not been calculated"); + return PPRCalleeSavedStackSize; } - int getMinSVECSFrameIndex() const { return MinSVECSFrameIndex; } - int getMaxSVECSFrameIndex() const { return MaxSVECSFrameIndex; } + unsigned getSVECalleeSavedStackSize() const { + return getZPRCalleeSavedStackSize() + getPPRCalleeSavedStackSize(); + } void incNumLocalDynamicTLSAccesses() { ++NumLocalDynamicTLSAccesses; } unsigned getNumLocalDynamicTLSAccesses() const { @@ -611,7 +620,8 @@ private: namespace yaml { struct AArch64FunctionInfo final : public yaml::MachineFunctionInfo { std::optional<bool> HasRedZone; - std::optional<uint64_t> StackSizeSVE; + std::optional<uint64_t> StackSizeZPR; + std::optional<uint64_t> StackSizePPR; std::optional<bool> HasStackFrame; AArch64FunctionInfo() = default; @@ -624,7 +634,8 @@ struct AArch64FunctionInfo final : public yaml::MachineFunctionInfo { template <> struct MappingTraits<AArch64FunctionInfo> { static void mapping(IO &YamlIO, AArch64FunctionInfo &MFI) { YamlIO.mapOptional("hasRedZone", 
MFI.HasRedZone); - YamlIO.mapOptional("stackSizeSVE", MFI.StackSizeSVE); + YamlIO.mapOptional("stackSizeZPR", MFI.StackSizeZPR); + YamlIO.mapOptional("stackSizePPR", MFI.StackSizePPR); YamlIO.mapOptional("hasStackFrame", MFI.HasStackFrame); } }; diff --git a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp index 09b3643..aad6579 100644 --- a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp +++ b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp @@ -48,21 +48,19 @@ bool AArch64PrologueEpilogueCommon::isVGInstruction( return Opc == TargetOpcode::COPY; } -// Convenience function to determine whether I is an SVE callee save. -static bool isSVECalleeSave(MachineBasicBlock::iterator I) { +// Convenience function to determine whether I is part of the ZPR callee saves. +static bool isPartOfZPRCalleeSaves(MachineBasicBlock::iterator I) { switch (I->getOpcode()) { default: return false; - case AArch64::PTRUE_C_B: case AArch64::LD1B_2Z_IMM: case AArch64::ST1B_2Z_IMM: case AArch64::STR_ZXI: - case AArch64::STR_PXI: case AArch64::LDR_ZXI: - case AArch64::LDR_PXI: - case AArch64::PTRUE_B: case AArch64::CPY_ZPzI_B: case AArch64::CMPNE_PPzZI_B: + case AArch64::PTRUE_C_B: + case AArch64::PTRUE_B: return I->getFlag(MachineInstr::FrameSetup) || I->getFlag(MachineInstr::FrameDestroy); case AArch64::SEH_SavePReg: @@ -71,6 +69,23 @@ static bool isSVECalleeSave(MachineBasicBlock::iterator I) { } } +// Convenience function to determine whether I is part of the PPR callee saves. +static bool isPartOfPPRCalleeSaves(MachineBasicBlock::iterator I) { + switch (I->getOpcode()) { + default: + return false; + case AArch64::STR_PXI: + case AArch64::LDR_PXI: + return I->getFlag(MachineInstr::FrameSetup) || + I->getFlag(MachineInstr::FrameDestroy); + } +} + +// Convenience function to determine whether I is part of the SVE callee saves. +static bool isPartOfSVECalleeSaves(MachineBasicBlock::iterator I) { + return isPartOfZPRCalleeSaves(I) || isPartOfPPRCalleeSaves(I); +} + AArch64PrologueEpilogueCommon::AArch64PrologueEpilogueCommon( MachineFunction &MF, MachineBasicBlock &MBB, const AArch64FrameLowering &AFL) @@ -316,7 +331,7 @@ bool AArch64PrologueEpilogueCommon::shouldCombineCSRLocalStackBump( // When there is an SVE area on the stack, always allocate the // callee-saves and spills/locals separately. - if (AFL.getSVEStackSize(MF)) + if (AFI->hasSVEStackSize()) return false; return true; @@ -639,7 +654,7 @@ void AArch64PrologueEmitter::emitPrologue() { // Now allocate space for the GPR callee saves. MachineBasicBlock::iterator MBBI = PrologueBeginI; - while (MBBI != EndI && isSVECalleeSave(MBBI)) + while (MBBI != EndI && isPartOfSVECalleeSaves(MBBI)) ++MBBI; FirstGPRSaveI = convertCalleeSaveRestoreToSPPrePostIncDec( MBBI, DL, -AFI->getCalleeSavedStackSize(), EmitAsyncCFI); @@ -669,7 +684,7 @@ void AArch64PrologueEmitter::emitPrologue() { MachineBasicBlock::iterator AfterGPRSavesI = FirstGPRSaveI; while (AfterGPRSavesI != EndI && AfterGPRSavesI->getFlag(MachineInstr::FrameSetup) && - !isSVECalleeSave(AfterGPRSavesI)) { + !isPartOfSVECalleeSaves(AfterGPRSavesI)) { if (CombineSPBump && // Only fix-up frame-setup load/store instructions. 
(!AFL.requiresSaveVG(MF) || !isVGInstruction(AfterGPRSavesI, TLI))) @@ -700,38 +715,66 @@ void AArch64PrologueEmitter::emitPrologue() { if (AFL.windowsRequiresStackProbe(MF, NumBytes + RealignmentPadding)) emitWindowsStackProbe(AfterGPRSavesI, DL, NumBytes, RealignmentPadding); - StackOffset SVEStackSize = AFL.getSVEStackSize(MF); - StackOffset SVECalleeSavesSize = {}, SVELocalsSize = SVEStackSize; MachineBasicBlock::iterator CalleeSavesEnd = AfterGPRSavesI; + StackOffset PPRCalleeSavesSize = + StackOffset::getScalable(AFI->getPPRCalleeSavedStackSize()); + StackOffset ZPRCalleeSavesSize = + StackOffset::getScalable(AFI->getZPRCalleeSavedStackSize()); + StackOffset SVECalleeSavesSize = PPRCalleeSavesSize + ZPRCalleeSavesSize; + StackOffset PPRLocalsSize = AFL.getPPRStackSize(MF) - PPRCalleeSavesSize; + StackOffset ZPRLocalsSize = AFL.getZPRStackSize(MF) - ZPRCalleeSavesSize; + StackOffset CFAOffset = StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes); - - // Process the SVE callee-saves to determine what space needs to be - // allocated. MachineBasicBlock::iterator AfterSVESavesI = AfterGPRSavesI; - if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) { - LLVM_DEBUG(dbgs() << "SVECalleeSavedStackSize = " << CalleeSavedSize - << "\n"); - SVECalleeSavesSize = StackOffset::getScalable(CalleeSavedSize); - SVELocalsSize = SVEStackSize - SVECalleeSavesSize; - // Find callee save instructions in frame. - // Note: With FPAfterSVECalleeSaves the callee saves have already been + + if (!FPAfterSVECalleeSaves) { + MachineBasicBlock::iterator ZPRCalleeSavesBegin = AfterGPRSavesI, + ZPRCalleeSavesEnd = AfterGPRSavesI; + MachineBasicBlock::iterator PPRCalleeSavesBegin = AfterGPRSavesI, + PPRCalleeSavesEnd = AfterGPRSavesI; + + // Process the SVE callee-saves to determine what space needs to be // allocated. - if (!FPAfterSVECalleeSaves) { - MachineBasicBlock::iterator CalleeSavesBegin = AfterGPRSavesI; - assert(isSVECalleeSave(CalleeSavesBegin) && "Unexpected instruction"); - while (isSVECalleeSave(AfterSVESavesI) && + + if (PPRCalleeSavesSize) { + LLVM_DEBUG(dbgs() << "PPRCalleeSavedStackSize = " + << PPRCalleeSavesSize.getScalable() << "\n"); + + PPRCalleeSavesBegin = AfterSVESavesI; + assert(isPartOfPPRCalleeSaves(PPRCalleeSavesBegin) && + "Unexpected instruction"); + while (isPartOfPPRCalleeSaves(AfterSVESavesI) && AfterSVESavesI != MBB.getFirstTerminator()) ++AfterSVESavesI; - CalleeSavesEnd = AfterSVESavesI; + PPRCalleeSavesEnd = AfterSVESavesI; + } - StackOffset LocalsSize = SVELocalsSize + StackOffset::getFixed(NumBytes); - // Allocate space for the callee saves (if any). - allocateStackSpace(CalleeSavesBegin, 0, SVECalleeSavesSize, - EmitAsyncCFI && !HasFP, CFAOffset, - MFI.hasVarSizedObjects() || LocalsSize); + if (ZPRCalleeSavesSize) { + LLVM_DEBUG(dbgs() << "ZPRCalleeSavedStackSize = " + << ZPRCalleeSavesSize.getScalable() << "\n"); + ZPRCalleeSavesBegin = AfterSVESavesI; + assert(isPartOfZPRCalleeSaves(ZPRCalleeSavesBegin) && + "Unexpected instruction"); + while (isPartOfZPRCalleeSaves(AfterSVESavesI) && + AfterSVESavesI != MBB.getFirstTerminator()) + ++AfterSVESavesI; + ZPRCalleeSavesEnd = AfterSVESavesI; } + + // Allocate space for the callee saves (if any). + StackOffset LocalsSize = + PPRLocalsSize + ZPRLocalsSize + StackOffset::getFixed(NumBytes); + MachineBasicBlock::iterator CalleeSavesBegin = + AFI->getPPRCalleeSavedStackSize() ? 
PPRCalleeSavesBegin + : ZPRCalleeSavesBegin; + allocateStackSpace(CalleeSavesBegin, 0, SVECalleeSavesSize, + EmitAsyncCFI && !HasFP, CFAOffset, + MFI.hasVarSizedObjects() || LocalsSize); + + CalleeSavesEnd = AFI->getZPRCalleeSavedStackSize() ? ZPRCalleeSavesEnd + : PPRCalleeSavesEnd; } CFAOffset += SVECalleeSavesSize; @@ -746,6 +789,7 @@ void AArch64PrologueEmitter::emitPrologue() { // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have // the correct value here, as NumBytes also includes padding bytes, // which shouldn't be counted here. + StackOffset SVELocalsSize = PPRLocalsSize + ZPRLocalsSize; allocateStackSpace(CalleeSavesEnd, RealignmentPadding, SVELocalsSize + StackOffset::getFixed(NumBytes), EmitAsyncCFI && !HasFP, CFAOffset, @@ -796,7 +840,8 @@ void AArch64PrologueEmitter::emitPrologue() { emitDefineCFAWithFP(AfterSVESavesI, FixedObject); } else { StackOffset TotalSize = - SVEStackSize + StackOffset::getFixed((int64_t)MFI.getStackSize()); + AFL.getSVEStackSize(MF) + + StackOffset::getFixed((int64_t)MFI.getStackSize()); CFIInstBuilder CFIBuilder(MBB, AfterSVESavesI, MachineInstr::FrameSetup); CFIBuilder.insertCFIInst( createDefCFA(RegInfo, /*FrameReg=*/AArch64::SP, /*Reg=*/AArch64::SP, @@ -1165,7 +1210,7 @@ void AArch64PrologueEmitter::emitCalleeSavedGPRLocations( CFIInstBuilder CFIBuilder(MBB, MBBI, MachineInstr::FrameSetup); for (const auto &Info : CSI) { unsigned FrameIdx = Info.getFrameIdx(); - if (MFI.getStackID(FrameIdx) == TargetStackID::ScalableVector) + if (MFI.isScalableStackID(FrameIdx)) continue; assert(!Info.isSpilledToReg() && "Spilling to registers not implemented"); @@ -1192,7 +1237,7 @@ void AArch64PrologueEmitter::emitCalleeSavedSVELocations( } for (const auto &Info : CSI) { - if (MFI.getStackID(Info.getFrameIdx()) != TargetStackID::ScalableVector) + if (!MFI.isScalableStackID(Info.getFrameIdx())) continue; // Not all unwinders may know about SVE registers, so assume the lowest @@ -1322,7 +1367,7 @@ void AArch64EpilogueEmitter::emitEpilogue() { while (FirstGPRRestoreI != Begin) { --FirstGPRRestoreI; if (!FirstGPRRestoreI->getFlag(MachineInstr::FrameDestroy) || - (!FPAfterSVECalleeSaves && isSVECalleeSave(FirstGPRRestoreI))) { + (!FPAfterSVECalleeSaves && isPartOfSVECalleeSaves(FirstGPRRestoreI))) { ++FirstGPRRestoreI; break; } else if (CombineSPBump) @@ -1346,7 +1391,7 @@ void AArch64EpilogueEmitter::emitEpilogue() { if (HasFP && AFI->hasSwiftAsyncContext()) emitSwiftAsyncContextFramePointer(EpilogueEndI, DL); - const StackOffset &SVEStackSize = AFL.getSVEStackSize(MF); + StackOffset SVEStackSize = AFL.getSVEStackSize(MF); // If there is a single SP update, insert it before the ret and we're done. 
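A rough sketch (plain integers standing in for scalable offsets, not the patch itself) of how the prologue sizes above decompose the SVE area into a callee-save chunk and a locals chunk:

#include <cstdint>

struct PrologueSVESizes {
  uint64_t PPRCalleeSaves, ZPRCalleeSaves; // callee-save portions
  uint64_t PPRTotal, ZPRTotal;             // whole PPR/ZPR stack areas
};

struct SVEAllocation {
  uint64_t CalleeSaves; // allocated next to the GPR callee saves
  uint64_t Locals;      // allocated afterwards, together with NumBytes
};

SVEAllocation splitSVEArea(const PrologueSVESizes &S) {
  SVEAllocation A;
  A.CalleeSaves = S.PPRCalleeSaves + S.ZPRCalleeSaves;
  A.Locals = (S.PPRTotal - S.PPRCalleeSaves) + (S.ZPRTotal - S.ZPRCalleeSaves);
  return A;
}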
if (CombineSPBump) { @@ -1372,20 +1417,25 @@ void AArch64EpilogueEmitter::emitEpilogue() { StackOffset DeallocateBefore = {}, DeallocateAfter = SVEStackSize; MachineBasicBlock::iterator RestoreBegin = FirstGPRRestoreI, RestoreEnd = FirstGPRRestoreI; - if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) { + int64_t ZPRCalleeSavedSize = AFI->getZPRCalleeSavedStackSize(); + int64_t PPRCalleeSavedSize = AFI->getPPRCalleeSavedStackSize(); + int64_t SVECalleeSavedSize = ZPRCalleeSavedSize + PPRCalleeSavedSize; + + if (SVECalleeSavedSize) { if (FPAfterSVECalleeSaves) RestoreEnd = MBB.getFirstTerminator(); RestoreBegin = std::prev(RestoreEnd); while (RestoreBegin != MBB.begin() && - isSVECalleeSave(std::prev(RestoreBegin))) + isPartOfSVECalleeSaves(std::prev(RestoreBegin))) --RestoreBegin; - assert(isSVECalleeSave(RestoreBegin) && - isSVECalleeSave(std::prev(RestoreEnd)) && "Unexpected instruction"); + assert(isPartOfSVECalleeSaves(RestoreBegin) && + isPartOfSVECalleeSaves(std::prev(RestoreEnd)) && + "Unexpected instruction"); StackOffset CalleeSavedSizeAsOffset = - StackOffset::getScalable(CalleeSavedSize); + StackOffset::getScalable(SVECalleeSavedSize); DeallocateBefore = SVEStackSize - CalleeSavedSizeAsOffset; DeallocateAfter = CalleeSavedSizeAsOffset; } @@ -1624,8 +1674,7 @@ void AArch64EpilogueEmitter::emitCalleeSavedRestores( CFIInstBuilder CFIBuilder(MBB, MBBI, MachineInstr::FrameDestroy); for (const auto &Info : CSI) { - if (SVE != - (MFI.getStackID(Info.getFrameIdx()) == TargetStackID::ScalableVector)) + if (SVE != MFI.isScalableStackID(Info.getFrameIdx())) continue; MCRegister Reg = Info.getReg(); diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp index 2b0c8ad..3f43b70 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -643,7 +643,7 @@ bool AArch64RegisterInfo::hasBasePointer(const MachineFunction &MF) const { if (ST.hasSVE() || ST.isStreaming()) { // Frames that have variable sized objects and scalable SVE objects, // should always use a basepointer. - if (!AFI->hasCalculatedStackSizeSVE() || AFI->getStackSizeSVE()) + if (!AFI->hasCalculatedStackSizeSVE() || AFI->hasSVEStackSize()) return true; } @@ -783,7 +783,7 @@ AArch64RegisterInfo::useFPForScavengingIndex(const MachineFunction &MF) const { assert((!MF.getSubtarget<AArch64Subtarget>().hasSVE() || AFI->hasCalculatedStackSizeSVE()) && "Expected SVE area to be calculated by this point"); - return TFI.hasFP(MF) && !hasStackRealignment(MF) && !AFI->getStackSizeSVE() && + return TFI.hasFP(MF) && !hasStackRealignment(MF) && !AFI->hasSVEStackSize() && !AFI->hasStackHazardSlotIndex(); } diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index 7ee54c5..c197550e 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -438,7 +438,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) getActionDefinitionsBuilder({G_FCOS, G_FSIN, G_FPOW, G_FLOG, G_FLOG2, G_FLOG10, G_FTAN, G_FEXP, G_FEXP2, G_FEXP10, G_FACOS, G_FASIN, G_FATAN, G_FATAN2, G_FCOSH, - G_FSINH, G_FTANH}) + G_FSINH, G_FTANH, G_FMODF}) // We need a call for these, so we always need to scalarize. .scalarize(0) // Regardless of FP16 support, widen 16-bit elements to 32-bits. 
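The AArch64 legalizer change above adds G_FMODF to the set of ops that are scalarized and lowered like the other libm-style calls. For reference, a small standalone example of the modf semantics it ultimately maps to (standard C++ only, nothing AArch64-specific):

#include <cmath>
#include <cstdio>

int main() {
  double Integral = 0.0;
  double Fractional = std::modf(3.75, &Integral);
  // Prints: integral = 3, fractional = 0.75
  std::printf("integral = %g, fractional = %g\n", Integral, Fractional);
  return 0;
}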
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 92a587b..280fbe2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -1384,6 +1384,11 @@ void AMDGPUPassConfig::addCodeGenPrepare() { if (TM->getTargetTriple().isAMDGCN() && EnableLowerKernelArguments) addPass(createAMDGPULowerKernelArgumentsPass()); + TargetPassConfig::addCodeGenPrepare(); + + if (isPassEnabled(EnableLoadStoreVectorizer)) + addPass(createLoadStoreVectorizerPass()); + if (TM->getTargetTriple().isAMDGCN()) { // This lowering has been placed after codegenprepare to take advantage of // address mode matching (which is why it isn't put with the LDS lowerings). @@ -1392,15 +1397,6 @@ void AMDGPUPassConfig::addCodeGenPrepare() { // but has been put before switch lowering and CFG flattening so that those // passes can run on the more optimized control flow this pass creates in // many cases. - // - // FIXME: This should ideally be put after the LoadStoreVectorizer. - // However, due to some annoying facts about ResourceUsageAnalysis, - // (especially as exercised in the resource-usage-dead-function test), - // we need all the function passes codegenprepare all the way through - // said resource usage analysis to run on the call graph produced - // before codegenprepare runs (because codegenprepare will knock some - // nodes out of the graph, which leads to function-level passes not - // being run on them, which causes crashes in the resource usage analysis). addPass(createAMDGPULowerBufferFatPointersPass()); addPass(createAMDGPULowerIntrinsicsLegacyPass()); // In accordance with the above FIXME, manually force all the @@ -1408,11 +1404,6 @@ void AMDGPUPassConfig::addCodeGenPrepare() { addPass(new DummyCGSCCPass()); } - TargetPassConfig::addCodeGenPrepare(); - - if (isPassEnabled(EnableLoadStoreVectorizer)) - addPass(createLoadStoreVectorizerPass()); - // LowerSwitch pass may introduce unreachable blocks that can // cause unexpected behavior for subsequent passes. Placing it // here seems better that these blocks would get cleaned up by @@ -2125,6 +2116,11 @@ void AMDGPUCodeGenPassBuilder::addCodeGenPrepare(AddIRPass &addPass) const { if (EnableLowerKernelArguments) addPass(AMDGPULowerKernelArgumentsPass(TM)); + Base::addCodeGenPrepare(addPass); + + if (isPassEnabled(EnableLoadStoreVectorizer)) + addPass(LoadStoreVectorizerPass()); + // This lowering has been placed after codegenprepare to take advantage of // address mode matching (which is why it isn't put with the LDS lowerings). // It could be placed anywhere before uniformity annotations (an analysis @@ -2132,25 +2128,11 @@ void AMDGPUCodeGenPassBuilder::addCodeGenPrepare(AddIRPass &addPass) const { // but has been put before switch lowering and CFG flattening so that those // passes can run on the more optimized control flow this pass creates in // many cases. - // - // FIXME: This should ideally be put after the LoadStoreVectorizer. - // However, due to some annoying facts about ResourceUsageAnalysis, - // (especially as exercised in the resource-usage-dead-function test), - // we need all the function passes codegenprepare all the way through - // said resource usage analysis to run on the call graph produced - // before codegenprepare runs (because codegenprepare will knock some - // nodes out of the graph, which leads to function-level passes not - // being run on them, which causes crashes in the resource usage analysis). 
addPass(AMDGPULowerBufferFatPointersPass(TM)); addPass.requireCGSCCOrder(); addPass(AMDGPULowerIntrinsicsPass(TM)); - Base::addCodeGenPrepare(addPass); - - if (isPassEnabled(EnableLoadStoreVectorizer)) - addPass(LoadStoreVectorizerPass()); - // LowerSwitch pass may introduce unreachable blocks that can cause unexpected // behavior for subsequent passes. Placing it here seems better that these // blocks would get cleaned up by UnreachableBlockElim inserted next in the diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index 7c5d4fc..e4b3528 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -924,6 +924,7 @@ bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const { case TargetStackID::SGPRSpill: return true; case TargetStackID::ScalableVector: + case TargetStackID::ScalablePredicateVector: case TargetStackID::WasmLocal: return false; } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 1653008..f7265c5 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -64,14 +64,6 @@ static cl::opt<bool> UseDivergentRegisterIndexing( cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false)); -// TODO: This option should be removed once we switch to always using PTRADD in -// the SelectionDAG. -static cl::opt<bool> UseSelectionDAGPTRADD( - "amdgpu-use-sdag-ptradd", cl::Hidden, - cl::desc("Generate ISD::PTRADD nodes for 64-bit pointer arithmetic in the " - "SelectionDAG ISel"), - cl::init(false)); - static bool denormalModeIsFlushAllF32(const MachineFunction &MF) { const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign(); @@ -11466,7 +11458,7 @@ static bool isNoUnsignedWrap(SDValue Addr) { bool SITargetLowering::shouldPreservePtrArith(const Function &F, EVT PtrVT) const { - return UseSelectionDAGPTRADD && PtrVT == MVT::i64; + return PtrVT == MVT::i64; } bool SITargetLowering::canTransformPtrArithOutOfBounds(const Function &F, diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp index 45d194e..939841a 100644 --- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp +++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp @@ -2804,6 +2804,7 @@ bool HexagonInstrInfo::isValidOffset(unsigned Opcode, int Offset, case Hexagon::V6_vL32b_nt_cur_npred_ai: case Hexagon::V6_vL32b_nt_tmp_pred_ai: case Hexagon::V6_vL32b_nt_tmp_npred_ai: + case Hexagon::V6_vS32Ub_npred_ai: case Hexagon::V6_vgathermh_pseudo: case Hexagon::V6_vgathermw_pseudo: case Hexagon::V6_vgathermhw_pseudo: diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp index 06ce917..7d4535a 100644 --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp @@ -2395,6 +2395,7 @@ bool RISCVFrameLowering::isSupportedStackID(TargetStackID::Value ID) const { case TargetStackID::NoAlloc: case TargetStackID::SGPRSpill: case TargetStackID::WasmLocal: + case TargetStackID::ScalablePredicateVector: return false; } llvm_unreachable("Invalid TargetStackID::Value"); diff --git a/llvm/lib/Target/RISCV/RISCVGISel.td b/llvm/lib/Target/RISCV/RISCVGISel.td index 2e5f30f..19d5aff 100644 --- a/llvm/lib/Target/RISCV/RISCVGISel.td +++ b/llvm/lib/Target/RISCV/RISCVGISel.td @@ -109,8 +109,9 @@ def : LdPat<extloadi8, 
LBU, i16>; // Prefer unsigned due to no c.lb in Zcb. def : StPat<truncstorei8, SB, GPR, i16>; let Predicates = [HasAtomicLdSt] in { - def : LdPat<atomic_load_aext_8, LB, i16>; - def : LdPat<atomic_load_nonext_16, LH, i16>; + // Prefer unsigned due to no c.lb in Zcb. + def : LdPat<atomic_load_aext_8, LBU, i16>; + def : LdPat<atomic_load_nonext_16, LH, i16>; def : StPat<atomic_store_8, SB, GPR, i16>; def : StPat<atomic_store_16, SH, GPR, i16>; diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp index 90e1c47a..6a6ead2 100644 --- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp +++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp @@ -70,6 +70,10 @@ static unsigned getSEWOpNum(const MachineInstr &MI) { return RISCVII::getSEWOpNum(MI.getDesc()); } +static unsigned getVecPolicyOpNum(const MachineInstr &MI) { + return RISCVII::getVecPolicyOpNum(MI.getDesc()); +} + /// Get the EEW for a load or store instruction. Return std::nullopt if MI is /// not a load or store which ignores SEW. static std::optional<unsigned> getEEWForLoadStore(const MachineInstr &MI) { @@ -986,7 +990,7 @@ RISCVInsertVSETVLI::computeInfoForInstr(const MachineInstr &MI) const { // If there is a policy operand, use it. if (RISCVII::hasVecPolicyOp(TSFlags)) { - const MachineOperand &Op = MI.getOperand(MI.getNumExplicitOperands() - 1); + const MachineOperand &Op = MI.getOperand(getVecPolicyOpNum(MI)); uint64_t Policy = Op.getImm(); assert(Policy <= (RISCVVType::TAIL_AGNOSTIC | RISCVVType::MASK_AGNOSTIC) && diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoA.td b/llvm/lib/Target/RISCV/RISCVInstrInfoA.td index 59f5aeb..99992d1 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoA.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoA.td @@ -165,10 +165,11 @@ class seq_cst_store<PatFrag base> // any ordering. This is necessary because AtomicExpandPass has added fences to // atomic load/stores and changed them to unordered ones. let Predicates = [HasAtomicLdSt] in { - def : LdPat<relaxed_load<atomic_load_asext_8>, LB>; + // Use unsigned for aext due to no c.lb in Zcb. 
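The RISC-V hunks above prefer LBU for anyextending 8-bit atomic loads because Zcb has a compressed c.lbu but no c.lb. A small standalone illustration of why either extension is legal for an anyext load (only the low 8 bits are guaranteed):

#include <cassert>
#include <cstdint>

int64_t signExtend8(uint8_t Byte) { return static_cast<int8_t>(Byte); } // LB
int64_t zeroExtend8(uint8_t Byte) { return Byte; }                      // LBU

int main() {
  uint8_t Byte = 0xF0;
  // The two differ in the upper bits...
  assert(signExtend8(Byte) == -16 && zeroExtend8(Byte) == 240);
  // ...but agree on the low byte, which is all an anyext load needs.
  assert((signExtend8(Byte) & 0xFF) == (zeroExtend8(Byte) & 0xFF));
  return 0;
}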
+ def : LdPat<relaxed_load<atomic_load_sext_8>, LB>; + def : LdPat<relaxed_load<atomic_load_azext_8>, LBU>; def : LdPat<relaxed_load<atomic_load_asext_16>, LH>; - def : LdPat<relaxed_load<atomic_load_zext_8>, LBU>; - def : LdPat<relaxed_load<atomic_load_zext_16>, LHU>; + def : LdPat<relaxed_load<atomic_load_zext_16>, LHU>; def : StPat<relaxed_store<atomic_store_8>, SB, GPR, XLenVT>; def : StPat<relaxed_store<atomic_store_16>, SH, GPR, XLenVT>; diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index 273edf3..0afec42 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -752,6 +752,8 @@ bool SPIRVInstructionSelector::spvSelect(Register ResVReg, return selectExtInst(ResVReg, ResType, I, CL::exp, GL::Exp); case TargetOpcode::G_FEXP2: return selectExtInst(ResVReg, ResType, I, CL::exp2, GL::Exp2); + case TargetOpcode::G_FMODF: + return selectModf(ResVReg, ResType, I); case TargetOpcode::G_FLOG: return selectExtInst(ResVReg, ResType, I, CL::log, GL::Log); @@ -3453,9 +3455,6 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg, case Intrinsic::spv_discard: { return selectDiscard(ResVReg, ResType, I); } - case Intrinsic::modf: { - return selectModf(ResVReg, ResType, I); - } default: { std::string DiagMsg; raw_string_ostream OS(DiagMsg); @@ -4268,6 +4267,7 @@ bool SPIRVInstructionSelector::selectModf(Register ResVReg, PtrTyReg, LLT::pointer(storageClassToAddressSpace(SPIRV::StorageClass::Function), GR.getPointerSize())); + // Assign SPIR-V type of the pointer type of the alloca variable to the // new register. GR.assignSPIRVTypeToVReg(PtrType, PtrTyReg, MIRBuilder.getMF()); @@ -4280,10 +4280,7 @@ bool SPIRVInstructionSelector::selectModf(Register ResVReg, .addUse(GR.getSPIRVTypeID(PtrType)) .addImm(static_cast<uint32_t>(SPIRV::StorageClass::Function)); Register Variable = AllocaMIB->getOperand(0).getReg(); - // Modf must have 4 operands, the first two are the 2 parts of the result, - // the third is the operand, and the last one is the floating point value. - assert(I.getNumOperands() == 4 && - "Expected 4 operands for modf instruction"); + MachineBasicBlock &BB = *I.getParent(); // Create the OpenCLLIB::modf instruction. auto MIB = @@ -4293,8 +4290,8 @@ bool SPIRVInstructionSelector::selectModf(Register ResVReg, .addImm(static_cast<uint32_t>(SPIRV::InstructionSet::OpenCL_std)) .addImm(CL::modf) .setMIFlags(I.getFlags()) - .add(I.getOperand(3)) // Floating point value. - .addUse(Variable); // Pointer to integral part. + .add(I.getOperand(I.getNumExplicitDefs())) // Floating point value. + .addUse(Variable); // Pointer to integral part. // Assign the integral part stored in the ptr to the second element of the // result. Register IntegralPartReg = I.getOperand(1).getReg(); diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizeImplicitBinding.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizeImplicitBinding.cpp index aea3397..205895e 100644 --- a/llvm/lib/Target/SPIRV/SPIRVLegalizeImplicitBinding.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVLegalizeImplicitBinding.cpp @@ -39,6 +39,7 @@ private: void collectBindingInfo(Module &M); uint32_t getAndReserveFirstUnusedBinding(uint32_t DescSet); void replaceImplicitBindingCalls(Module &M); + void verifyUniqueOrderIdPerResource(SmallVectorImpl<CallInst *> &Calls); // A map from descriptor set to a bit vector of used binding numbers. 
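The SPIRVLegalizeImplicitBinding change (shown just below) verifies that implicit-binding calls with the same order ID agree on their descriptor set. A simplified standalone model of that check, assuming calls are already sorted by order ID so duplicates are adjacent:

#include <cstddef>
#include <cstdint>
#include <stdexcept>
#include <vector>

struct BindingCall {
  uint32_t OrderId; // arg 0 of the implicit-binding intrinsic
  uint32_t DescSet; // arg 1 of the implicit-binding intrinsic
};

void verifyUniqueOrderIdPerResource(const std::vector<BindingCall> &Calls) {
  for (std::size_t I = 1; I < Calls.size(); ++I)
    if (Calls[I - 1].OrderId == Calls[I].OrderId &&
        Calls[I - 1].DescSet != Calls[I].DescSet)
      throw std::runtime_error("implicit binding calls with the same order ID "
                               "must have the same descriptor set");
}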
std::vector<BitVector> UsedBindings; @@ -94,6 +95,33 @@ void SPIRVLegalizeImplicitBinding::collectBindingInfo(Module &M) { }); } +void SPIRVLegalizeImplicitBinding::verifyUniqueOrderIdPerResource( + SmallVectorImpl<CallInst *> &Calls) { + // Check that the order Id is unique per resource. + for (uint32_t i = 1; i < Calls.size(); ++i) { + const uint32_t OrderIdArgIdx = 0; + const uint32_t DescSetArgIdx = 1; + const uint32_t OrderA = + cast<ConstantInt>(Calls[i - 1]->getArgOperand(OrderIdArgIdx)) + ->getZExtValue(); + const uint32_t OrderB = + cast<ConstantInt>(Calls[i]->getArgOperand(OrderIdArgIdx)) + ->getZExtValue(); + if (OrderA == OrderB) { + const uint32_t DescSetA = + cast<ConstantInt>(Calls[i - 1]->getArgOperand(DescSetArgIdx)) + ->getZExtValue(); + const uint32_t DescSetB = + cast<ConstantInt>(Calls[i]->getArgOperand(DescSetArgIdx)) + ->getZExtValue(); + if (DescSetA != DescSetB) { + report_fatal_error("Implicit binding calls with the same order ID must " + "have the same descriptor set"); + } + } + } +} + uint32_t SPIRVLegalizeImplicitBinding::getAndReserveFirstUnusedBinding( uint32_t DescSet) { if (UsedBindings.size() <= DescSet) { @@ -112,11 +140,23 @@ uint32_t SPIRVLegalizeImplicitBinding::getAndReserveFirstUnusedBinding( } void SPIRVLegalizeImplicitBinding::replaceImplicitBindingCalls(Module &M) { + uint32_t lastOrderId = -1; + uint32_t lastBindingNumber = -1; + for (CallInst *OldCI : ImplicitBindingCalls) { IRBuilder<> Builder(OldCI); + const uint32_t OrderId = + cast<ConstantInt>(OldCI->getArgOperand(0))->getZExtValue(); const uint32_t DescSet = cast<ConstantInt>(OldCI->getArgOperand(1))->getZExtValue(); - const uint32_t NewBinding = getAndReserveFirstUnusedBinding(DescSet); + + // Reuse an existing binding for this order ID, if one was already assigned. + // Otherwise, assign a new binding. + const uint32_t NewBinding = (lastOrderId == OrderId) + ? 
lastBindingNumber + : getAndReserveFirstUnusedBinding(DescSet); + lastOrderId = OrderId; + lastBindingNumber = NewBinding; SmallVector<Value *, 8> Args; Args.push_back(Builder.getInt32(DescSet)); @@ -142,6 +182,7 @@ bool SPIRVLegalizeImplicitBinding::runOnModule(Module &M) { if (ImplicitBindingCalls.empty()) { return false; } + verifyUniqueOrderIdPerResource(ImplicitBindingCalls); replaceImplicitBindingCalls(M); return true; diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp index db85e33..53074ea 100644 --- a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp @@ -300,6 +300,7 @@ SPIRVLegalizerInfo::SPIRVLegalizerInfo(const SPIRVSubtarget &ST) { getActionDefinitionsBuilder({G_STRICT_FSQRT, G_FPOW, G_FEXP, + G_FMODF, G_FEXP2, G_FLOG, G_FLOG2, diff --git a/llvm/lib/Target/TargetMachine.cpp b/llvm/lib/Target/TargetMachine.cpp index ad7e503..cf85691 100644 --- a/llvm/lib/Target/TargetMachine.cpp +++ b/llvm/lib/Target/TargetMachine.cpp @@ -27,7 +27,7 @@ #include "llvm/Target/TargetLoweringObjectFile.h" using namespace llvm; -cl::opt<bool> NoKernelInfoEndLTO( +cl::opt<bool> llvm::NoKernelInfoEndLTO( "no-kernel-info-end-lto", cl::desc("remove the kernel-info pass at the end of the full LTO pipeline"), cl::init(false), cl::Hidden); diff --git a/llvm/lib/Target/VE/VEISelLowering.cpp b/llvm/lib/Target/VE/VEISelLowering.cpp index 2cfdc75..a068138 100644 --- a/llvm/lib/Target/VE/VEISelLowering.cpp +++ b/llvm/lib/Target/VE/VEISelLowering.cpp @@ -957,6 +957,8 @@ const char *VETargetLowering::getTargetNodeName(unsigned Opcode) const { EVT VETargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &, EVT VT) const { + if (VT.isVector()) + return VT.changeVectorElementType(MVT::i1); return MVT::i32; } diff --git a/llvm/lib/Target/X86/X86FixupSetCC.cpp b/llvm/lib/Target/X86/X86FixupSetCC.cpp index 2de89947..ea93a57 100644 --- a/llvm/lib/Target/X86/X86FixupSetCC.cpp +++ b/llvm/lib/Target/X86/X86FixupSetCC.cpp @@ -136,6 +136,12 @@ bool X86FixupSetCCPass::runOnMachineFunction(MachineFunction &MF) { .addReg(ZeroReg) .addReg(Reg0) .addImm(X86::sub_8bit); + + // Redirect the debug-instr-number to the setcc. + if (unsigned InstrNum = ZExt->peekDebugInstrNum()) + MF.makeDebugValueSubstitution({InstrNum, 0}, + {MI.getDebugInstrNum(), 0}); + ToErase.push_back(ZExt); } } diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 34854e4..cda5568 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -52388,16 +52388,41 @@ static SDValue combineAddOrSubToADCOrSBB(bool IsSub, const SDLoc &DL, EVT VT, // Do not flip "e > c", where "c" is a constant, because Cmp instruction // cannot take an immediate as its first operand. // - if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() && - EFLAGS.getValueType().isInteger() && - !isa<ConstantSDNode>(EFLAGS.getOperand(1))) { - SDValue NewSub = - DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(), - EFLAGS.getOperand(1), EFLAGS.getOperand(0)); - SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo()); + // If EFLAGS is from a CMP that compares the same operands as the earlier + // SUB producing X (i.e. CMP X, Y), we can directly use the carry flag with + // SBB/ADC without creating a flipped SUB. + if (EFLAGS.getOpcode() == X86ISD::CMP && + EFLAGS.getValueType().isInteger() && X == EFLAGS.getOperand(0)) { return DAG.getNode(IsSub ? 
X86ISD::SBB : X86ISD::ADC, DL, DAG.getVTList(VT, MVT::i32), X, - DAG.getConstant(0, DL, VT), NewEFLAGS); + DAG.getConstant(0, DL, VT), EFLAGS); + } + + if (EFLAGS.getOpcode() == X86ISD::SUB && + EFLAGS.getValueType().isInteger() && + !isa<ConstantSDNode>(EFLAGS.getOperand(1))) { + // Only create NewSub if we know one of the folds will succeed to avoid + // introducing a temporary node that may persist and affect one-use checks + // below. + if (EFLAGS.getNode()->hasOneUse()) { + SDValue NewSub = DAG.getNode( + X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(), + EFLAGS.getOperand(1), EFLAGS.getOperand(0)); + SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo()); + return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, + DAG.getVTList(VT, MVT::i32), X, + DAG.getConstant(0, DL, VT), NewEFLAGS); + } + + if (IsSub && X == EFLAGS.getValue(0)) { + SDValue NewSub = DAG.getNode( + X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(), + EFLAGS.getOperand(1), EFLAGS.getOperand(0)); + SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo()); + return DAG.getNode(X86ISD::SBB, DL, DAG.getVTList(VT, MVT::i32), + EFLAGS.getOperand(0), EFLAGS.getOperand(1), + NewEFLAGS); + } } } diff --git a/llvm/lib/Transforms/IPO/FunctionImport.cpp b/llvm/lib/Transforms/IPO/FunctionImport.cpp index 83aa7de..28ee444 100644 --- a/llvm/lib/Transforms/IPO/FunctionImport.cpp +++ b/llvm/lib/Transforms/IPO/FunctionImport.cpp @@ -72,6 +72,7 @@ STATISTIC(NumImportedModules, "Number of modules imported from"); STATISTIC(NumDeadSymbols, "Number of dead stripped symbols in index"); STATISTIC(NumLiveSymbols, "Number of live symbols in index"); +namespace llvm { cl::opt<bool> ForceImportAll("force-import-all", cl::init(false), cl::Hidden, cl::desc("Import functions with noinline attribute")); @@ -185,9 +186,8 @@ static cl::opt<bool> CtxprofMoveRootsToOwnModule( extern cl::list<GlobalValue::GUID> MoveSymbolGUID; -namespace llvm { extern cl::opt<bool> EnableMemProfContextDisambiguation; -} +} // end namespace llvm // Load lazily a module from \p FileName in \p Context. 
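For the X86ISelLowering change earlier in this hunk: on x86, CMP X, Y sets the carry flag to the unsigned borrow (X u< Y), and SBB X, 0 subtracts that borrow from X, so the CMP's flags can feed the SBB/ADC directly instead of rebuilding a flipped SUB. A tiny standalone check of that identity (plain C++, no DAG types):

#include <cassert>
#include <cstdint>

uint64_t subWithBorrowFromCmp(uint64_t X, uint64_t Y) {
  unsigned Borrow = X < Y ? 1 : 0; // carry flag after CMP X, Y
  return X - Borrow;               // SBB X, 0
}

int main() {
  assert(subWithBorrowFromCmp(5, 7) == 4); // borrow taken: 5 - 1
  assert(subWithBorrowFromCmp(7, 5) == 7); // no borrow:    7 - 0
  return 0;
}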
static std::unique_ptr<Module> loadFile(const std::string &FileName, diff --git a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp index 4f53738..150a2dc 100644 --- a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp +++ b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp @@ -28,10 +28,13 @@ using namespace llvm; STATISTIC(NumSpecsCreated, "Number of specializations created"); +namespace llvm { + static cl::opt<bool> ForceSpecialization( - "force-specialization", cl::init(false), cl::Hidden, cl::desc( - "Force function specialization for every call site with a constant " - "argument")); + "force-specialization", cl::init(false), cl::Hidden, + cl::desc( + "Force function specialization for every call site with a constant " + "argument")); static cl::opt<unsigned> MaxClones( "funcspec-max-clones", cl::init(3), cl::Hidden, cl::desc( @@ -91,6 +94,8 @@ static cl::opt<bool> SpecializeLiteralConstant( extern cl::opt<bool> ProfcheckDisableMetadataFixes; +} // end namespace llvm + bool InstCostVisitor::canEliminateSuccessor(BasicBlock *BB, BasicBlock *Succ) const { unsigned I = 0; diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp index 15f4d76..c4f1b68 100644 --- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp +++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp @@ -214,11 +214,12 @@ static cl::opt<bool> MemProfRequireDefinitionForPromotion( "memprof-require-definition-for-promotion", cl::init(false), cl::Hidden, cl::desc( "Require target function definition when promoting indirect calls")); -} // namespace llvm extern cl::opt<bool> MemProfReportHintedSizes; extern cl::opt<unsigned> MinClonedColdBytePercent; +} // namespace llvm + namespace { /// CRTP base for graphs built from either IR or ThinLTO summary index. /// diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp index 99b8b88..e39e311 100644 --- a/llvm/lib/Transforms/IPO/SampleProfile.cpp +++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp @@ -116,6 +116,8 @@ STATISTIC( NumCSInlinedHitGrowthLimit, "Number of functions with FDO inline stopped due to growth size limit"); +namespace llvm { + // Command line option to specify the file to read samples from. This is // mainly used for debugging. 
static cl::opt<std::string> SampleProfileFile( @@ -198,7 +200,6 @@ static cl::opt<bool> DisableSampleLoaderInlining( "pass, and merge (or scale) profiles (as configured by " "--sample-profile-merge-inlinee).")); -namespace llvm { cl::opt<bool> SortProfiledSCC("sort-profiled-scc-member", cl::init(true), cl::Hidden, cl::desc("Sort profiled recursion by edge weights.")); diff --git a/llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp b/llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp index 093a39e..70b8614 100644 --- a/llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp +++ b/llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp @@ -23,6 +23,8 @@ using namespace sampleprof; #define DEBUG_TYPE "sample-profile-matcher" +namespace llvm { + static cl::opt<unsigned> FuncProfileSimilarityThreshold( "func-profile-similarity-threshold", cl::Hidden, cl::init(80), cl::desc("Consider a profile matches a function if the similarity of their " @@ -55,6 +57,8 @@ static cl::opt<unsigned> SalvageStaleProfileMaxCallsites( cl::desc("The maximum number of callsites in a function, above which stale " "profile matching will be skipped.")); +} // end namespace llvm + void SampleProfileMatcher::findIRAnchors(const Function &F, AnchorMap &IRAnchors) const { // For inlined code, recover the original callsite and callee by finding the diff --git a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp index 09bffa7..ac41fdd 100644 --- a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp +++ b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp @@ -120,6 +120,8 @@ STATISTIC(NumVirtConstProp1Bit, "Number of 1 bit virtual constant propagations"); STATISTIC(NumVirtConstProp, "Number of virtual constant propagations"); +namespace llvm { + static cl::opt<PassSummaryAction> ClSummaryAction( "wholeprogramdevirt-summary-action", cl::desc("What to do with the summary when running this pass"), @@ -175,6 +177,8 @@ static cl::list<std::string> extern cl::opt<bool> ProfcheckDisableMetadataFixes; +} // end namespace llvm + /// With Clang, a pure virtual class's deleting destructor is emitted as a /// `llvm.trap` intrinsic followed by an unreachable IR instruction. In the /// context of whole program devirtualization, the deleting destructor of a pure diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index b6b3a95..87000a1 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -2934,32 +2934,6 @@ static Instruction *foldSelectWithSRem(SelectInst &SI, InstCombinerImpl &IC, return nullptr; } -static Value *foldSelectWithFrozenICmp(SelectInst &Sel, InstCombiner::BuilderTy &Builder) { - FreezeInst *FI = dyn_cast<FreezeInst>(Sel.getCondition()); - if (!FI) - return nullptr; - - Value *Cond = FI->getOperand(0); - Value *TrueVal = Sel.getTrueValue(), *FalseVal = Sel.getFalseValue(); - - // select (freeze(x == y)), x, y --> y - // select (freeze(x != y)), x, y --> x - // The freeze should be only used by this select. Otherwise, remaining uses of - // the freeze can observe a contradictory value. - // c = freeze(x == y) ; Let's assume that y = poison & x = 42; c is 0 or 1 - // a = select c, x, y ; - // f(a, c) ; f(poison, 1) cannot happen, but if a is folded - // ; to y, this can happen. 
- CmpPredicate Pred; - if (FI->hasOneUse() && - match(Cond, m_c_ICmp(Pred, m_Specific(TrueVal), m_Specific(FalseVal))) && - (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE)) { - return Pred == ICmpInst::ICMP_EQ ? FalseVal : TrueVal; - } - - return nullptr; -} - /// Given that \p CondVal is known to be \p CondIsTrue, try to simplify \p SI. static Value *simplifyNestedSelectsUsingImpliedCond(SelectInst &SI, Value *CondVal, @@ -4446,9 +4420,6 @@ Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) { if (Instruction *PN = foldSelectToPhi(SI, DT, Builder)) return replaceInstUsesWith(SI, PN); - if (Value *Fr = foldSelectWithFrozenICmp(SI, Builder)) - return replaceInstUsesWith(SI, Fr); - if (Value *V = foldRoundUpIntegerWithPow2Alignment(SI, Builder)) return replaceInstUsesWith(SI, V); diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 5d2d79e..917004c 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -132,9 +132,11 @@ STATISTIC(NumReassoc , "Number of reassociations"); DEBUG_COUNTER(VisitCounter, "instcombine-visit", "Controls which instructions are visited"); -static cl::opt<bool> -EnableCodeSinking("instcombine-code-sinking", cl::desc("Enable code sinking"), - cl::init(true)); +namespace llvm { + +static cl::opt<bool> EnableCodeSinking("instcombine-code-sinking", + cl::desc("Enable code sinking"), + cl::init(true)); static cl::opt<unsigned> MaxSinkNumUsers( "instcombine-max-sink-users", cl::init(32), @@ -156,6 +158,8 @@ extern cl::opt<bool> ProfcheckDisableMetadataFixes; static cl::opt<unsigned> ShouldLowerDbgDeclare("instcombine-lower-dbg-declare", cl::Hidden, cl::init(true)); +} // end namespace llvm + std::optional<Instruction *> InstCombiner::targetInstCombineIntrinsic(IntrinsicInst &II) { // Handle target specific intrinsics diff --git a/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp b/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp index 0249f21..cf87e35 100644 --- a/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp +++ b/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp @@ -55,11 +55,11 @@ using namespace llvm; STATISTIC(NumOfPGOICallPromotion, "Number of indirect call promotions."); STATISTIC(NumOfPGOICallsites, "Number of indirect call candidate sites."); +namespace llvm { extern cl::opt<unsigned> MaxNumVTableAnnotations; -namespace llvm { extern cl::opt<bool> EnableVTableProfileUse; -} +} // namespace llvm // Command line option to disable indirect-call promotion with the default as // false. This is for debug purpose. diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index cf076b9a..eff6f0c 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -1923,20 +1923,17 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { /// /// Shadow = ParamTLS+ArgOffset. 
Value *getShadowPtrForArgument(IRBuilder<> &IRB, int ArgOffset) { - Value *Base = IRB.CreatePointerCast(MS.ParamTLS, MS.IntptrTy); - if (ArgOffset) - Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset)); - return IRB.CreateIntToPtr(Base, IRB.getPtrTy(0), "_msarg"); + return IRB.CreatePtrAdd(MS.ParamTLS, + ConstantInt::get(MS.IntptrTy, ArgOffset), "_msarg"); } /// Compute the origin address for a given function argument. Value *getOriginPtrForArgument(IRBuilder<> &IRB, int ArgOffset) { if (!MS.TrackOrigins) return nullptr; - Value *Base = IRB.CreatePointerCast(MS.ParamOriginTLS, MS.IntptrTy); - if (ArgOffset) - Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset)); - return IRB.CreateIntToPtr(Base, IRB.getPtrTy(0), "_msarg_o"); + return IRB.CreatePtrAdd(MS.ParamOriginTLS, + ConstantInt::get(MS.IntptrTy, ArgOffset), + "_msarg_o"); } /// Compute the shadow address for a retval. @@ -7219,9 +7216,8 @@ struct VarArgHelperBase : public VarArgHelper { /// Compute the shadow address for a given va_arg. Value *getShadowPtrForVAArgument(IRBuilder<> &IRB, unsigned ArgOffset) { - Value *Base = IRB.CreatePointerCast(MS.VAArgTLS, MS.IntptrTy); - Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset)); - return IRB.CreateIntToPtr(Base, MS.PtrTy, "_msarg_va_s"); + return IRB.CreatePtrAdd( + MS.VAArgTLS, ConstantInt::get(MS.IntptrTy, ArgOffset), "_msarg_va_s"); } /// Compute the shadow address for a given va_arg. @@ -7235,12 +7231,12 @@ struct VarArgHelperBase : public VarArgHelper { /// Compute the origin address for a given va_arg. Value *getOriginPtrForVAArgument(IRBuilder<> &IRB, int ArgOffset) { - Value *Base = IRB.CreatePointerCast(MS.VAArgOriginTLS, MS.IntptrTy); // getOriginPtrForVAArgument() is always called after // getShadowPtrForVAArgument(), so __msan_va_arg_origin_tls can never // overflow. - Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset)); - return IRB.CreateIntToPtr(Base, MS.PtrTy, "_msarg_va_o"); + return IRB.CreatePtrAdd(MS.VAArgOriginTLS, + ConstantInt::get(MS.IntptrTy, ArgOffset), + "_msarg_va_o"); } void CleanUnusedTLS(IRBuilder<> &IRB, Value *ShadowBase, @@ -7467,10 +7463,8 @@ struct VarArgAMD64Helper : public VarArgHelperBase { NextNodeIRBuilder IRB(OrigInst); Value *VAListTag = OrigInst->getArgOperand(0); - Value *RegSaveAreaPtrPtr = IRB.CreateIntToPtr( - IRB.CreateAdd(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy), - ConstantInt::get(MS.IntptrTy, 16)), - MS.PtrTy); + Value *RegSaveAreaPtrPtr = + IRB.CreatePtrAdd(VAListTag, ConstantInt::get(MS.IntptrTy, 16)); Value *RegSaveAreaPtr = IRB.CreateLoad(MS.PtrTy, RegSaveAreaPtrPtr); Value *RegSaveAreaShadowPtr, *RegSaveAreaOriginPtr; const Align Alignment = Align(16); @@ -7482,10 +7476,8 @@ struct VarArgAMD64Helper : public VarArgHelperBase { if (MS.TrackOrigins) IRB.CreateMemCpy(RegSaveAreaOriginPtr, Alignment, VAArgTLSOriginCopy, Alignment, AMD64FpEndOffset); - Value *OverflowArgAreaPtrPtr = IRB.CreateIntToPtr( - IRB.CreateAdd(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy), - ConstantInt::get(MS.IntptrTy, 8)), - MS.PtrTy); + Value *OverflowArgAreaPtrPtr = + IRB.CreatePtrAdd(VAListTag, ConstantInt::get(MS.IntptrTy, 8)); Value *OverflowArgAreaPtr = IRB.CreateLoad(MS.PtrTy, OverflowArgAreaPtrPtr); Value *OverflowArgAreaShadowPtr, *OverflowArgAreaOriginPtr; @@ -7615,19 +7607,15 @@ struct VarArgAArch64Helper : public VarArgHelperBase { // Retrieve a va_list field of 'void*' size. 
Value *getVAField64(IRBuilder<> &IRB, Value *VAListTag, int offset) { - Value *SaveAreaPtrPtr = IRB.CreateIntToPtr( - IRB.CreateAdd(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy), - ConstantInt::get(MS.IntptrTy, offset)), - MS.PtrTy); + Value *SaveAreaPtrPtr = + IRB.CreatePtrAdd(VAListTag, ConstantInt::get(MS.IntptrTy, offset)); return IRB.CreateLoad(Type::getInt64Ty(*MS.C), SaveAreaPtrPtr); } // Retrieve a va_list field of 'int' size. Value *getVAField32(IRBuilder<> &IRB, Value *VAListTag, int offset) { - Value *SaveAreaPtr = IRB.CreateIntToPtr( - IRB.CreateAdd(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy), - ConstantInt::get(MS.IntptrTy, offset)), - MS.PtrTy); + Value *SaveAreaPtr = + IRB.CreatePtrAdd(VAListTag, ConstantInt::get(MS.IntptrTy, offset)); Value *SaveArea32 = IRB.CreateLoad(IRB.getInt32Ty(), SaveAreaPtr); return IRB.CreateSExt(SaveArea32, MS.IntptrTy); } diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index d9e850e..120c4f6 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -222,7 +222,6 @@ cl::opt<bool> NoPGOWarnMismatchComdatWeak( cl::desc("The option is used to turn on/off " "warnings about hash mismatch for comdat " "or weak functions.")); -} // namespace llvm // Command line option to enable/disable select instruction instrumentation. static cl::opt<bool> @@ -347,7 +346,6 @@ cl::list<std::string> CtxPGOSkipCallsiteInstrument( extern cl::opt<unsigned> MaxNumVTableAnnotations; -namespace llvm { // Command line option to turn on CFG dot dump after profile annotation. // Defined in Analysis/BlockFrequencyInfo.cpp: -pgo-view-counts extern cl::opt<PGOViewCountsType> PGOViewCounts; diff --git a/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp b/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp index 343bec3..a5f417a 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp @@ -54,6 +54,8 @@ using namespace llvm; STATISTIC(NumOfPGOMemOPOpt, "Number of memop intrinsics optimized."); STATISTIC(NumOfPGOMemOPAnnotate, "Number of memop intrinsics annotated."); +namespace llvm { + // The minimum call count to optimize memory intrinsic calls. 
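The MemorySanitizer hunks above all replace the same three-instruction addressing idiom with IRBuilder::CreatePtrAdd. A minimal standalone sketch of the two forms follows; the helper names and the injected TLSBase parameter are hypothetical, standing in for the pass's real TLS globals.

#include "llvm/IR/IRBuilder.h"

// Both helpers compute a pointer ArgOffset bytes past TLSBase. The first emits
//   %0 = ptrtoint ptr %TLSBase to i64
//   %1 = add i64 %0, ArgOffset
//   %2 = inttoptr i64 %1 to ptr
// while the second is a single  getelementptr i8, ptr %TLSBase, i64 ArgOffset.
llvm::Value *viaIntegers(llvm::IRBuilder<> &IRB, llvm::Value *TLSBase,
                         llvm::Type *IntptrTy, int ArgOffset) {
  llvm::Value *Base = IRB.CreatePtrToInt(TLSBase, IntptrTy);
  Base = IRB.CreateAdd(Base, llvm::ConstantInt::get(IntptrTy, ArgOffset));
  return IRB.CreateIntToPtr(Base, IRB.getPtrTy());
}

llvm::Value *viaPtrAdd(llvm::IRBuilder<> &IRB, llvm::Value *TLSBase,
                       llvm::Type *IntptrTy, int ArgOffset) {
  return IRB.CreatePtrAdd(TLSBase, llvm::ConstantInt::get(IntptrTy, ArgOffset));
}

With opaque pointers the two forms address the same byte; the ptradd form simply stays in pointer arithmetic, which is presumably the motivation here: fewer instructions and no round-trip through integers for later pointer-based analyses.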
static cl::opt<unsigned> MemOPCountThreshold("pgo-memop-count-threshold", cl::Hidden, cl::init(1000), @@ -93,6 +95,8 @@ static cl::opt<unsigned> MemOpMaxOptSize("memop-value-prof-max-opt-size", cl::Hidden, cl::init(128), cl::desc("Optimize the memop size <= this value")); +} // end namespace llvm + namespace { static const char *getMIName(const MemIntrinsic *MI) { diff --git a/llvm/lib/Transforms/Instrumentation/ValueProfilePlugins.inc b/llvm/lib/Transforms/Instrumentation/ValueProfilePlugins.inc index a3d4e53..0534fdd 100644 --- a/llvm/lib/Transforms/Instrumentation/ValueProfilePlugins.inc +++ b/llvm/lib/Transforms/Instrumentation/ValueProfilePlugins.inc @@ -21,7 +21,9 @@ using namespace llvm; using CandidateInfo = ValueProfileCollector::CandidateInfo; +namespace llvm { extern cl::opt<bool> MemOPOptMemcmpBcmp; +} // end namespace llvm ///--------------------------- MemIntrinsicPlugin ------------------------------ class MemIntrinsicPlugin : public InstVisitor<MemIntrinsicPlugin> { diff --git a/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp b/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp index 2025fbb..3c14036e 100644 --- a/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp +++ b/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp @@ -26,6 +26,8 @@ using namespace llvm; +namespace llvm { + static cl::opt<unsigned> JumpTableSizeThreshold("jump-table-to-switch-size-threshold", cl::Hidden, cl::desc("Only split jump tables with size less or " @@ -43,6 +45,8 @@ static cl::opt<unsigned> FunctionSizeThreshold( extern cl::opt<bool> ProfcheckDisableMetadataFixes; +} // end namespace llvm + #define DEBUG_TYPE "jump-table-to-switch" namespace { @@ -201,14 +205,12 @@ PreservedAnalyses JumpTableToSwitchPass::run(Function &F, PostDominatorTree *PDT = AM.getCachedResult<PostDominatorTreeAnalysis>(F); DomTreeUpdater DTU(DT, PDT, DomTreeUpdater::UpdateStrategy::Lazy); bool Changed = false; - InstrProfSymtab Symtab; - if (auto E = Symtab.create(*F.getParent())) - F.getContext().emitError( - "Could not create indirect call table, likely corrupted IR" + - toString(std::move(E))); - DenseMap<const Function *, GlobalValue::GUID> FToGuid; - for (const auto &[G, FPtr] : Symtab.getIDToNameMap()) - FToGuid.insert({FPtr, G}); + auto FuncToGuid = [&](const Function &Fct) { + if (Fct.getMetadata(AssignGUIDPass::GUIDMetadataName)) + return AssignGUIDPass::getGUID(Fct); + + return Function::getGUIDAssumingExternalLinkage(getIRPGOFuncName(F, InLTO)); + }; for (BasicBlock &BB : make_early_inc_range(F)) { BasicBlock *CurrentBB = &BB; @@ -230,12 +232,8 @@ PreservedAnalyses JumpTableToSwitchPass::run(Function &F, std::optional<JumpTableTy> JumpTable = parseJumpTable(GEP, PtrTy); if (!JumpTable) continue; - SplittedOutTail = expandToSwitch( - Call, *JumpTable, DTU, ORE, [&](const Function &Fct) { - if (Fct.getMetadata(AssignGUIDPass::GUIDMetadataName)) - return AssignGUIDPass::getGUID(Fct); - return FToGuid.lookup_or(&Fct, 0U); - }); + SplittedOutTail = + expandToSwitch(Call, *JumpTable, DTU, ORE, FuncToGuid); Changed = true; break; } diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp index bab1f2a..9655173 100644 --- a/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/llvm/lib/Transforms/Scalar/LICM.cpp @@ -116,6 +116,8 @@ STATISTIC(NumIntAssociationsHoisted, STATISTIC(NumBOAssociationsHoisted, "Number of invariant BinaryOp expressions " "reassociated and hoisted out of the loop"); +namespace llvm { + /// Memory promotion is enabled by default. 
static cl::opt<bool> DisablePromotion("disable-licm-promotion", cl::Hidden, cl::init(false), @@ -154,7 +156,7 @@ static cl::opt<unsigned> IntAssociationUpperLimit( // which may not be precise, since optimizeUses is capped. The result is // correct, but we may not get as "far up" as possible to get which access is // clobbering the one queried. -cl::opt<unsigned> llvm::SetLicmMssaOptCap( +cl::opt<unsigned> SetLicmMssaOptCap( "licm-mssa-optimization-cap", cl::init(100), cl::Hidden, cl::desc("Enable imprecision in LICM in pathological cases, in exchange " "for faster compile. Caps the MemorySSA clobbering calls.")); @@ -162,7 +164,7 @@ cl::opt<unsigned> llvm::SetLicmMssaOptCap( // Experimentally, memory promotion carries less importance than sinking and // hoisting. Limit when we do promotion when using MemorySSA, in order to save // compile time. -cl::opt<unsigned> llvm::SetLicmMssaNoAccForPromotionCap( +cl::opt<unsigned> SetLicmMssaNoAccForPromotionCap( "licm-mssa-max-acc-promotion", cl::init(250), cl::Hidden, cl::desc("[LICM & MemorySSA] When MSSA in LICM is disabled, this has no " "effect. When MSSA in LICM is enabled, then this is the maximum " @@ -171,6 +173,8 @@ cl::opt<unsigned> llvm::SetLicmMssaNoAccForPromotionCap( extern cl::opt<bool> ProfcheckDisableMetadataFixes; +} // end namespace llvm + static bool inSubLoop(BasicBlock *BB, Loop *CurLoop, LoopInfo *LI); static bool isNotUsedOrFoldableInLoop(const Instruction &I, const Loop *CurLoop, const LoopSafetyInfo *SafetyInfo, diff --git a/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp b/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp index 1a9e16b..d31154f 100644 --- a/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp +++ b/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp @@ -17,6 +17,8 @@ using namespace llvm; +namespace llvm { + /// Uses the "source_filename" instead of a Module hash ID for the suffix of /// promoted locals during LTO. NOTE: This requires that the source filename /// has a unique name / path to avoid name collisions. 
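Most of the mechanical hunks in this patch, including the FunctionImportUtils.cpp one here, wrap cl::opt definitions and extern declarations in namespace llvm. A minimal sketch of the linkage rule behind that pattern, using a hypothetical option name; the static options moved alongside them appear to be wrapped mainly for consistency.

#include "llvm/Support/CommandLine.h"

// Defining TU: the definition must live in namespace llvm (or be spelled
// llvm::ExampleSharedFlag) for the extern declaration below to bind to it.
namespace llvm {
cl::opt<bool> ExampleSharedFlag("example-shared-flag", cl::init(false),
                                cl::Hidden, cl::desc("Illustrative only"));
} // end namespace llvm

// Consuming TU: redeclares the same entity inside namespace llvm. Had the
// definition above been written at global scope, this declaration would name
// a different variable (llvm::ExampleSharedFlag vs. ::ExampleSharedFlag) and
// the consumer would fail to link.
namespace llvm {
extern cl::opt<bool> ExampleSharedFlag;
} // end namespace llvm

bool exampleUse() { return llvm::ExampleSharedFlag; }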
@@ -35,6 +37,8 @@ cl::list<GlobalValue::GUID> MoveSymbolGUID( "used with the name of contextual profiling roots."), cl::Hidden); +} // end namespace llvm + FunctionImportGlobalProcessing::FunctionImportGlobalProcessing( Module &M, const ModuleSummaryIndex &Index, SetVector<GlobalValue *> *GlobalsToImport, bool ClearDSOLocalOnDeclarations) diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 4d1f768..8bba634 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -95,7 +95,9 @@ using namespace PatternMatch; #define DEBUG_TYPE "simplifycfg" -cl::opt<bool> llvm::RequireAndPreserveDomTree( +namespace llvm { + +cl::opt<bool> RequireAndPreserveDomTree( "simplifycfg-require-and-preserve-domtree", cl::Hidden, cl::desc( @@ -205,6 +207,8 @@ static cl::opt<unsigned> MaxJumpThreadingLiveBlocks( extern cl::opt<bool> ProfcheckDisableMetadataFixes; +} // end namespace llvm + STATISTIC(NumBitMaps, "Number of switch instructions turned into bitmaps"); STATISTIC(NumLinearMaps, "Number of switch instructions turned into linear mapping"); diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index e5d6c81..7fa787b 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7954,6 +7954,13 @@ bool VPRecipeBuilder::getScaledReductions( auto CollectExtInfo = [this, &Exts, &ExtOpTypes, &ExtKinds](SmallVectorImpl<Value *> &Ops) -> bool { for (const auto &[I, OpI] : enumerate(Ops)) { + auto *CI = dyn_cast<ConstantInt>(OpI); + if (I > 0 && CI && + canConstantBeExtended(CI, ExtOpTypes[0], ExtKinds[0])) { + ExtOpTypes[I] = ExtOpTypes[0]; + ExtKinds[I] = ExtKinds[0]; + continue; + } Value *ExtOp; if (!match(OpI, m_ZExtOrSExt(m_Value(ExtOp)))) return false; diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 02eb637..07b191a 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -1753,6 +1753,16 @@ void LoopVectorizationPlanner::printPlans(raw_ostream &O) { } #endif +bool llvm::canConstantBeExtended(const ConstantInt *CI, Type *NarrowType, + TTI::PartialReductionExtendKind ExtKind) { + APInt TruncatedVal = CI->getValue().trunc(NarrowType->getScalarSizeInBits()); + unsigned WideSize = CI->getType()->getScalarSizeInBits(); + APInt ExtendedVal = ExtKind == TTI::PR_SignExtend + ? TruncatedVal.sext(WideSize) + : TruncatedVal.zext(WideSize); + return ExtendedVal == CI->getValue(); +} + TargetTransformInfo::OperandValueInfo VPCostContext::getOperandInfo(VPValue *V) const { if (!V->isLiveIn()) diff --git a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h index fe59774..fc1a09e 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h +++ b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h @@ -468,6 +468,10 @@ public: }; #endif +/// Check if a constant \p CI can be safely treated as having been extended +/// from a narrower type with the given extension kind. 
+bool canConstantBeExtended(const ConstantInt *CI, Type *NarrowType, + TTI::PartialReductionExtendKind ExtKind); } // end namespace llvm #endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_H diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 46909a5..67b9244 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -340,6 +340,14 @@ VPPartialReductionRecipe::computeCost(ElementCount VF, : Widen->getOperand(1)); ExtAType = GetExtendKind(ExtAR); ExtBType = GetExtendKind(ExtBR); + + if (!ExtBR && Widen->getOperand(1)->isLiveIn()) { + auto *CI = cast<ConstantInt>(Widen->getOperand(1)->getLiveInIRValue()); + if (canConstantBeExtended(CI, InputTypeA, ExtAType)) { + InputTypeB = InputTypeA; + ExtBType = ExtAType; + } + } }; if (isa<VPWidenCastRecipe>(OpR)) { diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index a73b083..acdb379 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -40,7 +40,7 @@ using namespace llvm; using namespace VPlanPatternMatch; -cl::opt<bool> EnableWideActiveLaneMask( +static cl::opt<bool> EnableWideActiveLaneMask( "enable-wide-lane-mask", cl::init(false), cl::Hidden, cl::desc("Enable use of wide get active lane mask instructions")); |
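The new canConstantBeExtended helper above (and its callers in getScaledReductions and VPPartialReductionRecipe::computeCost) is a round-trip check: truncate the constant to the narrow type, re-extend it with the requested kind, and require the original value back. A standalone sketch with illustrative values, not a test from the patch:

#include "llvm/ADT/APInt.h"
#include <cassert>

// Mirrors the helper's logic for a 32-bit constant and an 8-bit narrow type.
static bool fitsAsExtended(const llvm::APInt &V, unsigned NarrowBits,
                           bool SignExtend) {
  llvm::APInt Trunc = V.trunc(NarrowBits);
  llvm::APInt Ext =
      SignExtend ? Trunc.sext(V.getBitWidth()) : Trunc.zext(V.getBitWidth());
  return Ext == V;
}

int main() {
  llvm::APInt C200(32, 200);
  llvm::APInt CNeg56(32, -56, /*isSigned=*/true);
  assert(fitsAsExtended(C200, 8, /*SignExtend=*/false));  // 200 zext-able from i8
  assert(!fitsAsExtended(C200, 8, /*SignExtend=*/true));  // 0xC8 sexts to -56, not 200
  assert(fitsAsExtended(CNeg56, 8, /*SignExtend=*/true)); // -56 sext-able from i8
  return 0;
}

So a constant like 127 qualifies for either extension kind, while 200 only qualifies for zero-extend; that is what lets the scaled-reduction path keep a constant second operand instead of bailing out when there is no explicit ext instruction to match.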