62 files changed, 1096 insertions, 431 deletions
diff --git a/llvm/lib/Analysis/DXILResource.cpp b/llvm/lib/Analysis/DXILResource.cpp
index b78cc03e..f9bf092 100644
--- a/llvm/lib/Analysis/DXILResource.cpp
+++ b/llvm/lib/Analysis/DXILResource.cpp
@@ -281,6 +281,38 @@ static StructType *getOrCreateElementStruct(Type *ElemType, StringRef Name) {
   return StructType::create(ElemType, Name);
 }
 
+static Type *getTypeWithoutPadding(Type *Ty) {
+  // Recursively remove padding from structures.
+  if (auto *ST = dyn_cast<StructType>(Ty)) {
+    LLVMContext &Ctx = Ty->getContext();
+    SmallVector<Type *> ElementTypes;
+    ElementTypes.reserve(ST->getNumElements());
+    for (Type *ElTy : ST->elements()) {
+      if (isa<PaddingExtType>(ElTy))
+        continue;
+      ElementTypes.push_back(getTypeWithoutPadding(ElTy));
+    }
+
+    // Handle explicitly padded cbuffer arrays like { [ n x paddedty ], ty }
+    if (ElementTypes.size() == 2)
+      if (auto *AT = dyn_cast<ArrayType>(ElementTypes[0]))
+        if (ElementTypes[1] == AT->getElementType())
+          return ArrayType::get(ElementTypes[1], AT->getNumElements() + 1);
+
+    // If we only have a single element, don't wrap it in a struct.
+    if (ElementTypes.size() == 1)
+      return ElementTypes[0];
+
+    return StructType::get(Ctx, ElementTypes, /*IsPacked=*/false);
+  }
+  // Arrays just need to have their element type adjusted.
+  if (auto *AT = dyn_cast<ArrayType>(Ty))
+    return ArrayType::get(getTypeWithoutPadding(AT->getElementType()),
+                          AT->getNumElements());
+  // Anything else should be good as is.
+  return Ty;
+}
+
 StructType *ResourceTypeInfo::createElementStruct(StringRef CBufferName) {
   SmallString<64> TypeName;
 
@@ -334,14 +366,21 @@ StructType *ResourceTypeInfo::createElementStruct(StringRef CBufferName) {
   }
   case ResourceKind::CBuffer: {
     auto *RTy = cast<CBufferExtType>(HandleTy);
-    LayoutExtType *LayoutType = cast<LayoutExtType>(RTy->getResourceType());
-    StructType *Ty = cast<StructType>(LayoutType->getWrappedType());
     SmallString<64> Name = getResourceKindName(Kind);
     if (!CBufferName.empty()) {
       Name.append(".");
       Name.append(CBufferName);
     }
-    return StructType::create(Ty->elements(), Name);
+
+    // TODO: Remove this when we update the frontend to use explicit padding.
+    if (LayoutExtType *LayoutType =
+            dyn_cast<LayoutExtType>(RTy->getResourceType())) {
+      StructType *Ty = cast<StructType>(LayoutType->getWrappedType());
+      return StructType::create(Ty->elements(), Name);
+    }
+
+    return getOrCreateElementStruct(
+        getTypeWithoutPadding(RTy->getResourceType()), Name);
   }
   case ResourceKind::Sampler: {
     auto *RTy = cast<SamplerExtType>(HandleTy);
@@ -454,10 +493,10 @@ uint32_t ResourceTypeInfo::getCBufferSize(const DataLayout &DL) const {
 
   Type *ElTy = cast<CBufferExtType>(HandleTy)->getResourceType();
 
+  // TODO: Remove this when we update the frontend to use explicit padding.
   if (auto *LayoutTy = dyn_cast<LayoutExtType>(ElTy))
     return LayoutTy->getSize();
 
-  // TODO: What should we do with unannotated arrays?
   return DL.getTypeAllocSize(ElTy);
 }
 
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index a64b93d..442b9d1 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -4623,17 +4623,11 @@ const SCEV *ScalarEvolution::getNegativeSCEV(const SCEV *V,
 
 /// If Expr computes ~A, return A else return nullptr
 static const SCEV *MatchNotExpr(const SCEV *Expr) {
-  const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(Expr);
-  if (!Add || Add->getNumOperands() != 2 ||
-      !Add->getOperand(0)->isAllOnesValue())
-    return nullptr;
-
-  const SCEVMulExpr *AddRHS = dyn_cast<SCEVMulExpr>(Add->getOperand(1));
-  if (!AddRHS || AddRHS->getNumOperands() != 2 ||
-      !AddRHS->getOperand(0)->isAllOnesValue())
-    return nullptr;
-
-  return AddRHS->getOperand(1);
+  const SCEV *MulOp;
+  if (match(Expr, m_scev_Add(m_scev_AllOnes(),
+                             m_scev_Mul(m_scev_AllOnes(), m_SCEV(MulOp)))))
+    return MulOp;
+  return nullptr;
 }
 
 /// Return a SCEV corresponding to ~V = -1-V
@@ -12220,12 +12214,11 @@ ScalarEvolution::computeConstantDifference(const SCEV *More, const SCEV *Less) {
     // Try to match a common constant multiply.
     auto MatchConstMul =
         [](const SCEV *S) -> std::optional<std::pair<const SCEV *, APInt>> {
-      auto *M = dyn_cast<SCEVMulExpr>(S);
-      if (!M || M->getNumOperands() != 2 ||
-          !isa<SCEVConstant>(M->getOperand(0)))
-        return std::nullopt;
-      return {
-          {M->getOperand(1), cast<SCEVConstant>(M->getOperand(0))->getAPInt()}};
+      const APInt *C;
+      const SCEV *Op;
+      if (match(S, m_scev_Mul(m_scev_APInt(C), m_SCEV(Op))))
+        return {{Op, *C}};
+      return std::nullopt;
     };
     if (auto MatchedMore = MatchConstMul(More)) {
       if (auto MatchedLess = MatchConstMul(Less)) {
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizeMutations.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizeMutations.cpp
index 25c1db9..ded4df4 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizeMutations.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizeMutations.cpp
@@ -55,12 +55,10 @@ LegalizeMutation LegalizeMutations::changeElementCountTo(unsigned TypeIdx,
 }
 
 LegalizeMutation LegalizeMutations::changeElementCountTo(unsigned TypeIdx,
-                                                         LLT NewEltTy) {
+                                                         ElementCount EC) {
   return [=](const LegalityQuery &Query) {
     const LLT OldTy = Query.Types[TypeIdx];
-    ElementCount NewEltCount = NewEltTy.isVector() ? NewEltTy.getElementCount()
-                                                   : ElementCount::getFixed(1);
-    return std::make_pair(TypeIdx, OldTy.changeElementCount(NewEltCount));
+    return std::make_pair(TypeIdx, OldTy.changeElementCount(EC));
   };
 }
 
diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
index 055fdc6..ca82857 100644
--- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
@@ -818,8 +818,7 @@ bool llvm::isKnownNeverNaN(Register Val, const MachineRegisterInfo &MRI,
   if (!DefMI)
     return false;
 
-  const TargetMachine& TM = DefMI->getMF()->getTarget();
-  if (DefMI->getFlag(MachineInstr::FmNoNans) || TM.Options.NoNaNsFPMath)
+  if (DefMI->getFlag(MachineInstr::FmNoNans))
     return true;
 
   // If the value is a constant, we can obviously see if it is a NaN or not.
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 0f2b518..cb0038c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -3971,8 +3971,14 @@ void SelectionDAGBuilder::visitSIToFP(const User &I) {
 }
 
 void SelectionDAGBuilder::visitPtrToAddr(const User &I) {
-  // FIXME: this is not correct for pointers with addr width != pointer width
-  visitPtrToInt(I);
+  SDValue N = getValue(I.getOperand(0));
+  // By definition the type of the ptrtoaddr must be equal to the address type.
+  const auto &TLI = DAG.getTargetLoweringInfo();
+  EVT AddrVT = TLI.getValueType(DAG.getDataLayout(), I.getType());
+  // The address width must be smaller or equal to the pointer representation
+  // width, so we lower ptrtoaddr as a truncate (possibly folded to a no-op).
+  N = DAG.getNode(ISD::TRUNCATE, getCurSDLoc(), AddrVT, N);
+  setValue(&I, N);
 }
 
 void SelectionDAGBuilder::visitPtrToInt(const User &I) {
diff --git a/llvm/lib/ExecutionEngine/Orc/CMakeLists.txt b/llvm/lib/ExecutionEngine/Orc/CMakeLists.txt
index 0ffe3ae..f343925 100644
--- a/llvm/lib/ExecutionEngine/Orc/CMakeLists.txt
+++ b/llvm/lib/ExecutionEngine/Orc/CMakeLists.txt
@@ -56,6 +56,7 @@ add_llvm_component_library(LLVMOrcJIT
   SectCreate.cpp
   SelfExecutorProcessControl.cpp
   SimpleRemoteEPC.cpp
+  SimpleRemoteMemoryMapper.cpp
   Speculation.cpp
   SpeculateAnalyses.cpp
   ExecutorProcessControl.cpp
diff --git a/llvm/lib/ExecutionEngine/Orc/EPCGenericJITLinkMemoryManager.cpp b/llvm/lib/ExecutionEngine/Orc/EPCGenericJITLinkMemoryManager.cpp
index 50e6b25..0833af7 100644
--- a/llvm/lib/ExecutionEngine/Orc/EPCGenericJITLinkMemoryManager.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/EPCGenericJITLinkMemoryManager.cpp
@@ -57,16 +57,17 @@ public:
     std::swap(FR.Actions, G.allocActions());
 
     Parent.EPC.callSPSWrapperAsync<
-        rt::SPSSimpleExecutorMemoryManagerFinalizeSignature>(
-        Parent.SAs.Finalize,
+        rt::SPSSimpleExecutorMemoryManagerInitializeSignature>(
+        Parent.SAs.Initialize,
         [OnFinalize = std::move(OnFinalize), AllocAddr = this->AllocAddr](
-            Error SerializationErr, Error FinalizeErr) mutable {
+            Error SerializationErr,
+            Expected<ExecutorAddr> InitializeKey) mutable {
           // FIXME: Release abandoned alloc.
           if (SerializationErr) {
-            cantFail(std::move(FinalizeErr));
+            cantFail(InitializeKey.takeError());
             OnFinalize(std::move(SerializationErr));
-          } else if (FinalizeErr)
-            OnFinalize(std::move(FinalizeErr));
+          } else if (!InitializeKey)
+            OnFinalize(InitializeKey.takeError());
           else
             OnFinalize(FinalizedAlloc(AllocAddr));
         },
@@ -76,8 +77,8 @@ public:
   void abandon(OnAbandonedFunction OnAbandoned) override {
     // FIXME: Return memory to pool instead.
     Parent.EPC.callSPSWrapperAsync<
-        rt::SPSSimpleExecutorMemoryManagerDeallocateSignature>(
-        Parent.SAs.Deallocate,
+        rt::SPSSimpleExecutorMemoryManagerReleaseSignature>(
+        Parent.SAs.Release,
         [OnAbandoned = std::move(OnAbandoned)](Error SerializationErr,
                                                Error DeallocateErr) mutable {
           if (SerializationErr) {
@@ -123,9 +124,8 @@ void EPCGenericJITLinkMemoryManager::allocate(const JITLinkDylib *JD,
 
 void EPCGenericJITLinkMemoryManager::deallocate(
     std::vector<FinalizedAlloc> Allocs, OnDeallocatedFunction OnDeallocated) {
-  EPC.callSPSWrapperAsync<
-      rt::SPSSimpleExecutorMemoryManagerDeallocateSignature>(
-      SAs.Deallocate,
+  EPC.callSPSWrapperAsync<rt::SPSSimpleExecutorMemoryManagerReleaseSignature>(
+      SAs.Release,
       [OnDeallocated = std::move(OnDeallocated)](Error SerErr,
                                                  Error DeallocErr) mutable {
         if (SerErr) {
diff --git a/llvm/lib/ExecutionEngine/Orc/EPCGenericRTDyldMemoryManager.cpp b/llvm/lib/ExecutionEngine/Orc/EPCGenericRTDyldMemoryManager.cpp
index fec7062..cc72488 100644
--- a/llvm/lib/ExecutionEngine/Orc/EPCGenericRTDyldMemoryManager.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/EPCGenericRTDyldMemoryManager.cpp
@@ -25,9 +25,9 @@ EPCGenericRTDyldMemoryManager::CreateWithDefaultBootstrapSymbols(
   if (auto Err = EPC.getBootstrapSymbols(
           {{SAs.Instance, rt::SimpleExecutorMemoryManagerInstanceName},
            {SAs.Reserve, rt::SimpleExecutorMemoryManagerReserveWrapperName},
-           {SAs.Finalize, rt::SimpleExecutorMemoryManagerFinalizeWrapperName},
-           {SAs.Deallocate,
-            rt::SimpleExecutorMemoryManagerDeallocateWrapperName},
+           {SAs.Initialize,
+            rt::SimpleExecutorMemoryManagerInitializeWrapperName},
+           {SAs.Release, rt::SimpleExecutorMemoryManagerReleaseWrapperName},
            {SAs.RegisterEHFrame, rt::RegisterEHFrameSectionAllocActionName},
            {SAs.DeregisterEHFrame,
             rt::DeregisterEHFrameSectionAllocActionName}}))
@@ -48,7 +48,7 @@ EPCGenericRTDyldMemoryManager::~EPCGenericRTDyldMemoryManager() {
 
   Error Err = Error::success();
   if (auto Err2 = EPC.callSPSWrapper<
-                  rt::SPSSimpleExecutorMemoryManagerDeallocateSignature>(
+                  rt::SPSSimpleExecutorMemoryManagerReleaseSignature>(
           SAs.Reserve, Err, SAs.Instance, FinalizedAllocs)) {
     // FIXME: Report errors through EPC once that functionality is available.
     logAllUnhandledErrors(std::move(Err2), errs(), "");
@@ -267,10 +267,10 @@ bool EPCGenericRTDyldMemoryManager::finalizeMemory(std::string *ErrMsg) {
 
     // We'll also need to make an extra allocation for the eh-frame wrapper call
     // arguments.
-    Error FinalizeErr = Error::success();
+    Expected<ExecutorAddr> InitializeKey((ExecutorAddr()));
     if (auto Err = EPC.callSPSWrapper<
-                   rt::SPSSimpleExecutorMemoryManagerFinalizeSignature>(
-            SAs.Finalize, FinalizeErr, SAs.Instance, std::move(FR))) {
+                   rt::SPSSimpleExecutorMemoryManagerInitializeSignature>(
+            SAs.Initialize, InitializeKey, SAs.Instance, std::move(FR))) {
       std::lock_guard<std::mutex> Lock(M);
       this->ErrMsg = toString(std::move(Err));
       dbgs() << "Serialization error: " << this->ErrMsg << "\n";
@@ -278,9 +278,9 @@ bool EPCGenericRTDyldMemoryManager::finalizeMemory(std::string *ErrMsg) {
         *ErrMsg = this->ErrMsg;
       return true;
     }
-    if (FinalizeErr) {
+    if (!InitializeKey) {
       std::lock_guard<std::mutex> Lock(M);
-      this->ErrMsg = toString(std::move(FinalizeErr));
+      this->ErrMsg = toString(InitializeKey.takeError());
       dbgs() << "Finalization error: " << this->ErrMsg << "\n";
       if (ErrMsg)
         *ErrMsg = this->ErrMsg;
diff --git a/llvm/lib/ExecutionEngine/Orc/Shared/OrcRTBridge.cpp b/llvm/lib/ExecutionEngine/Orc/Shared/OrcRTBridge.cpp
index 26e8f53..cc99d3c 100644
--- a/llvm/lib/ExecutionEngine/Orc/Shared/OrcRTBridge.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/Shared/OrcRTBridge.cpp
@@ -23,10 +23,12 @@ const char *SimpleExecutorMemoryManagerInstanceName =
     "__llvm_orc_SimpleExecutorMemoryManager_Instance";
 const char *SimpleExecutorMemoryManagerReserveWrapperName =
     "__llvm_orc_SimpleExecutorMemoryManager_reserve_wrapper";
-const char *SimpleExecutorMemoryManagerFinalizeWrapperName =
-    "__llvm_orc_SimpleExecutorMemoryManager_finalize_wrapper";
-const char *SimpleExecutorMemoryManagerDeallocateWrapperName =
-    "__llvm_orc_SimpleExecutorMemoryManager_deallocate_wrapper";
+const char *SimpleExecutorMemoryManagerInitializeWrapperName =
+    "__llvm_orc_SimpleExecutorMemoryManager_initialize_wrapper";
+const char *SimpleExecutorMemoryManagerDeinitializeWrapperName =
+    "__llvm_orc_SimpleExecutorMemoryManager_deinitialize_wrapper";
+const char *SimpleExecutorMemoryManagerReleaseWrapperName =
+    "__llvm_orc_SimpleExecutorMemoryManager_release_wrapper";
 
 const char *ExecutorSharedMemoryMapperServiceInstanceName =
     "__llvm_orc_ExecutorSharedMemoryMapperService_Instance";
diff --git a/llvm/lib/ExecutionEngine/Orc/SimpleRemoteEPC.cpp b/llvm/lib/ExecutionEngine/Orc/SimpleRemoteEPC.cpp
index 87d7578..dec1df7 100644
--- a/llvm/lib/ExecutionEngine/Orc/SimpleRemoteEPC.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/SimpleRemoteEPC.cpp
@@ -216,9 +216,9 @@ SimpleRemoteEPC::createDefaultMemoryManager(SimpleRemoteEPC &SREPC) {
   if (auto Err = SREPC.getBootstrapSymbols(
           {{SAs.Allocator, rt::SimpleExecutorMemoryManagerInstanceName},
            {SAs.Reserve, rt::SimpleExecutorMemoryManagerReserveWrapperName},
-           {SAs.Finalize, rt::SimpleExecutorMemoryManagerFinalizeWrapperName},
-           {SAs.Deallocate,
-            rt::SimpleExecutorMemoryManagerDeallocateWrapperName}}))
+           {SAs.Initialize,
+            rt::SimpleExecutorMemoryManagerInitializeWrapperName},
+           {SAs.Release, rt::SimpleExecutorMemoryManagerReleaseWrapperName}}))
     return std::move(Err);
 
   return std::make_unique<EPCGenericJITLinkMemoryManager>(SREPC, SAs);
diff --git a/llvm/lib/ExecutionEngine/Orc/SimpleRemoteMemoryMapper.cpp b/llvm/lib/ExecutionEngine/Orc/SimpleRemoteMemoryMapper.cpp
new file mode 100644
index 0000000..b82de3f
--- /dev/null
+++ b/llvm/lib/ExecutionEngine/Orc/SimpleRemoteMemoryMapper.cpp
@@ -0,0 +1,104 @@
+//===---- SimpleRemoteMemoryMapper.cpp - Remote memory mapper ----*- C++ -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ExecutionEngine/Orc/SimpleRemoteMemoryMapper.h"
+
+#include "llvm/ExecutionEngine/JITLink/JITLink.h"
+#include "llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h"
+
+namespace llvm::orc {
+
+SimpleRemoteMemoryMapper::SimpleRemoteMemoryMapper(ExecutorProcessControl &EPC,
+                                                   SymbolAddrs SAs)
+    : EPC(EPC), SAs(SAs) {}
+
+void SimpleRemoteMemoryMapper::reserve(size_t NumBytes,
+                                       OnReservedFunction OnReserved) {
+  EPC.callSPSWrapperAsync<rt::SPSSimpleRemoteMemoryMapReserveSignature>(
+      SAs.Reserve,
+      [NumBytes, OnReserved = std::move(OnReserved)](
+          Error SerializationErr, Expected<ExecutorAddr> Result) mutable {
+        if (SerializationErr) {
+          cantFail(Result.takeError());
+          return OnReserved(std::move(SerializationErr));
+        }
+
+        if (Result)
+          OnReserved(ExecutorAddrRange(*Result, NumBytes));
+        else
+          OnReserved(Result.takeError());
+      },
+      SAs.Instance, static_cast<uint64_t>(NumBytes));
+}
+
+char *SimpleRemoteMemoryMapper::prepare(jitlink::LinkGraph &G,
+                                        ExecutorAddr Addr, size_t ContentSize) {
+  return G.allocateBuffer(ContentSize).data();
+}
+
+void SimpleRemoteMemoryMapper::initialize(MemoryMapper::AllocInfo &AI,
+                                          OnInitializedFunction OnInitialized) {
+
+  tpctypes::FinalizeRequest FR;
+
+  std::swap(FR.Actions, AI.Actions);
+  FR.Segments.reserve(AI.Segments.size());
+
+  for (auto Seg : AI.Segments)
+    FR.Segments.push_back({Seg.AG, AI.MappingBase + Seg.Offset,
+                           Seg.ContentSize + Seg.ZeroFillSize,
+                           ArrayRef<char>(Seg.WorkingMem, Seg.ContentSize)});
+
+  EPC.callSPSWrapperAsync<rt::SPSSimpleRemoteMemoryMapInitializeSignature>(
+      SAs.Initialize,
+      [OnInitialized = std::move(OnInitialized)](
+          Error SerializationErr, Expected<ExecutorAddr> Result) mutable {
+        if (SerializationErr) {
+          cantFail(Result.takeError());
+          return OnInitialized(std::move(SerializationErr));
+        }
+
+        OnInitialized(std::move(Result));
+      },
+      SAs.Instance, std::move(FR));
+}
+
+void SimpleRemoteMemoryMapper::deinitialize(
+    ArrayRef<ExecutorAddr> Allocations,
+    MemoryMapper::OnDeinitializedFunction OnDeinitialized) {
+  EPC.callSPSWrapperAsync<rt::SPSSimpleRemoteMemoryMapDeinitializeSignature>(
+      SAs.Deinitialize,
+      [OnDeinitialized = std::move(OnDeinitialized)](Error SerializationErr,
+                                                     Error Result) mutable {
+        if (SerializationErr) {
+          cantFail(std::move(Result));
+          return OnDeinitialized(std::move(SerializationErr));
+        }
+
+        OnDeinitialized(std::move(Result));
+      },
+      SAs.Instance, Allocations);
+}
+
+void SimpleRemoteMemoryMapper::release(ArrayRef<ExecutorAddr> Bases,
+                                       OnReleasedFunction OnReleased) {
+  EPC.callSPSWrapperAsync<rt::SPSSimpleRemoteMemoryMapReleaseSignature>(
+      SAs.Release,
+      [OnReleased = std::move(OnReleased)](Error SerializationErr,
+                                           Error Result) mutable {
+        if (SerializationErr) {
+          cantFail(std::move(Result));
+          return OnReleased(std::move(SerializationErr));
+        }
+
+        return OnReleased(std::move(Result));
+      },
+      SAs.Instance, Bases);
+}
+
+} // namespace llvm::orc
diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.cpp
index 3cdffb8..fe881a1 100644
--- a/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.cpp
@@ -8,6 +8,7 @@
 
 #include "llvm/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.h"
 
+#include "llvm/ADT/ScopeExit.h"
 #include "llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h"
 #include "llvm/Support/FormatVariadic.h"
 
@@ -18,166 +19,167 @@ namespace orc {
 namespace rt_bootstrap {
 
 SimpleExecutorMemoryManager::~SimpleExecutorMemoryManager() {
-  assert(Allocations.empty() && "shutdown not called?");
+  assert(Slabs.empty() && "shutdown not called?");
 }
 
-Expected<ExecutorAddr> SimpleExecutorMemoryManager::allocate(uint64_t Size) {
+Expected<ExecutorAddr> SimpleExecutorMemoryManager::reserve(uint64_t Size) {
   std::error_code EC;
   auto MB = sys::Memory::allocateMappedMemory(
       Size, nullptr, sys::Memory::MF_READ | sys::Memory::MF_WRITE, EC);
   if (EC)
     return errorCodeToError(EC);
   std::lock_guard<std::mutex> Lock(M);
-  assert(!Allocations.count(MB.base()) && "Duplicate allocation addr");
-  Allocations[MB.base()].Size = Size;
+  assert(!Slabs.count(MB.base()) && "Duplicate allocation addr");
+  Slabs[MB.base()].Size = Size;
   return ExecutorAddr::fromPtr(MB.base());
 }
 
-Error SimpleExecutorMemoryManager::finalize(tpctypes::FinalizeRequest &FR) {
-  ExecutorAddr Base(~0ULL);
+Expected<ExecutorAddr>
+SimpleExecutorMemoryManager::initialize(tpctypes::FinalizeRequest &FR) {
   std::vector<shared::WrapperFunctionCall> DeallocationActions;
-  size_t SuccessfulFinalizationActions = 0;
 
   if (FR.Segments.empty()) {
-    // NOTE: Finalizing nothing is currently a no-op. Should it be an error?
     if (FR.Actions.empty())
-      return Error::success();
+      return make_error<StringError>("Finalization request is empty",
+                                     inconvertibleErrorCode());
     else
       return make_error<StringError>("Finalization actions attached to empty "
                                      "finalization request",
                                      inconvertibleErrorCode());
   }
 
-  for (auto &Seg : FR.Segments)
-    Base = std::min(Base, Seg.Addr);
-
-  for (auto &ActPair : FR.Actions)
-    if (ActPair.Dealloc)
-      DeallocationActions.push_back(ActPair.Dealloc);
-
-  // Get the Allocation for this finalization.
-  size_t AllocSize = 0;
-  {
-    std::lock_guard<std::mutex> Lock(M);
-    auto I = Allocations.find(Base.toPtr<void *>());
-    if (I == Allocations.end())
-      return make_error<StringError>("Attempt to finalize unrecognized "
-                                     "allocation " +
-                                         formatv("{0:x}", Base.getValue()),
-                                     inconvertibleErrorCode());
-    AllocSize = I->second.Size;
-    I->second.DeallocationActions = std::move(DeallocationActions);
-  }
-  ExecutorAddr AllocEnd = Base + ExecutorAddrDiff(AllocSize);
-
-  // Bail-out function: this will run deallocation actions corresponding to any
-  // completed finalization actions, then deallocate memory.
-  auto BailOut = [&](Error Err) {
-    std::pair<void *, Allocation> AllocToDestroy;
-
-    // Get allocation to destroy.
-    {
-      std::lock_guard<std::mutex> Lock(M);
-      auto I = Allocations.find(Base.toPtr<void *>());
-
-      // Check for missing allocation (effective a double free).
-      if (I == Allocations.end())
-        return joinErrors(
-            std::move(Err),
-            make_error<StringError>("No allocation entry found "
-                                    "for " +
-                                        formatv("{0:x}", Base.getValue()),
-                                    inconvertibleErrorCode()));
-      AllocToDestroy = std::move(*I);
-      Allocations.erase(I);
-    }
+  ExecutorAddrRange RR(FR.Segments.front().Addr, FR.Segments.front().Addr);
 
-    // Run deallocation actions for all completed finalization actions.
-    while (SuccessfulFinalizationActions)
-      Err =
-          joinErrors(std::move(Err), FR.Actions[--SuccessfulFinalizationActions]
-                                         .Dealloc.runWithSPSRetErrorMerged());
-
-    // Deallocate memory.
-    sys::MemoryBlock MB(AllocToDestroy.first, AllocToDestroy.second.Size);
-    if (auto EC = sys::Memory::releaseMappedMemory(MB))
-      Err = joinErrors(std::move(Err), errorCodeToError(EC));
-
-    return Err;
-  };
+  std::vector<sys::MemoryBlock> MBsToReset;
+  auto ResetMBs = make_scope_exit([&]() {
+    for (auto &MB : MBsToReset)
+      sys::Memory::protectMappedMemory(MB, sys::Memory::MF_READ |
+                                               sys::Memory::MF_WRITE);
+    sys::Memory::InvalidateInstructionCache(RR.Start.toPtr<void *>(),
+                                            RR.size());
+  });
 
   // Copy content and apply permissions.
   for (auto &Seg : FR.Segments) {
+    RR.Start = std::min(RR.Start, Seg.Addr);
+    RR.End = std::max(RR.End, Seg.Addr + Seg.Size);
 
     // Check segment ranges.
     if (LLVM_UNLIKELY(Seg.Size < Seg.Content.size()))
-      return BailOut(make_error<StringError>(
+      return make_error<StringError>(
           formatv("Segment {0:x} content size ({1:x} bytes) "
                   "exceeds segment size ({2:x} bytes)",
                   Seg.Addr.getValue(), Seg.Content.size(), Seg.Size),
-          inconvertibleErrorCode()));
+          inconvertibleErrorCode());
     ExecutorAddr SegEnd = Seg.Addr + ExecutorAddrDiff(Seg.Size);
-    if (LLVM_UNLIKELY(Seg.Addr < Base || SegEnd > AllocEnd))
-      return BailOut(make_error<StringError>(
+    if (LLVM_UNLIKELY(Seg.Addr < RR.Start || SegEnd > RR.End))
+      return make_error<StringError>(
           formatv("Segment {0:x} -- {1:x} crosses boundary of "
                   "allocation {2:x} -- {3:x}",
-                  Seg.Addr.getValue(), SegEnd.getValue(), Base.getValue(),
-                  AllocEnd.getValue()),
-          inconvertibleErrorCode()));
+                  Seg.Addr, SegEnd, RR.Start, RR.End),
+          inconvertibleErrorCode());
 
     char *Mem = Seg.Addr.toPtr<char *>();
     if (!Seg.Content.empty())
       memcpy(Mem, Seg.Content.data(), Seg.Content.size());
     memset(Mem + Seg.Content.size(), 0, Seg.Size - Seg.Content.size());
     assert(Seg.Size <= std::numeric_limits<size_t>::max());
+
+    sys::MemoryBlock MB(Mem, Seg.Size);
     if (auto EC = sys::Memory::protectMappedMemory(
-            {Mem, static_cast<size_t>(Seg.Size)},
-            toSysMemoryProtectionFlags(Seg.RAG.Prot)))
-      return BailOut(errorCodeToError(EC));
+            MB, toSysMemoryProtectionFlags(Seg.RAG.Prot)))
+      return errorCodeToError(EC);
+
+    MBsToReset.push_back(MB);
+
     if ((Seg.RAG.Prot & MemProt::Exec) == MemProt::Exec)
       sys::Memory::InvalidateInstructionCache(Mem, Seg.Size);
   }
 
-  // Run finalization actions.
-  for (auto &ActPair : FR.Actions) {
-    if (auto Err = ActPair.Finalize.runWithSPSRetErrorMerged())
-      return BailOut(std::move(Err));
-    ++SuccessfulFinalizationActions;
+  auto DeallocActions = runFinalizeActions(FR.Actions);
+  if (!DeallocActions)
+    return DeallocActions.takeError();
+
+  {
+    std::lock_guard<std::mutex> Lock(M);
+    auto Region = createRegionInfo(RR, "In initialize");
+    if (!Region)
+      return Region.takeError();
+    Region->DeallocActions = std::move(*DeallocActions);
   }
 
-  return Error::success();
+  // Successful initialization.
+  ResetMBs.release();
+
+  return RR.Start;
 }
 
-Error SimpleExecutorMemoryManager::deallocate(
-    const std::vector<ExecutorAddr> &Bases) {
-  std::vector<std::pair<void *, Allocation>> AllocPairs;
-  AllocPairs.reserve(Bases.size());
+Error SimpleExecutorMemoryManager::deinitialize(
+    const std::vector<ExecutorAddr> &InitKeys) {
+  Error Err = Error::success();
 
-  // Get allocation to destroy.
+  for (auto &KeyAddr : llvm::reverse(InitKeys)) {
+    std::vector<shared::WrapperFunctionCall> DeallocActions;
+    {
+      std::scoped_lock<std::mutex> Lock(M);
+      auto Slab = getSlabInfo(KeyAddr, "In deinitialize");
+      if (!Slab) {
+        Err = joinErrors(std::move(Err), Slab.takeError());
+        continue;
+      }
+
+      auto RI = getRegionInfo(*Slab, KeyAddr, "In deinitialize");
+      if (!RI) {
+        Err = joinErrors(std::move(Err), RI.takeError());
+        continue;
+      }
+
+      DeallocActions = std::move(RI->DeallocActions);
+    }
+
+    Err = joinErrors(std::move(Err),
+                     runDeallocActions(std::move(DeallocActions)));
+  }
+
+  return Err;
+}
+
+Error SimpleExecutorMemoryManager::release(
+    const std::vector<ExecutorAddr> &Bases) {
   Error Err = Error::success();
-  {
-    std::lock_guard<std::mutex> Lock(M);
-    for (auto &Base : Bases) {
-      auto I = Allocations.find(Base.toPtr<void *>());
-
-      // Check for missing allocation (effective a double free).
-      if (I != Allocations.end()) {
-        AllocPairs.push_back(std::move(*I));
-        Allocations.erase(I);
-      } else
+
+  // TODO: Prohibit new initializations within the slabs being removed?
+  for (auto &Base : llvm::reverse(Bases)) {
+    std::vector<shared::WrapperFunctionCall> DeallocActions;
+    sys::MemoryBlock MB;
+
+    {
+      std::scoped_lock<std::mutex> Lock(M);
+
+      auto SlabI = Slabs.find(Base.toPtr<void *>());
+      if (SlabI == Slabs.end()) {
         Err = joinErrors(
             std::move(Err),
-            make_error<StringError>("No allocation entry found "
-                                    "for " +
-                                        formatv("{0:x}", Base.getValue()),
+            make_error<StringError>("In release, " + formatv("{0:x}", Base) +
+                                        " is not part of any reserved "
+                                        "address range",
                                     inconvertibleErrorCode()));
+        continue;
+      }
+
+      auto &Slab = SlabI->second;
+
+      for (auto &[Addr, Region] : Slab.Regions)
+        llvm::copy(Region.DeallocActions, back_inserter(DeallocActions));
+
+      MB = {Base.toPtr<void *>(), Slab.Size};
+
+      Slabs.erase(SlabI);
     }
-  }
 
-  while (!AllocPairs.empty()) {
-    auto &P = AllocPairs.back();
-    Err = joinErrors(std::move(Err), deallocateImpl(P.first, P.second));
-    AllocPairs.pop_back();
+    Err = joinErrors(std::move(Err), runDeallocActions(DeallocActions));
+    if (auto EC = sys::Memory::releaseMappedMemory(MB))
+      Err = joinErrors(std::move(Err), errorCodeToError(EC));
   }
 
   return Err;
@@ -185,16 +187,15 @@ Error SimpleExecutorMemoryManager::deallocate(
 
 Error SimpleExecutorMemoryManager::shutdown() {
 
-  AllocationsMap AM;
+  // TODO: Prevent new allocations during shutdown.
+  std::vector<ExecutorAddr> Bases;
   {
-    std::lock_guard<std::mutex> Lock(M);
-    AM = std::move(Allocations);
+    std::scoped_lock<std::mutex> Lock(M);
+    for (auto &[Base, Slab] : Slabs)
+      Bases.push_back(ExecutorAddr::fromPtr(Base));
   }
 
-  Error Err = Error::success();
-  for (auto &KV : AM)
-    Err = joinErrors(std::move(Err), deallocateImpl(KV.first, KV.second));
-  return Err;
+  return release(Bases);
 }
 
 void SimpleExecutorMemoryManager::addBootstrapSymbols(
@@ -202,58 +203,150 @@ void SimpleExecutorMemoryManager::addBootstrapSymbols(
   M[rt::SimpleExecutorMemoryManagerInstanceName] = ExecutorAddr::fromPtr(this);
   M[rt::SimpleExecutorMemoryManagerReserveWrapperName] =
       ExecutorAddr::fromPtr(&reserveWrapper);
-  M[rt::SimpleExecutorMemoryManagerFinalizeWrapperName] =
-      ExecutorAddr::fromPtr(&finalizeWrapper);
-  M[rt::SimpleExecutorMemoryManagerDeallocateWrapperName] =
-      ExecutorAddr::fromPtr(&deallocateWrapper);
+  M[rt::SimpleExecutorMemoryManagerInitializeWrapperName] =
+      ExecutorAddr::fromPtr(&initializeWrapper);
+  M[rt::SimpleExecutorMemoryManagerDeinitializeWrapperName] =
+      ExecutorAddr::fromPtr(&deinitializeWrapper);
+  M[rt::SimpleExecutorMemoryManagerReleaseWrapperName] =
+      ExecutorAddr::fromPtr(&releaseWrapper);
 }
 
-Error SimpleExecutorMemoryManager::deallocateImpl(void *Base, Allocation &A) {
-  Error Err = Error::success();
+Expected<SimpleExecutorMemoryManager::SlabInfo &>
+SimpleExecutorMemoryManager::getSlabInfo(ExecutorAddr A, StringRef Context) {
+  auto MakeBadSlabError = [&]() {
+    return make_error<StringError>(
+        Context + ", address " + formatv("{0:x}", A) +
+            " is not part of any reserved address range",
+        inconvertibleErrorCode());
+  };
 
-  while (!A.DeallocationActions.empty()) {
-    Err = joinErrors(std::move(Err),
-                     A.DeallocationActions.back().runWithSPSRetErrorMerged());
-    A.DeallocationActions.pop_back();
+  auto I = Slabs.upper_bound(A.toPtr<void *>());
+  if (I == Slabs.begin())
+    return MakeBadSlabError();
+  --I;
+  if (!ExecutorAddrRange(ExecutorAddr::fromPtr(I->first), I->second.Size)
+           .contains(A))
+    return MakeBadSlabError();
+
+  return I->second;
+}
+
+Expected<SimpleExecutorMemoryManager::SlabInfo &>
+SimpleExecutorMemoryManager::getSlabInfo(ExecutorAddrRange R,
+                                         StringRef Context) {
+  auto MakeBadSlabError = [&]() {
+    return make_error<StringError>(
+        Context + ", range " + formatv("{0:x}", R) +
+            " is not part of any reserved address range",
+        inconvertibleErrorCode());
+  };
+
+  auto I = Slabs.upper_bound(R.Start.toPtr<void *>());
+  if (I == Slabs.begin())
+    return MakeBadSlabError();
+  --I;
+  if (!ExecutorAddrRange(ExecutorAddr::fromPtr(I->first), I->second.Size)
+           .contains(R))
+    return MakeBadSlabError();
+
+  return I->second;
+}
+
+Expected<SimpleExecutorMemoryManager::RegionInfo &>
+SimpleExecutorMemoryManager::createRegionInfo(ExecutorAddrRange R,
+                                              StringRef Context) {
+
+  auto Slab = getSlabInfo(R, Context);
+  if (!Slab)
+    return Slab.takeError();
+
+  auto MakeBadRegionError = [&](ExecutorAddrRange Other, bool Prev) {
+    return make_error<StringError>(Context + ", region " + formatv("{0:x}", R) +
+                                       " overlaps " +
+                                       (Prev ? "previous" : "following") +
+                                       " region " + formatv("{0:x}", Other),
+                                   inconvertibleErrorCode());
+  };
+
+  auto I = Slab->Regions.upper_bound(R.Start);
+  if (I != Slab->Regions.begin()) {
+    auto J = std::prev(I);
+    ExecutorAddrRange PrevRange(J->first, J->second.Size);
+    if (PrevRange.overlaps(R))
+      return MakeBadRegionError(PrevRange, true);
+  }
+  if (I != Slab->Regions.end()) {
+    ExecutorAddrRange NextRange(I->first, I->second.Size);
+    if (NextRange.overlaps(R))
+      return MakeBadRegionError(NextRange, false);
   }
 
-  sys::MemoryBlock MB(Base, A.Size);
-  if (auto EC = sys::Memory::releaseMappedMemory(MB))
-    Err = joinErrors(std::move(Err), errorCodeToError(EC));
+  auto &RInfo = Slab->Regions[R.Start];
+  RInfo.Size = R.size();
+  return RInfo;
+}
 
-  return Err;
+Expected<SimpleExecutorMemoryManager::RegionInfo &>
+SimpleExecutorMemoryManager::getRegionInfo(SlabInfo &Slab, ExecutorAddr A,
+                                           StringRef Context) {
+  auto I = Slab.Regions.find(A);
+  if (I == Slab.Regions.end())
+    return make_error<StringError>(
+        Context + ", address " + formatv("{0:x}", A) +
+            " does not correspond to the start of any initialized region",
+        inconvertibleErrorCode());
+
+  return I->second;
+}
+
+Expected<SimpleExecutorMemoryManager::RegionInfo &>
+SimpleExecutorMemoryManager::getRegionInfo(ExecutorAddr A, StringRef Context) {
+  auto Slab = getSlabInfo(A, Context);
+  if (!Slab)
+    return Slab.takeError();
+
+  return getRegionInfo(*Slab, A, Context);
 }
 
 llvm::orc::shared::CWrapperFunctionResult
 SimpleExecutorMemoryManager::reserveWrapper(const char *ArgData,
                                             size_t ArgSize) {
-  return shared::WrapperFunction<
-             rt::SPSSimpleExecutorMemoryManagerReserveSignature>::
+  return shared::WrapperFunction<rt::SPSSimpleRemoteMemoryMapReserveSignature>::
       handle(ArgData, ArgSize,
              shared::makeMethodWrapperHandler(
-                 &SimpleExecutorMemoryManager::allocate))
+                 &SimpleExecutorMemoryManager::reserve))
+          .release();
+}
+
+llvm::orc::shared::CWrapperFunctionResult
+SimpleExecutorMemoryManager::initializeWrapper(const char *ArgData,
+                                               size_t ArgSize) {
+  return shared::
+      WrapperFunction<rt::SPSSimpleRemoteMemoryMapInitializeSignature>::handle(
+             ArgData, ArgSize,
+             shared::makeMethodWrapperHandler(
+                 &SimpleExecutorMemoryManager::initialize))
           .release();
 }
 
 llvm::orc::shared::CWrapperFunctionResult
-SimpleExecutorMemoryManager::finalizeWrapper(const char *ArgData,
-                                             size_t ArgSize) {
+SimpleExecutorMemoryManager::deinitializeWrapper(const char *ArgData,
+                                                 size_t ArgSize) {
   return shared::WrapperFunction<
-             rt::SPSSimpleExecutorMemoryManagerFinalizeSignature>::
+             rt::SPSSimpleRemoteMemoryMapDeinitializeSignature>::
       handle(ArgData, ArgSize,
              shared::makeMethodWrapperHandler(
-                 &SimpleExecutorMemoryManager::finalize))
+                 &SimpleExecutorMemoryManager::deinitialize))
           .release();
 }
 
 llvm::orc::shared::CWrapperFunctionResult
-SimpleExecutorMemoryManager::deallocateWrapper(const char *ArgData,
-                                               size_t ArgSize) {
-  return shared::WrapperFunction<
-             rt::SPSSimpleExecutorMemoryManagerDeallocateSignature>::
+SimpleExecutorMemoryManager::releaseWrapper(const char *ArgData,
+                                            size_t ArgSize) {
+  return shared::WrapperFunction<rt::SPSSimpleRemoteMemoryMapReleaseSignature>::
       handle(ArgData, ArgSize,
              shared::makeMethodWrapperHandler(
-                 &SimpleExecutorMemoryManager::deallocate))
+                 &SimpleExecutorMemoryManager::release))
           .release();
 }
 
diff --git a/llvm/lib/IR/Type.cpp b/llvm/lib/IR/Type.cpp
index 9db48e8..0e9535d 100644
--- a/llvm/lib/IR/Type.cpp
+++ b/llvm/lib/IR/Type.cpp
@@ -1034,6 +1034,10 @@ static TargetTypeInfo getTargetTypeInfo(const TargetExtType *Ty) {
   }
 
   // DirectX resources
+  if (Name == "dx.Padding")
+    return TargetTypeInfo(
+        ArrayType::get(Type::getInt8Ty(C), Ty->getIntParameter(0)),
+        TargetExtType::CanBeGlobal);
   if (Name.starts_with("dx."))
     return TargetTypeInfo(PointerType::get(C, 0), TargetExtType::CanBeGlobal,
                           TargetExtType::CanBeLocal,
diff --git a/llvm/lib/ObjectYAML/ELFYAML.cpp b/llvm/lib/ObjectYAML/ELFYAML.cpp
index 421d6603..c3a27c9 100644
--- a/llvm/lib/ObjectYAML/ELFYAML.cpp
+++ b/llvm/lib/ObjectYAML/ELFYAML.cpp
@@ -488,6 +488,7 @@ void ScalarBitSetTraits<ELFYAML::ELF_EF>::bitset(IO &IO,
     BCaseMask(EF_HEXAGON_MACH_V5, EF_HEXAGON_MACH);
     BCaseMask(EF_HEXAGON_MACH_V55, EF_HEXAGON_MACH);
     BCaseMask(EF_HEXAGON_MACH_V60, EF_HEXAGON_MACH);
+    BCaseMask(EF_HEXAGON_MACH_V61, EF_HEXAGON_MACH);
     BCaseMask(EF_HEXAGON_MACH_V62, EF_HEXAGON_MACH);
     BCaseMask(EF_HEXAGON_MACH_V65, EF_HEXAGON_MACH);
     BCaseMask(EF_HEXAGON_MACH_V66, EF_HEXAGON_MACH);
@@ -499,12 +500,21 @@ void ScalarBitSetTraits<ELFYAML::ELF_EF>::bitset(IO &IO,
     BCaseMask(EF_HEXAGON_MACH_V71T, EF_HEXAGON_MACH);
     BCaseMask(EF_HEXAGON_MACH_V73, EF_HEXAGON_MACH);
     BCaseMask(EF_HEXAGON_MACH_V75, EF_HEXAGON_MACH);
+    BCaseMask(EF_HEXAGON_MACH_V77, EF_HEXAGON_MACH);
+    BCaseMask(EF_HEXAGON_MACH_V79, EF_HEXAGON_MACH);
+    BCaseMask(EF_HEXAGON_MACH_V81, EF_HEXAGON_MACH);
+    BCaseMask(EF_HEXAGON_MACH_V83, EF_HEXAGON_MACH);
+    BCaseMask(EF_HEXAGON_MACH_V85, EF_HEXAGON_MACH);
+    BCaseMask(EF_HEXAGON_MACH_V87, EF_HEXAGON_MACH);
+    BCaseMask(EF_HEXAGON_MACH_V89, EF_HEXAGON_MACH);
+    BCaseMask(EF_HEXAGON_MACH_V91, EF_HEXAGON_MACH);
     BCaseMask(EF_HEXAGON_ISA_V2, EF_HEXAGON_ISA);
     BCaseMask(EF_HEXAGON_ISA_V3, EF_HEXAGON_ISA);
     BCaseMask(EF_HEXAGON_ISA_V4, EF_HEXAGON_ISA);
     BCaseMask(EF_HEXAGON_ISA_V5, EF_HEXAGON_ISA);
     BCaseMask(EF_HEXAGON_ISA_V55, EF_HEXAGON_ISA);
     BCaseMask(EF_HEXAGON_ISA_V60, EF_HEXAGON_ISA);
+    BCaseMask(EF_HEXAGON_ISA_V61, EF_HEXAGON_ISA);
     BCaseMask(EF_HEXAGON_ISA_V62, EF_HEXAGON_ISA);
     BCaseMask(EF_HEXAGON_ISA_V65, EF_HEXAGON_ISA);
     BCaseMask(EF_HEXAGON_ISA_V66, EF_HEXAGON_ISA);
@@ -514,6 +524,14 @@ void ScalarBitSetTraits<ELFYAML::ELF_EF>::bitset(IO &IO,
     BCaseMask(EF_HEXAGON_ISA_V71, EF_HEXAGON_ISA);
     BCaseMask(EF_HEXAGON_ISA_V73, EF_HEXAGON_ISA);
     BCaseMask(EF_HEXAGON_ISA_V75, EF_HEXAGON_ISA);
+    BCaseMask(EF_HEXAGON_ISA_V77, EF_HEXAGON_ISA);
+    BCaseMask(EF_HEXAGON_ISA_V79, EF_HEXAGON_ISA);
+    BCaseMask(EF_HEXAGON_ISA_V81, EF_HEXAGON_ISA);
+    BCaseMask(EF_HEXAGON_ISA_V83, EF_HEXAGON_ISA);
+    BCaseMask(EF_HEXAGON_ISA_V85, EF_HEXAGON_ISA);
+    BCaseMask(EF_HEXAGON_ISA_V87, EF_HEXAGON_ISA);
+    BCaseMask(EF_HEXAGON_ISA_V89, EF_HEXAGON_ISA);
+    BCaseMask(EF_HEXAGON_ISA_V91, EF_HEXAGON_ISA);
     break;
   case ELF::EM_AVR:
     BCaseMask(EF_AVR_ARCH_AVR1, EF_AVR_ARCH_MASK);
diff --git a/llvm/lib/Support/Windows/Signals.inc b/llvm/lib/Support/Windows/Signals.inc
index dad0fa3..648d6a5 100644
--- a/llvm/lib/Support/Windows/Signals.inc
+++ b/llvm/lib/Support/Windows/Signals.inc
@@ -354,8 +354,8 @@ namespace llvm {
 /// Emulates hitting "retry" from an "abort, retry, ignore" CRT debug report
 /// dialog. "retry" raises an exception which ultimately triggers our stack
 /// dumper.
-static LLVM_ATTRIBUTE_UNUSED int
-AvoidMessageBoxHook(int ReportType, char *Message, int *Return) {
+[[maybe_unused]] static int AvoidMessageBoxHook(int ReportType, char *Message,
+                                                int *Return) {
   // Set *Return to the retry code for the return value of _CrtDbgReport:
   // http://msdn.microsoft.com/en-us/library/8hyw4sy7(v=vs.71).aspx
   // This may also trigger just-in-time debugging via DebugBreak().
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index be2f2e4..662d84b 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1561,6 +1561,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
       setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
       setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
       setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_MUL, VT, Custom);
       setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
       setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
       setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
@@ -1717,6 +1718,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
       setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
       setOperationAction(ISD::VECREDUCE_FMAXIMUM, VT, Custom);
       setOperationAction(ISD::VECREDUCE_FMINIMUM, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_FMUL, VT, Custom);
       setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);
       setOperationAction(ISD::VECTOR_DEINTERLEAVE, VT, Custom);
       setOperationAction(ISD::VECTOR_INTERLEAVE, VT, Custom);
@@ -7775,6 +7777,9 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
   case ISD::VECREDUCE_FMAXIMUM:
   case ISD::VECREDUCE_FMINIMUM:
     return LowerVECREDUCE(Op, DAG);
+  case ISD::VECREDUCE_MUL:
+  case ISD::VECREDUCE_FMUL:
+    return LowerVECREDUCE_MUL(Op, DAG);
   case ISD::ATOMIC_LOAD_AND:
     return LowerATOMIC_LOAD_AND(Op, DAG);
   case ISD::DYNAMIC_STACKALLOC:
@@ -16794,6 +16799,33 @@ SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
   }
 }
 
+SDValue AArch64TargetLowering::LowerVECREDUCE_MUL(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  SDValue Src = Op.getOperand(0);
+  EVT SrcVT = Src.getValueType();
+  assert(SrcVT.isScalableVector() && "Unexpected operand type!");
+
+  SDVTList SrcVTs = DAG.getVTList(SrcVT, SrcVT);
+  unsigned BaseOpc = ISD::getVecReduceBaseOpcode(Op.getOpcode());
+  SDValue Identity = DAG.getNeutralElement(BaseOpc, DL, SrcVT, Op->getFlags());
+
+  // Whilst we don't know the size of the vector we do know the maximum size so
+  // can perform a tree reduction with an identity vector, which means once we
+  // arrive at the result the remaining stages (when the vector is smaller than
+  // the maximum) have no affect.
+
+  unsigned Segments = AArch64::SVEMaxBitsPerVector / AArch64::SVEBitsPerBlock;
+  unsigned Stages = llvm::Log2_32(Segments * SrcVT.getVectorMinNumElements());
+
+  for (unsigned I = 0; I < Stages; ++I) {
+    Src = DAG.getNode(ISD::VECTOR_DEINTERLEAVE, DL, SrcVTs, Src, Identity);
+    Src = DAG.getNode(BaseOpc, DL, SrcVT, Src.getValue(0), Src.getValue(1));
+  }
+
+  return DAG.getExtractVectorElt(DL, Op.getValueType(), Src, 0);
+}
+
 SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
                                                     SelectionDAG &DAG) const {
   auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
@@ -18144,8 +18176,8 @@ bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store,
 bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
     Instruction *Load, Value *Mask, IntrinsicInst *DI) const {
   const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID());
-  if (Factor != 2 && Factor != 4) {
-    LLVM_DEBUG(dbgs() << "Matching ld2 and ld4 patterns failed\n");
+  if (Factor != 2 && Factor != 3 && Factor != 4) {
+    LLVM_DEBUG(dbgs() << "Matching ld2, ld3 and ld4 patterns failed\n");
     return false;
   }
   auto *LI = dyn_cast<LoadInst>(Load);
@@ -18223,8 +18255,8 @@ bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
     Instruction *Store, Value *Mask,
     ArrayRef<Value *> InterleavedValues) const {
   unsigned Factor = InterleavedValues.size();
-  if (Factor != 2 && Factor != 4) {
-    LLVM_DEBUG(dbgs() << "Matching st2 and st4 patterns failed\n");
+  if (Factor != 2 && Factor != 3 && Factor != 4) {
+    LLVM_DEBUG(dbgs() << "Matching st2, st3 and st4 patterns failed\n");
     return false;
   }
   StoreInst *SI = dyn_cast<StoreInst>(Store);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 00956fd..9495c9f 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -752,6 +752,7 @@ private:
   SDValue LowerVSCALE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerVECREDUCE_MUL(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerATOMIC_LOAD_AND(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerInlineDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 30dfcf2b..12c600f 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -10600,6 +10600,9 @@ describeORRLoadedValue(const MachineInstr &MI, Register DescribedReg,
   Register DestReg = DestSrc->Destination->getReg();
   Register SrcReg = DestSrc->Source->getReg();
 
+  if (!DestReg.isValid() || !SrcReg.isValid())
+    return std::nullopt;
+
   auto Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {});
 
   // If the described register is the destination, just return the source.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
index bfe2c80..a67b12a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
@@ -901,6 +901,8 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
 
   addRulesForGOpcs({G_ABS}, Standard).Uni(S16, {{Sgpr32Trunc}, {Sgpr32SExt}});
 
+  addRulesForGOpcs({G_READSTEADYCOUNTER}, Standard).Uni(S64, {{Sgpr64}, {}});
+
   bool hasSALUFloat = ST->hasSALUFloatInsts();
 
   addRulesForGOpcs({G_FADD}, Standard)
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index be62395..e0375ea 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -513,8 +513,7 @@ void AMDGPUDisassembler::decodeImmOperands(MCInst &MI,
     }
 
     if (Imm == AMDGPU::EncValues::LITERAL_CONST) {
-      Op = decodeLiteralConstant(
-          Desc, OpDesc, OpDesc.OperandType == AMDGPU::OPERAND_REG_IMM_FP64);
+      Op = decodeLiteralConstant(Desc, OpDesc);
       continue;
     }
 
@@ -1545,21 +1544,21 @@ AMDGPUDisassembler::decodeMandatoryLiteralConstant(unsigned Val) const {
 MCOperand
 AMDGPUDisassembler::decodeMandatoryLiteral64Constant(uint64_t Val) const {
   if (HasLiteral) {
-    if (Literal64 != Val)
+    if (Literal != Val)
       return errOperand(Val, "More than one unique literal is illegal");
   }
   HasLiteral = true;
-  Literal = Literal64 = Val;
+  Literal = Val;
 
-  bool UseLit64 = Hi_32(Literal64) == 0;
+  bool UseLit64 = Hi_32(Literal) == 0;
   return UseLit64 ? MCOperand::createExpr(AMDGPUMCExpr::createLit(
-                        LitModifier::Lit64, Literal64, getContext()))
-                  : MCOperand::createImm(Literal64);
+                        LitModifier::Lit64, Literal, getContext()))
+                  : MCOperand::createImm(Literal);
 }
 
-MCOperand AMDGPUDisassembler::decodeLiteralConstant(const MCInstrDesc &Desc,
-                                                    const MCOperandInfo &OpDesc,
-                                                    bool ExtendFP64) const {
+MCOperand
+AMDGPUDisassembler::decodeLiteralConstant(const MCInstrDesc &Desc,
+                                          const MCOperandInfo &OpDesc) const {
   // For now all literal constants are supposed to be unsigned integer
   // ToDo: deal with signed/unsigned 64-bit integer constants
   // ToDo: deal with float/double constants
@@ -1569,35 +1568,79 @@ MCOperand AMDGPUDisassembler::decodeLiteralConstant(const MCInstrDesc &Desc,
                         Twine(Bytes.size()));
     }
     HasLiteral = true;
-    Literal = Literal64 = eatBytes<uint32_t>(Bytes);
-    if (ExtendFP64)
-      Literal64 <<= 32;
+    Literal = eatBytes<uint32_t>(Bytes);
   }
 
-  int64_t Val = ExtendFP64 ? Literal64 : Literal;
+  // For disassembling always assume all inline constants are available.
+  bool HasInv2Pi = true;
 
-  bool CanUse64BitLiterals =
-      STI.hasFeature(AMDGPU::Feature64BitLiterals) &&
-      !(Desc.TSFlags & (SIInstrFlags::VOP3 | SIInstrFlags::VOP3P));
-
-  bool UseLit64 = false;
-  if (CanUse64BitLiterals) {
-    if (OpDesc.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 ||
-        OpDesc.OperandType == AMDGPU::OPERAND_REG_INLINE_C_INT64)
-      UseLit64 = false;
-    else if (OpDesc.OperandType == AMDGPU::OPERAND_REG_IMM_FP64 ||
-             OpDesc.OperandType == AMDGPU::OPERAND_REG_INLINE_C_FP64 ||
-             OpDesc.OperandType == AMDGPU::OPERAND_REG_INLINE_AC_FP64)
-      UseLit64 = Hi_32(Literal64) == 0;
+  // Invalid instruction codes may contain literals for inline-only
+  // operands, so we support them here as well.
+  int64_t Val = Literal;
+  bool UseLit = false;
+  switch (OpDesc.OperandType) {
+  default:
+    llvm_unreachable("Unexpected operand type!");
+  case AMDGPU::OPERAND_REG_IMM_BF16:
+  case AMDGPU::OPERAND_REG_INLINE_C_BF16:
+  case AMDGPU::OPERAND_REG_INLINE_C_V2BF16:
+    UseLit = AMDGPU::isInlinableLiteralBF16(Val, HasInv2Pi);
+    break;
+  case AMDGPU::OPERAND_REG_IMM_V2BF16:
+    UseLit = AMDGPU::isInlinableLiteralV2BF16(Val);
+    break;
+  case AMDGPU::OPERAND_REG_IMM_FP16:
+  case AMDGPU::OPERAND_REG_INLINE_C_FP16:
+  case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
+    UseLit = AMDGPU::isInlinableLiteralFP16(Val, HasInv2Pi);
+    break;
+  case AMDGPU::OPERAND_REG_IMM_V2FP16:
+    UseLit = AMDGPU::isInlinableLiteralV2F16(Val);
+    break;
+  case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16:
+    break;
+  case AMDGPU::OPERAND_REG_IMM_INT16:
+  case AMDGPU::OPERAND_REG_INLINE_C_INT16:
+  case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
+    UseLit = AMDGPU::isInlinableLiteralI16(Val, HasInv2Pi);
+    break;
+  case AMDGPU::OPERAND_REG_IMM_V2INT16:
+    UseLit = AMDGPU::isInlinableLiteralV2I16(Val);
+    break;
+  case AMDGPU::OPERAND_REG_IMM_FP32:
+  case AMDGPU::OPERAND_REG_INLINE_C_FP32:
+  case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
+  case AMDGPU::OPERAND_REG_IMM_INT32:
+  case AMDGPU::OPERAND_REG_INLINE_C_INT32:
+  case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
+  case AMDGPU::OPERAND_REG_IMM_V2FP32:
+  case AMDGPU::OPERAND_REG_IMM_V2INT32:
+  case AMDGPU::OPERAND_KIMM32:
+    UseLit = AMDGPU::isInlinableLiteral32(Val, HasInv2Pi);
+    break;
+  case AMDGPU::OPERAND_REG_IMM_FP64:
+  case AMDGPU::OPERAND_REG_INLINE_C_FP64:
+  case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
+    Val <<= 32;
+    break;
+  case AMDGPU::OPERAND_REG_IMM_INT64:
+  case AMDGPU::OPERAND_REG_INLINE_C_INT64:
+    UseLit = AMDGPU::isInlinableLiteral64(Val, HasInv2Pi);
+    break;
+  case MCOI::OPERAND_REGISTER:
+    // TODO: Disassembling V_DUAL_FMAMK_F32_X_FMAMK_F32_gfx11 hits
+    // decoding a literal in a position of a register operand. Give
+    // it special handling in the caller, decodeImmOperands(), instead
+    // of quietly allowing it here.
+    break;
   }
 
-  return UseLit64 ? MCOperand::createExpr(AMDGPUMCExpr::createLit(
-                        LitModifier::Lit64, Val, getContext()))
-                  : MCOperand::createImm(Val);
+  return UseLit ? MCOperand::createExpr(AMDGPUMCExpr::createLit(
+                      LitModifier::Lit, Val, getContext()))
+                : MCOperand::createImm(Val);
 }
 
-MCOperand
-AMDGPUDisassembler::decodeLiteral64Constant(const MCInst &Inst) const {
+MCOperand AMDGPUDisassembler::decodeLiteral64Constant() const {
   assert(STI.hasFeature(AMDGPU::Feature64BitLiterals));
 
   if (!HasLiteral) {
@@ -1606,25 +1649,13 @@ AMDGPUDisassembler::decodeLiteral64Constant(const MCInst &Inst) const {
                                Twine(Bytes.size()));
     }
     HasLiteral = true;
-    Literal64 = eatBytes<uint64_t>(Bytes);
-  }
-
-  bool UseLit64 = false;
-  const MCInstrDesc &Desc = MCII->get(Inst.getOpcode());
-  const MCOperandInfo &OpDesc = Desc.operands()[Inst.getNumOperands()];
-  if (OpDesc.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 ||
-      OpDesc.OperandType == AMDGPU::OPERAND_REG_INLINE_C_INT64) {
-    UseLit64 = false;
-  } else {
-    assert(OpDesc.OperandType == AMDGPU::OPERAND_REG_IMM_FP64 ||
-           OpDesc.OperandType == AMDGPU::OPERAND_REG_INLINE_C_FP64 ||
-           OpDesc.OperandType == AMDGPU::OPERAND_REG_INLINE_AC_FP64);
-    UseLit64 = Hi_32(Literal64) == 0;
+    Literal = eatBytes<uint64_t>(Bytes);
   }
 
+  bool UseLit64 = Hi_32(Literal) == 0;
   return UseLit64 ? MCOperand::createExpr(AMDGPUMCExpr::createLit(
-                        LitModifier::Lit64, Literal64, getContext()))
-                  : MCOperand::createImm(Literal64);
+                        LitModifier::Lit64, Literal, getContext()))
+                  : MCOperand::createImm(Literal);
 }
 
 MCOperand AMDGPUDisassembler::decodeIntImmed(unsigned Imm) {
@@ -1913,7 +1944,7 @@ MCOperand AMDGPUDisassembler::decodeNonVGPRSrcOp(const MCInst &Inst,
     return MCOperand::createImm(Val);
 
   if (Val == LITERAL64_CONST && STI.hasFeature(AMDGPU::Feature64BitLiterals)) {
-    return decodeLiteral64Constant(Inst);
+    return decodeLiteral64Constant();
   }
 
   switch (Width) {
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index 2751857..d103d79 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -44,8 +44,7 @@ private:
   const unsigned HwModeRegClass;
   const unsigned TargetMaxInstBytes;
   mutable ArrayRef<uint8_t> Bytes;
-  mutable uint32_t Literal;
-  mutable uint64_t Literal64;
+  mutable uint64_t Literal;
   mutable bool HasLiteral;
   mutable std::optional<bool> EnableWavefrontSize32;
   unsigned CodeObjectVersion;
@@ -144,9 +143,8 @@ public:
   MCOperand decodeMandatoryLiteralConstant(unsigned Imm) const;
   MCOperand decodeMandatoryLiteral64Constant(uint64_t Imm) const;
   MCOperand decodeLiteralConstant(const MCInstrDesc &Desc,
-                                  const MCOperandInfo &OpDesc,
-                                  bool ExtendFP64) const;
-  MCOperand decodeLiteral64Constant(const MCInst &Inst) const;
+                                  const MCOperandInfo &OpDesc) const;
+  MCOperand decodeLiteral64Constant() const;
 
   MCOperand decodeSrcOp(const MCInst &Inst, unsigned Width, unsigned Val) const;
 
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 58482ea..9fbf9e5 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -69,6 +69,12 @@ static cl::opt<bool> GCNTrackers(
     cl::desc("Use the AMDGPU specific RPTrackers during scheduling"),
     cl::init(false));
 
+static cl::opt<unsigned> PendingQueueLimit(
+    "amdgpu-scheduler-pending-queue-limit", cl::Hidden,
+    cl::desc(
+        "Max (Available+Pending) size to inspect pending queue (0 disables)"),
+    cl::init(256));
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 #define DUMP_MAX_REG_PRESSURE
 static cl::opt<bool> PrintMaxRPRegUsageBeforeScheduler(
@@ -335,17 +341,52 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
   }
 }
 
+static bool shouldCheckPending(SchedBoundary &Zone,
+                               const TargetSchedModel *SchedModel) {
+  bool HasBufferedModel =
+      SchedModel->hasInstrSchedModel() && SchedModel->getMicroOpBufferSize();
+  unsigned Combined = Zone.Available.size() + Zone.Pending.size();
+  return Combined <= PendingQueueLimit && HasBufferedModel;
+}
+
+static SUnit *pickOnlyChoice(SchedBoundary &Zone,
+                             const TargetSchedModel *SchedModel) {
+  // pickOnlyChoice() releases pending instructions and checks for new hazards.
+  SUnit *OnlyChoice = Zone.pickOnlyChoice();
+  if (!shouldCheckPending(Zone, SchedModel) || Zone.Pending.empty())
+    return OnlyChoice;
+
+  return nullptr;
+}
+
+void GCNSchedStrategy::printCandidateDecision(const SchedCandidate &Current,
+                                              const SchedCandidate &Preferred) {
+  LLVM_DEBUG({
+    dbgs() << "Prefer:\t\t";
+    DAG->dumpNode(*Preferred.SU);
+
+    if (Current.SU) {
+      dbgs() << "Not:\t";
+      DAG->dumpNode(*Current.SU);
+    }
+
+    dbgs() << "Reason:\t\t";
+    traceCandidate(Preferred);
+  });
+}
+
 // This function is mostly cut and pasted from
 // GenericScheduler::pickNodeFromQueue()
 void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
                                          const CandPolicy &ZonePolicy,
                                          const RegPressureTracker &RPTracker,
-                                         SchedCandidate &Cand,
+                                         SchedCandidate &Cand, bool &IsPending,
                                          bool IsBottomUp) {
   const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI);
   ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos();
   unsigned SGPRPressure = 0;
   unsigned VGPRPressure = 0;
+  IsPending = false;
   if (DAG->isTrackingPressure()) {
     if (!GCNTrackers) {
       SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
@@ -358,8 +399,9 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
       VGPRPressure = T->getPressure().getArchVGPRNum();
     }
   }
-  ReadyQueue &Q = Zone.Available;
-  for (SUnit *SU : Q) {
+  LLVM_DEBUG(dbgs() << "Available Q:\n");
+  ReadyQueue &AQ = Zone.Available;
+  for (SUnit *SU : AQ) {
 
     SchedCandidate TryCand(ZonePolicy);
     initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure,
@@ -371,27 +413,55 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
       // Initialize resource delta if needed in case future heuristics query it.
       if (TryCand.ResDelta == SchedResourceDelta())
         TryCand.initResourceDelta(Zone.DAG, SchedModel);
+      LLVM_DEBUG(printCandidateDecision(Cand, TryCand));
       Cand.setBest(TryCand);
-      LLVM_DEBUG(traceCandidate(Cand));
+    } else {
+      printCandidateDecision(TryCand, Cand);
+    }
+  }
+
+  if (!shouldCheckPending(Zone, SchedModel))
+    return;
+
+  LLVM_DEBUG(dbgs() << "Pending Q:\n");
+  ReadyQueue &PQ = Zone.Pending;
+  for (SUnit *SU : PQ) {
+
+    SchedCandidate TryCand(ZonePolicy);
+    initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure,
+                  VGPRPressure, IsBottomUp);
+    // Pass SchedBoundary only when comparing nodes from the same boundary.
+    SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
+    tryPendingCandidate(Cand, TryCand, ZoneArg);
+    if (TryCand.Reason != NoCand) {
+      // Initialize resource delta if needed in case future heuristics query it.
+      if (TryCand.ResDelta == SchedResourceDelta())
+        TryCand.initResourceDelta(Zone.DAG, SchedModel);
+      LLVM_DEBUG(printCandidateDecision(Cand, TryCand));
+      IsPending = true;
+      Cand.setBest(TryCand);
+    } else {
+      printCandidateDecision(TryCand, Cand);
     }
   }
 }
 
 // This function is mostly cut and pasted from
 // GenericScheduler::pickNodeBidirectional()
-SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
+SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode,
+                                               bool &PickedPending) {
   // Schedule as far as possible in the direction of no choice. This is most
   // efficient, but also provides the best heuristics for CriticalPSets.
-  if (SUnit *SU = Bot.pickOnlyChoice()) {
+  if (SUnit *SU = pickOnlyChoice(Bot, SchedModel)) {
     IsTopNode = false;
     return SU;
   }
-  if (SUnit *SU = Top.pickOnlyChoice()) {
+  if (SUnit *SU = pickOnlyChoice(Top, SchedModel)) {
     IsTopNode = true;
     return SU;
   }
-  // Set the bottom-up policy based on the state of the current bottom zone and
-  // the instructions outside the zone, including the top zone.
+  // Set the bottom-up policy based on the state of the current bottom zone
+  // and the instructions outside the zone, including the top zone.
   CandPolicy BotPolicy;
   setPolicy(BotPolicy, /*IsPostRA=*/false, Bot, &Top);
   // Set the top-down policy based on the state of the current top zone and
@@ -399,12 +469,14 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
   CandPolicy TopPolicy;
   setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot);
 
+  bool BotPending = false;
   // See if BotCand is still valid (because we previously scheduled from Top).
   LLVM_DEBUG(dbgs() << "Picking from Bot:\n");
   if (!BotCand.isValid() || BotCand.SU->isScheduled ||
       BotCand.Policy != BotPolicy) {
     BotCand.reset(CandPolicy());
     pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand,
+                      BotPending,
                       /*IsBottomUp=*/true);
     assert(BotCand.Reason != NoCand && "failed to find the first candidate");
   } else {
@@ -414,6 +486,7 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
       SchedCandidate TCand;
       TCand.reset(CandPolicy());
       pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand,
+                        BotPending,
                         /*IsBottomUp=*/true);
       assert(TCand.SU == BotCand.SU &&
              "Last pick result should correspond to re-picking right now");
@@ -421,12 +494,14 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
 #endif
   }
 
+  bool TopPending = false;
   // Check if the top Q has a better candidate.
   LLVM_DEBUG(dbgs() << "Picking from Top:\n");
   if (!TopCand.isValid() || TopCand.SU->isScheduled ||
       TopCand.Policy != TopPolicy) {
     TopCand.reset(CandPolicy());
     pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand,
+                      TopPending,
                       /*IsBottomUp=*/false);
     assert(TopCand.Reason != NoCand && "failed to find the first candidate");
   } else {
@@ -436,6 +511,7 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
       SchedCandidate TCand;
       TCand.reset(CandPolicy());
       pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand,
+                        TopPending,
                         /*IsBottomUp=*/false);
       assert(TCand.SU == TopCand.SU &&
              "Last pick result should correspond to re-picking right now");
@@ -446,12 +522,21 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
   // Pick best from BotCand and TopCand.
   LLVM_DEBUG(dbgs() << "Top Cand: "; traceCandidate(TopCand);
              dbgs() << "Bot Cand: "; traceCandidate(BotCand););
-  SchedCandidate Cand = BotCand;
-  TopCand.Reason = NoCand;
-  tryCandidate(Cand, TopCand, nullptr);
-  if (TopCand.Reason != NoCand) {
-    Cand.setBest(TopCand);
+  SchedCandidate Cand = BotPending ? TopCand : BotCand;
+  SchedCandidate TryCand = BotPending ? BotCand : TopCand;
+  PickedPending = BotPending && TopPending;
+
+  TryCand.Reason = NoCand;
+  if (BotPending || TopPending) {
+    PickedPending |= tryPendingCandidate(Cand, TopCand, nullptr);
+  } else {
+    tryCandidate(Cand, TryCand, nullptr);
+  }
+
+  if (TryCand.Reason != NoCand) {
+    Cand.setBest(TryCand);
   }
+
   LLVM_DEBUG(dbgs() << "Picking: "; traceCandidate(Cand););
 
   IsTopNode = Cand.AtTop;
@@ -466,35 +551,55 @@ SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) {
            Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
     return nullptr;
   }
+  bool PickedPending;
   SUnit *SU;
   do {
+    PickedPending = false;
     if (RegionPolicy.OnlyTopDown) {
-      SU = Top.pickOnlyChoice();
+      SU = pickOnlyChoice(Top, SchedModel);
       if (!SU) {
         CandPolicy NoPolicy;
         TopCand.reset(NoPolicy);
         pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand,
+                          PickedPending,
                           /*IsBottomUp=*/false);
         assert(TopCand.Reason != NoCand && "failed to find a candidate");
         SU = TopCand.SU;
       }
       IsTopNode = true;
     } else if (RegionPolicy.OnlyBottomUp) {
-      SU = Bot.pickOnlyChoice();
+      SU = pickOnlyChoice(Bot, SchedModel);
       if (!SU) {
         CandPolicy NoPolicy;
         BotCand.reset(NoPolicy);
         pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand,
+                          PickedPending,
                           /*IsBottomUp=*/true);
         assert(BotCand.Reason != NoCand && "failed to find a candidate");
         SU = BotCand.SU;
       }
       IsTopNode = false;
     } else {
-      SU = pickNodeBidirectional(IsTopNode);
+      SU = pickNodeBidirectional(IsTopNode, PickedPending);
     }
   } while (SU->isScheduled);
 
+  if (PickedPending) {
+    unsigned ReadyCycle = IsTopNode ? SU->TopReadyCycle : SU->BotReadyCycle;
+    SchedBoundary &Zone = IsTopNode ? Top : Bot;
+    unsigned CurrentCycle = Zone.getCurrCycle();
+    if (ReadyCycle > CurrentCycle)
+      Zone.bumpCycle(ReadyCycle);
+
+    // FIXME: checkHazard() doesn't give information about which cycle the
+    // hazard will resolve so just keep bumping the cycle by 1. This could be
+    // made more efficient if checkHazard() returned more details.
+    while (Zone.checkHazard(SU))
+      Zone.bumpCycle(Zone.getCurrCycle() + 1);
+
+    Zone.releasePending();
+  }
+
   if (SU->isTopReady())
     Top.removeReady(SU);
   if (SU->isBottomReady())
@@ -540,6 +645,47 @@ GCNSchedStageID GCNSchedStrategy::getNextStage() const {
   return *std::next(CurrentStage);
 }
 
+bool GCNSchedStrategy::tryPendingCandidate(SchedCandidate &Cand,
+                                           SchedCandidate &TryCand,
+                                           SchedBoundary *Zone) const {
+  // Initialize the candidate if needed.
+  if (!Cand.isValid()) {
+    TryCand.Reason = NodeOrder;
+    return true;
+  }
+
+  // Bias PhysReg Defs and copies to their uses and defined respectively.
+  if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
+                 biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
+    return TryCand.Reason != NoCand;
+
+  // Avoid exceeding the target's limit.
+  if (DAG->isTrackingPressure() &&
+      tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
+                  RegExcess, TRI, DAG->MF))
+    return TryCand.Reason != NoCand;
+
+  // Avoid increasing the max critical pressure in the scheduled region.
+  if (DAG->isTrackingPressure() &&
+      tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax,
+                  TryCand, Cand, RegCritical, TRI, DAG->MF))
+    return TryCand.Reason != NoCand;
+
+  bool SameBoundary = Zone != nullptr;
+  if (SameBoundary) {
+    TryCand.initResourceDelta(DAG, SchedModel);
+    if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
+                TryCand, Cand, ResourceReduce))
+      return TryCand.Reason != NoCand;
+    if (tryGreater(TryCand.ResDelta.DemandedResources,
+                   Cand.ResDelta.DemandedResources, TryCand, Cand,
+                   ResourceDemand))
+      return TryCand.Reason != NoCand;
+  }
+
+  return false;
+}
+
 GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
     const MachineSchedContext *C, bool IsLegacyScheduler)
     : GCNSchedStrategy(C) {
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index 8ea4267..975781f 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -44,17 +44,32 @@ raw_ostream &operator<<(raw_ostream &OS, const GCNSchedStageID &StageID);
 /// heuristics to determine excess/critical pressure sets.
 class GCNSchedStrategy : public GenericScheduler {
 protected:
-  SUnit *pickNodeBidirectional(bool &IsTopNode);
+  SUnit *pickNodeBidirectional(bool &IsTopNode, bool &PickedPending);
 
   void pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy,
                          const RegPressureTracker &RPTracker,
-                         SchedCandidate &Cand, bool IsBottomUp);
+                         SchedCandidate &Cand, bool &IsPending,
+                         bool IsBottomUp);
 
   void initCandidate(SchedCandidate &Cand, SUnit *SU, bool AtTop,
                      const RegPressureTracker &RPTracker,
                      const SIRegisterInfo *SRI, unsigned SGPRPressure,
                      unsigned VGPRPressure, bool IsBottomUp);
 
+  /// Evaluates instructions in the pending queue using a subset of scheduling
+  /// heuristics.
+  ///
+  /// Instructions that cannot be issued due to hardware constraints are placed
+  /// in the pending queue rather than the available queue, making them normally
+  /// invisible to scheduling heuristics. However, in certain scenarios (such as
+  /// avoiding register spilling), it may be beneficial to consider scheduling
+  /// these not-yet-ready instructions.
+  bool tryPendingCandidate(SchedCandidate &Cand, SchedCandidate &TryCand,
+                           SchedBoundary *Zone) const;
+
+  void printCandidateDecision(const SchedCandidate &Current,
+                              const SchedCandidate &Preferred);
+
   std::vector<unsigned> Pressure;
 
   std::vector<unsigned> MaxPressure;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index a01a5fd..5e3195b 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -1697,9 +1697,6 @@ LLVM_READNONE
 bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi);
 
 LLVM_READNONE
-bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi);
-
-LLVM_READNONE
 bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi);
 
 LLVM_READNONE
diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td
index 228114c..44c4830 100644
--- a/llvm/lib/Target/DirectX/DXIL.td
+++ b/llvm/lib/Target/DirectX/DXIL.td
@@ -57,6 +57,7 @@ def ResBindTy : DXILOpParamType;
 def ResPropsTy : DXILOpParamType;
 def SplitDoubleTy : DXILOpParamType;
 def BinaryWithCarryTy : DXILOpParamType;
+def DimensionsTy : DXILOpParamType;
 
 class DXILOpClass;
 
@@ -901,6 +902,13 @@ def CheckAccessFullyMapped : DXILOp<71, checkAccessFullyMapped> {
   let attributes = [Attributes<DXIL1_0, [ReadOnly]>];
 }
 
+def GetDimensions : DXILOp<72, getDimensions> {
+  let Doc = "gets the dimensions of a buffer or texture";
+  let arguments = [HandleTy, Int32Ty];
+  let result = DimensionsTy;
+  let stages = [Stages<DXIL1_0, [all_stages]>];
+}
+
 def Barrier : DXILOp<80, barrier> {
   let Doc = "inserts a memory barrier in the shader";
   let intrinsics = [
diff --git a/llvm/lib/Target/DirectX/DXILOpBuilder.cpp b/llvm/lib/Target/DirectX/DXILOpBuilder.cpp
index 1aed8f9..944b2e6 100644
--- a/llvm/lib/Target/DirectX/DXILOpBuilder.cpp
+++ b/llvm/lib/Target/DirectX/DXILOpBuilder.cpp
@@ -261,6 +261,12 @@ static StructType *getBinaryWithCarryType(LLVMContext &Context) {
   return StructType::create({Int32Ty, Int1Ty}, "dx.types.i32c");
 }
 
+static StructType *getDimensionsType(LLVMContext &Ctx) {
+  Type *Int32Ty = Type::getInt32Ty(Ctx);
+  return getOrCreateStructType("dx.types.Dimensions",
+                               {Int32Ty, Int32Ty, Int32Ty, Int32Ty}, Ctx);
+}
+
 static Type *getTypeFromOpParamType(OpParamType Kind, LLVMContext &Ctx,
                                     Type *OverloadTy) {
   switch (Kind) {
@@ -318,6 +324,8 @@ static Type *getTypeFromOpParamType(OpParamType Kind, LLVMContext &Ctx,
     return getSplitDoubleType(Ctx);
   case OpParamType::BinaryWithCarryTy:
     return getBinaryWithCarryType(Ctx);
+  case OpParamType::DimensionsTy:
+    return getDimensionsType(Ctx);
   }
   llvm_unreachable("Invalid parameter kind");
   return nullptr;
diff --git a/llvm/lib/Target/DirectX/DXILOpLowering.cpp b/llvm/lib/Target/DirectX/DXILOpLowering.cpp
index 610d8b6..e46a393 100644
--- a/llvm/lib/Target/DirectX/DXILOpLowering.cpp
+++ b/llvm/lib/Target/DirectX/DXILOpLowering.cpp
@@ -627,6 +627,28 @@ public:
     });
   }
 
+  [[nodiscard]] bool lowerGetDimensionsX(Function &F) {
+    IRBuilder<> &IRB = OpBuilder.getIRB();
+    Type *Int32Ty = IRB.getInt32Ty();
+
+    return replaceFunction(F, [&](CallInst *CI) -> Error {
+      IRB.SetInsertPoint(CI);
+      Value *Handle =
+          createTmpHandleCast(CI->getArgOperand(0), OpBuilder.getHandleType());
+      Value *Undef = UndefValue::get(Int32Ty);
+
+      Expected<CallInst *> OpCall = OpBuilder.tryCreateOp(
+          OpCode::GetDimensions, {Handle, Undef}, CI->getName(), Int32Ty);
+      if (Error E = OpCall.takeError())
+        return E;
+      Value *Dim = IRB.CreateExtractValue(*OpCall, 0);
+
+      CI->replaceAllUsesWith(Dim);
+      CI->eraseFromParent();
+      return Error::success();
+    });
+  }
+
   [[nodiscard]] bool lowerGetPointer(Function &F) {
     // These should have already been handled in DXILResourceAccess, so we can
     // just clean up the dead prototype.
@@ -934,6 +956,9 @@ public:
       case Intrinsic::dx_resource_updatecounter:
         HasErrors |= lowerUpdateCounter(F);
         break;
+      case Intrinsic::dx_resource_getdimensions_x:
+        HasErrors |= lowerGetDimensionsX(F);
+        break;
       case Intrinsic::ctpop:
         HasErrors |= lowerCtpopToCountBits(F);
         break;
diff --git a/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp b/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
index 5f180d6..3bd6ed4 100644
--- a/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
+++ b/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
@@ -66,6 +66,10 @@ public:
 
   void remapInstruction(MCInst &Instr) const;
 
+  Expected<bool> onSymbolStart(SymbolInfoTy &Symbol, uint64_t &Size,
+                               ArrayRef<uint8_t> Bytes,
+                               uint64_t Address) const override;
+
 private:
   bool makeBundle(ArrayRef<uint8_t> Bytes, uint64_t Address,
                   uint64_t &BytesToSkip, raw_ostream &CS) const;
@@ -567,6 +571,18 @@ DecodeStatus HexagonDisassembler::getSingleInstruction(MCInst &MI, MCInst &MCB,
   return Result;
 }
 
+Expected<bool> HexagonDisassembler::onSymbolStart(SymbolInfoTy &Symbol,
+                                                  uint64_t &Size,
+                                                  ArrayRef<uint8_t> Bytes,
+                                                  uint64_t Address) const {
+  // At the start of a symbol, force a fresh packet by resetting any
+  // in-progress bundle state. This prevents packets from straddling label
+  // boundaries when data (e.g. jump tables) appears in between.
+  Size = 0;
+  resetBundle();
+  return true;
+}
+
 static DecodeStatus DecodeRegisterClass(MCInst &Inst, unsigned RegNo,
                                         ArrayRef<MCPhysReg> Table) {
   if (RegNo < Table.size()) {
diff --git a/llvm/lib/Target/Mips/MipsFastISel.cpp b/llvm/lib/Target/Mips/MipsFastISel.cpp
index 1ce8d7e3..df0c8c1 100644
--- a/llvm/lib/Target/Mips/MipsFastISel.cpp
+++ b/llvm/lib/Target/Mips/MipsFastISel.cpp
@@ -264,9 +264,10 @@ public:
 
 } // end anonymous namespace
 
-static bool CC_Mips(unsigned ValNo, MVT ValVT, MVT LocVT,
-                    CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
-                    Type *OrigTy, CCState &State) LLVM_ATTRIBUTE_UNUSED;
+[[maybe_unused]] static bool CC_Mips(unsigned ValNo, MVT ValVT, MVT LocVT,
+                                     CCValAssign::LocInfo LocInfo,
+                                     ISD::ArgFlagsTy ArgFlags, Type *OrigTy,
+                                     CCState &State);
 
 static bool CC_MipsO32_FP32(unsigned ValNo, MVT ValVT, MVT LocVT,
                             CCValAssign::LocInfo LocInfo,
diff --git a/llvm/lib/Target/Mips/MipsISelLowering.cpp b/llvm/lib/Target/Mips/MipsISelLowering.cpp
index 7f1ff45..2fd7327 100644
--- a/llvm/lib/Target/Mips/MipsISelLowering.cpp
+++ b/llvm/lib/Target/Mips/MipsISelLowering.cpp
@@ -3176,9 +3176,10 @@ static bool CC_MipsO32_FP64(unsigned ValNo, MVT ValVT, MVT LocVT,
                     F64Regs);
 }
 
-static bool CC_MipsO32(unsigned ValNo, MVT ValVT, MVT LocVT,
-                       CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
-                       Type *OrigTy, CCState &State) LLVM_ATTRIBUTE_UNUSED;
+[[maybe_unused]] static bool CC_MipsO32(unsigned ValNo, MVT ValVT, MVT LocVT,
+                                        CCValAssign::LocInfo LocInfo,
+                                        ISD::ArgFlagsTy ArgFlags, Type *OrigTy,
+                                        CCState &State);
 
 #include "MipsGenCallingConv.inc"
 
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
index 662d3f6..b1794b7 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
@@ -717,6 +717,18 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST)
       .clampScalar(0, sXLen, sXLen)
       .lower();
 
+  LegalityPredicate InsertVectorEltPred = [=](const LegalityQuery &Query) {
+    LLT VecTy = Query.Types[0];
+    LLT EltTy = Query.Types[1];
+    return VecTy.getElementType() == EltTy;
+  };
+
+  getActionDefinitionsBuilder(G_INSERT_VECTOR_ELT)
+      .legalIf(all(typeIsLegalIntOrFPVec(0, IntOrFPVecTys, ST),
+                   InsertVectorEltPred, typeIs(2, sXLen)))
+      .legalIf(all(typeIsLegalBoolVec(0, BoolVecTys, ST), InsertVectorEltPred,
+                   typeIs(2, sXLen)));
+
   getLegacyLegalizerInfo().computeTables();
   verify(*ST.getInstrInfo());
 }
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 7123a2d..eb87558 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -1672,6 +1672,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
   if (Subtarget.useRVVForFixedLengthVectors())
     setTargetDAGCombine(ISD::BITCAST);
 
+  setMaxDivRemBitWidthSupported(Subtarget.is64Bit() ? 128 : 64);
+
   // Disable strict node mutation.
   IsStrictFPEnabled = true;
   EnableExtLdPromotion = true;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index 0f6e1ca..ed54404d 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -1763,6 +1763,26 @@ defm RELAXED_DOT :
             "i16x8.relaxed_dot_i8x16_i7x16_s\t$dst, $lhs, $rhs",
             "i16x8.relaxed_dot_i8x16_i7x16_s", 0x112>;
 
+def : Pat<
+  (v8i16 (add
+    (wasm_shuffle
+      (v8i16 (extmul_low_s v16i8:$lhs, v16i8:$rhs)),
+      (v8i16 (extmul_high_s v16i8:$lhs, v16i8:$rhs)),
+      (i32 0), (i32 1), (i32 4), (i32 5),
+      (i32 8), (i32 9), (i32 12), (i32 13),
+      (i32 16), (i32 17), (i32 20), (i32 21),
+      (i32 24), (i32 25), (i32 28), (i32 29)),
+    (wasm_shuffle
+      (v8i16 (extmul_low_s v16i8:$lhs, v16i8:$rhs)),
+      (v8i16 (extmul_high_s v16i8:$lhs, v16i8:$rhs)),
+      (i32 2), (i32 3), (i32 6), (i32 7),
+      (i32 10), (i32 11), (i32 14), (i32 15),
+      (i32 18), (i32 19), (i32 22), (i32 23),
+      (i32 26), (i32 27), (i32 30), (i32 31)))
+  ),
+  (v8i16 (RELAXED_DOT v16i8:$lhs, v16i8:$rhs))
+>;
+
 defm RELAXED_DOT_ADD :
   RELAXED_I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs, V128:$acc),
             (outs), (ins),
@@ -1771,6 +1791,18 @@ defm RELAXED_DOT_ADD :
             "i32x4.relaxed_dot_i8x16_i7x16_add_s\t$dst, $lhs, $rhs, $acc",
             "i32x4.relaxed_dot_i8x16_i7x16_add_s", 0x113>;
 
+def : Pat<
+  (v4i32 (add
+    (v4i32 (int_wasm_extadd_pairwise_signed
+      (v8i16 (int_wasm_relaxed_dot_i8x16_i7x16_signed v16i8:$lhs, v16i8:$rhs)))),
+    (v4i32 V128:$acc))),
+  (v4i32 (RELAXED_DOT_ADD v16i8:$lhs, v16i8:$rhs, (v4i32 V128:$acc)))
+    >;
+
+def : Pat<(v4i32 (partial_reduce_smla (v4i32 V128:$acc), (v16i8 V128:$lhs),
+                                                         (v16i8 V128:$rhs))),
+          (RELAXED_DOT_ADD $lhs, $rhs, $acc)>, Requires<[HasRelaxedSIMD]>;
+
 //===----------------------------------------------------------------------===//
 // Relaxed BFloat16 dot product
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp
index b81641f..28fa2cd 100644
--- a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp
+++ b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp
@@ -414,8 +414,6 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI,
 
   getActionDefinitionsBuilder(G_SEXT_INREG).lower();
 
-  getActionDefinitionsBuilder(G_IS_FPCLASS).lower();
-
   // fp constants
   getActionDefinitionsBuilder(G_FCONSTANT)
       .legalFor({s32, s64})
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 6db780f..8e08d16 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -1338,6 +1338,10 @@ def ProcessorFeatures {
   list<SubtargetFeature> PTLFeatures =
     !listremove(ARLSFeatures, [FeatureWIDEKL]);
 
+  // Novalake
+  list<SubtargetFeature> NVLFeatures =
+      !listconcat(PTLFeatures, [FeaturePREFETCHI]);
+
   // Clearwaterforest
   list<SubtargetFeature> CWFAdditionalFeatures = [FeaturePREFETCHI,
                                                   FeatureAVXVNNIINT16,
@@ -1883,6 +1887,9 @@ foreach P = ["pantherlake", "wildcatlake"] in {
 def : ProcModel<P, AlderlakePModel,
                 ProcessorFeatures.PTLFeatures, ProcessorFeatures.ADLTuning>;
 }
+def : ProcModel<"novalake", AlderlakePModel, ProcessorFeatures.NVLFeatures,
+                ProcessorFeatures.ADLTuning>;
+
 def : ProcModel<"clearwaterforest", AlderlakePModel,
                 ProcessorFeatures.CWFFeatures, ProcessorFeatures.ADLTuning>;
 def : ProcModel<"emeraldrapids", SapphireRapidsModel,
diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp
index 481a9be..1fca466f 100644
--- a/llvm/lib/Target/X86/X86MCInstLower.cpp
+++ b/llvm/lib/Target/X86/X86MCInstLower.cpp
@@ -1946,16 +1946,14 @@ static void addConstantComments(const MachineInstr *MI,
     CASE_ARITH_RM(PMADDUBSW) {
       unsigned SrcIdx = getSrcIdx(MI, 1);
       if (auto *C = X86::getConstantFromPool(*MI, SrcIdx + 1)) {
-        if (C->getType()->getScalarSizeInBits() == 8) {
-          std::string Comment;
-          raw_string_ostream CS(Comment);
-          unsigned VectorWidth =
-              X86::getVectorRegisterWidth(MI->getDesc().operands()[0]);
-          CS << "[";
-          printConstant(C, VectorWidth, CS);
-          CS << "]";
-          OutStreamer.AddComment(CS.str());
-        }
+        std::string Comment;
+        raw_string_ostream CS(Comment);
+        unsigned VectorWidth =
+            X86::getVectorRegisterWidth(MI->getDesc().operands()[0]);
+        CS << "[";
+        printConstant(C, VectorWidth, CS);
+        CS << "]";
+        OutStreamer.AddComment(CS.str());
       }
       break;
     }
@@ -1967,16 +1965,14 @@ static void addConstantComments(const MachineInstr *MI,
     CASE_ARITH_RM(PMULHRSW) {
       unsigned SrcIdx = getSrcIdx(MI, 1);
       if (auto *C = X86::getConstantFromPool(*MI, SrcIdx + 1)) {
-        if (C->getType()->getScalarSizeInBits() == 16) {
-          std::string Comment;
-          raw_string_ostream CS(Comment);
-          unsigned VectorWidth =
-              X86::getVectorRegisterWidth(MI->getDesc().operands()[0]);
-          CS << "[";
-          printConstant(C, VectorWidth, CS);
-          CS << "]";
-          OutStreamer.AddComment(CS.str());
-        }
+        std::string Comment;
+        raw_string_ostream CS(Comment);
+        unsigned VectorWidth =
+            X86::getVectorRegisterWidth(MI->getDesc().operands()[0]);
+        CS << "[";
+        printConstant(C, VectorWidth, CS);
+        CS << "]";
+        OutStreamer.AddComment(CS.str());
       }
       break;
     }
diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp
index 3479106..6065575 100644
--- a/llvm/lib/TargetParser/Host.cpp
+++ b/llvm/lib/TargetParser/Host.cpp
@@ -1152,6 +1152,20 @@ static StringRef getIntelProcessorTypeAndSubtype(unsigned Family,
       break;
     }
     break;
+  case 0x12:
+    switch (Model) {
+    // Novalake:
+    case 0x1:
+    case 0x3:
+      CPU = "novalake";
+      *Type = X86::INTEL_COREI7;
+      *Subtype = X86::INTEL_COREI7_NOVALAKE;
+      break;
+    default: // Unknown family 0x12 CPU.
+      break;
+    }
+    break;
+
   default:
     break; // Unknown.
   }
diff --git a/llvm/lib/TargetParser/Unix/Host.inc b/llvm/lib/TargetParser/Unix/Host.inc
index aeb2f59..38b942d 100644
--- a/llvm/lib/TargetParser/Unix/Host.inc
+++ b/llvm/lib/TargetParser/Unix/Host.inc
@@ -59,10 +59,30 @@ static std::string updateTripleOSVersion(std::string TargetTripleString) {
     if (TT.getOS() == Triple::AIX && !TT.getOSMajorVersion()) {
       struct utsname name;
       if (uname(&name) != -1) {
+        std::string release = name.release;
+
+        if (strcmp(name.sysname, "OS400") == 0) {
+          /*
+            PASE uses different versioning system than AIX.
+            The following table shows the currently supported PASE
+            releases and the corresponding AIX release:
+            --------------------------
+              PASE    |    AIX
+            --------------------------
+              V7R4    |    7.2 (TL2)
+            --------------------------
+              V7R5    |    7.2 (TL5)
+            --------------------------
+              V7R6    |    7.3 (TL1)
+            --------------------------
+          */
+          release = (release == "4" || release == "5") ? "2" : "3";
+        }
+
         std::string NewOSName = std::string(Triple::getOSTypeName(Triple::AIX));
         NewOSName += name.version;
         NewOSName += '.';
-        NewOSName += name.release;
+        NewOSName += release;
         NewOSName += ".0.0";
         TT.setOSName(NewOSName);
         return TT.str();
diff --git a/llvm/lib/TargetParser/X86TargetParser.cpp b/llvm/lib/TargetParser/X86TargetParser.cpp
index e382cfe..dd13ce3 100644
--- a/llvm/lib/TargetParser/X86TargetParser.cpp
+++ b/llvm/lib/TargetParser/X86TargetParser.cpp
@@ -176,6 +176,8 @@ constexpr FeatureBitset FeaturesArrowlakeS =
     FeatureSM4;
 constexpr FeatureBitset FeaturesPantherlake =
     (FeaturesArrowlakeS ^ FeatureWIDEKL);
+constexpr FeatureBitset FeaturesNovalake =
+    FeaturesPantherlake | FeaturePREFETCHI;
 constexpr FeatureBitset FeaturesClearwaterforest =
     (FeaturesSierraforest ^ FeatureWIDEKL) | FeatureAVXVNNIINT16 |
     FeatureSHA512 | FeatureSM3 | FeatureSM4 | FeaturePREFETCHI | FeatureUSERMSR;
@@ -379,6 +381,8 @@ constexpr ProcInfo Processors[] = {
   // Pantherlake microarchitecture based processors.
   { {"pantherlake"}, CK_Lunarlake, FEATURE_AVX2, FeaturesPantherlake, 'p', false },
   { {"wildcatlake"}, CK_Lunarlake, FEATURE_AVX2, FeaturesPantherlake, 'p', false },
+  // Novalake microarchitecture based processors.
+  { {"novalake"}, CK_Novalake, FEATURE_AVX2, FeaturesNovalake, 'r', false },
   // Sierraforest microarchitecture based processors.
   { {"sierraforest"}, CK_Sierraforest, FEATURE_AVX2, FeaturesSierraforest, 'p', false },
   // Grandridge microarchitecture based processors.
diff --git a/llvm/lib/Transforms/Coroutines/CoroAnnotationElide.cpp b/llvm/lib/Transforms/Coroutines/CoroAnnotationElide.cpp
index f166fef..cf7e450 100644
--- a/llvm/lib/Transforms/Coroutines/CoroAnnotationElide.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroAnnotationElide.cpp
@@ -153,26 +153,23 @@ PreservedAnalyses CoroAnnotationElidePass::run(LazyCallGraph::SCC &C,
       bool IsCallerPresplitCoroutine = Caller->isPresplitCoroutine();
       bool HasAttr = CB->hasFnAttr(llvm::Attribute::CoroElideSafe);
       if (IsCallerPresplitCoroutine && HasAttr) {
-        BranchProbability MinBranchProbability(
-            static_cast<int>(CoroElideBranchRatio * MinBlockCounterExecution),
-            MinBlockCounterExecution);
-
         auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(*Caller);
 
-        auto Prob = BranchProbability::getBranchProbability(
-            BFI.getBlockFreq(CB->getParent()).getFrequency(),
-            BFI.getEntryFreq().getFrequency());
+        auto BlockFreq = BFI.getBlockFreq(CB->getParent()).getFrequency();
+        auto EntryFreq = BFI.getEntryFreq().getFrequency();
+        uint64_t MinFreq =
+            static_cast<uint64_t>(EntryFreq * CoroElideBranchRatio);
 
-        if (Prob < MinBranchProbability) {
+        if (BlockFreq < MinFreq) {
           ORE.emit([&]() {
             return OptimizationRemarkMissed(
                        DEBUG_TYPE, "CoroAnnotationElideUnlikely", Caller)
                    << "'" << ore::NV("callee", Callee->getName())
                    << "' not elided in '"
                    << ore::NV("caller", Caller->getName())
-                   << "' because of low probability: "
-                   << ore::NV("probability", Prob) << " (threshold: "
-                   << ore::NV("threshold", MinBranchProbability) << ")";
+                   << "' because of low frequency: "
+                   << ore::NV("block_freq", BlockFreq)
+                   << " (threshold: " << ore::NV("min_freq", MinFreq) << ")";
           });
           continue;
         }
@@ -188,7 +185,8 @@ PreservedAnalyses CoroAnnotationElidePass::run(LazyCallGraph::SCC &C,
           return OptimizationRemark(DEBUG_TYPE, "CoroAnnotationElide", Caller)
                  << "'" << ore::NV("callee", Callee->getName())
                  << "' elided in '" << ore::NV("caller", Caller->getName())
-                 << "' (probability: " << ore::NV("probability", Prob) << ")";
+                 << "' (block_freq: " << ore::NV("block_freq", BlockFreq)
+                 << ")";
         });
 
         FAM.invalidate(*Caller, PreservedAnalyses::none());
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 4c9b10a..cdc559b 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -156,9 +156,9 @@ Instruction *InstCombinerImpl::commonCastTransforms(CastInst &CI) {
   Value *Src = CI.getOperand(0);
   Type *Ty = CI.getType();
 
-  if (auto *SrcC = dyn_cast<Constant>(Src))
-    if (Constant *Res = ConstantFoldCastOperand(CI.getOpcode(), SrcC, Ty, DL))
-      return replaceInstUsesWith(CI, Res);
+  if (Value *Res =
+          simplifyCastInst(CI.getOpcode(), Src, Ty, SQ.getWithInstruction(&CI)))
+    return replaceInstUsesWith(CI, Res);
 
   // Try to eliminate a cast of a cast.
   if (auto *CSrc = dyn_cast<CastInst>(Src)) {   // A->B->C cast
diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index 511bca4..6e17801 100644
--- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -605,17 +605,16 @@ static ShadowMapping getShadowMapping(const Triple &TargetTriple, int LongSize,
   return Mapping;
 }
 
-namespace llvm {
-void getAddressSanitizerParams(const Triple &TargetTriple, int LongSize,
-                               bool IsKasan, uint64_t *ShadowBase,
-                               int *MappingScale, bool *OrShadowOffset) {
+void llvm::getAddressSanitizerParams(const Triple &TargetTriple, int LongSize,
+                                     bool IsKasan, uint64_t *ShadowBase,
+                                     int *MappingScale, bool *OrShadowOffset) {
   auto Mapping = getShadowMapping(TargetTriple, LongSize, IsKasan);
   *ShadowBase = Mapping.Offset;
   *MappingScale = Mapping.Scale;
   *OrShadowOffset = Mapping.OrShadowOffset;
 }
 
-void removeASanIncompatibleFnAttributes(Function &F, bool ReadsArgMem) {
+void llvm::removeASanIncompatibleFnAttributes(Function &F, bool ReadsArgMem) {
   // Sanitizer checks read from shadow, which invalidates memory(argmem: *).
   //
   // This is not only true for sanitized functions, because AttrInfer can
@@ -668,8 +667,6 @@ ASanAccessInfo::ASanAccessInfo(bool IsWrite, bool CompileKernel,
       AccessSizeIndex(AccessSizeIndex), IsWrite(IsWrite),
       CompileKernel(CompileKernel) {}
 
-} // namespace llvm
-
 static uint64_t getRedzoneSizeForScale(int MappingScale) {
   // Redzone used for stack and globals is at least 32 bytes.
   // For scales 6 and 7, the redzone has to be 64 and 128 bytes respectively.
@@ -677,11 +674,10 @@ static uint64_t getRedzoneSizeForScale(int MappingScale) {
 }
 
 static uint64_t GetCtorAndDtorPriority(Triple &TargetTriple) {
-  if (TargetTriple.isOSEmscripten()) {
+  if (TargetTriple.isOSEmscripten())
     return kAsanEmscriptenCtorAndDtorPriority;
-  } else {
+  else
     return kAsanCtorAndDtorPriority;
-  }
 }
 
 static Twine genName(StringRef suffix) {
diff --git a/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp b/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp
index 444b390..72e8e50 100644
--- a/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp
+++ b/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp
@@ -2092,8 +2092,6 @@ bool CHR::run() {
   return Changed;
 }
 
-namespace llvm {
-
 ControlHeightReductionPass::ControlHeightReductionPass() {
   parseCHRFilterFiles();
 }
@@ -2116,5 +2114,3 @@ PreservedAnalyses ControlHeightReductionPass::run(
     return PreservedAnalyses::all();
   return PreservedAnalyses::none();
 }
-
-} // namespace llvm
diff --git a/llvm/lib/Transforms/Scalar/GuardWidening.cpp b/llvm/lib/Transforms/Scalar/GuardWidening.cpp
index d99f1eb..ddb99a5 100644
--- a/llvm/lib/Transforms/Scalar/GuardWidening.cpp
+++ b/llvm/lib/Transforms/Scalar/GuardWidening.cpp
@@ -75,8 +75,6 @@ static cl::opt<bool>
                                "expressed as branches by widenable conditions"),
                       cl::init(true));
 
-namespace {
-
 // Get the condition of \p I. It can either be a guard or a conditional branch.
 static Value *getCondition(Instruction *I) {
   if (IntrinsicInst *GI = dyn_cast<IntrinsicInst>(I)) {
@@ -130,6 +128,8 @@ findInsertionPointForWideCondition(Instruction *WCOrGuard) {
   return std::nullopt;
 }
 
+namespace {
+
 class GuardWideningImpl {
   DominatorTree &DT;
   PostDominatorTree *PDT;
@@ -328,7 +328,7 @@ public:
   /// The entry point for this pass.
   bool run();
 };
-}
+} // namespace
 
 static bool isSupportedGuardInstruction(const Instruction *Insn) {
   if (isGuard(Insn))
diff --git a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
index c327311..7ebcc21 100644
--- a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -53,6 +53,7 @@
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/PassManager.h"
 #include "llvm/IR/PatternMatch.h"
@@ -117,6 +118,10 @@ static cl::opt<bool>
 LoopPredication("indvars-predicate-loops", cl::Hidden, cl::init(true),
                 cl::desc("Predicate conditions in read only loops"));
 
+static cl::opt<bool> LoopPredicationTraps(
+    "indvars-predicate-loop-traps", cl::Hidden, cl::init(true),
+    cl::desc("Predicate conditions that trap in loops with only local writes"));
+
 static cl::opt<bool>
 AllowIVWidening("indvars-widen-indvars", cl::Hidden, cl::init(true),
                 cl::desc("Allow widening of indvars to eliminate s/zext"));
@@ -1704,6 +1709,24 @@ bool IndVarSimplify::optimizeLoopExits(Loop *L, SCEVExpander &Rewriter) {
   return Changed;
 }
 
+static bool crashingBBWithoutEffect(const BasicBlock &BB) {
+  return llvm::all_of(BB, [](const Instruction &I) {
+    // TODO: for now this is overly restrictive, to make sure nothing in this
+    // BB can depend on the loop body.
+    // It's not enough to check for !I.mayHaveSideEffects(), because e.g. a
+    // load does not have a side effect, but we could have
+    // %a = load ptr, ptr %ptr
+    // %b = load i32, ptr %a
+    // Now if the loop stored a non-nullptr to %a, we could cause a nullptr
+    // dereference by skipping over loop iterations.
+    if (const auto *CB = dyn_cast<CallBase>(&I)) {
+      if (CB->onlyAccessesInaccessibleMemory())
+        return true;
+    }
+    return isa<UnreachableInst>(I);
+  });
+}
+
 bool IndVarSimplify::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) {
   SmallVector<BasicBlock*, 16> ExitingBlocks;
   L->getExitingBlocks(ExitingBlocks);
@@ -1816,11 +1839,25 @@ bool IndVarSimplify::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) {
   // suggestions on how to improve this?  I can obviously bail out for outer
   // loops, but that seems less than ideal.  MemorySSA can find memory writes,
   // is that enough for *all* side effects?
+  bool HasThreadLocalSideEffects = false;
   for (BasicBlock *BB : L->blocks())
     for (auto &I : *BB)
       // TODO:isGuaranteedToTransfer
-      if (I.mayHaveSideEffects())
-        return false;
+      if (I.mayHaveSideEffects()) {
+        if (!LoopPredicationTraps)
+          return false;
+        HasThreadLocalSideEffects = true;
+        if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
+          // Simple stores cannot be observed by other threads.
+          // If HasThreadLocalSideEffects is set, we check
+          // crashingBBWithoutEffect to make sure that the crashing BB cannot
+          // observe them either.
+          if (!SI->isSimple())
+            return false;
+        } else {
+          return false;
+        }
+      }
 
   bool Changed = false;
   // Finally, do the actual predication for all predicatable blocks.  A couple
@@ -1840,6 +1877,19 @@ bool IndVarSimplify::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) {
     const SCEV *ExitCount = SE->getExitCount(L, ExitingBB);
 
     auto *BI = cast<BranchInst>(ExitingBB->getTerminator());
+    if (HasThreadLocalSideEffects) {
+      const BasicBlock *Unreachable = nullptr;
+      for (const BasicBlock *Succ : BI->successors()) {
+        if (isa<UnreachableInst>(Succ->getTerminator()))
+          Unreachable = Succ;
+      }
+      // Exit BB which have one branch back into the loop and another one to
+      // a trap can still be optimized, because local side effects cannot
+      // be observed in the exit case (the trap). We could be smarter about
+      // this, but for now lets pattern match common cases that directly trap.
+      if (Unreachable == nullptr || !crashingBBWithoutEffect(*Unreachable))
+        return Changed;
+    }
     Value *NewCond;
     if (ExitCount == ExactBTC) {
       NewCond = L->contains(BI->getSuccessor(0)) ?
diff --git a/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp b/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp
index 3c14036e..6fb8197 100644
--- a/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp
+++ b/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp
@@ -26,8 +26,6 @@
 
 using namespace llvm;
 
-namespace llvm {
-
 static cl::opt<unsigned>
     JumpTableSizeThreshold("jump-table-to-switch-size-threshold", cl::Hidden,
                            cl::desc("Only split jump tables with size less or "
@@ -43,8 +41,8 @@ static cl::opt<unsigned> FunctionSizeThreshold(
              "or equal than this threshold."),
     cl::init(50));
 
+namespace llvm {
 extern cl::opt<bool> ProfcheckDisableMetadataFixes;
-
 } // end namespace llvm
 
 #define DEBUG_TYPE "jump-table-to-switch"
diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp
index 9655173..b2c526b 100644
--- a/llvm/lib/Transforms/Scalar/LICM.cpp
+++ b/llvm/lib/Transforms/Scalar/LICM.cpp
@@ -116,8 +116,6 @@ STATISTIC(NumIntAssociationsHoisted,
 STATISTIC(NumBOAssociationsHoisted, "Number of invariant BinaryOp expressions "
                                     "reassociated and hoisted out of the loop");
 
-namespace llvm {
-
 /// Memory promotion is enabled by default.
 static cl::opt<bool>
     DisablePromotion("disable-licm-promotion", cl::Hidden, cl::init(false),
@@ -156,7 +154,7 @@ static cl::opt<unsigned> IntAssociationUpperLimit(
 // which may not be precise, since optimizeUses is capped. The result is
 // correct, but we may not get as "far up" as possible to get which access is
 // clobbering the one queried.
-cl::opt<unsigned> SetLicmMssaOptCap(
+cl::opt<unsigned> llvm::SetLicmMssaOptCap(
     "licm-mssa-optimization-cap", cl::init(100), cl::Hidden,
     cl::desc("Enable imprecision in LICM in pathological cases, in exchange "
              "for faster compile. Caps the MemorySSA clobbering calls."));
@@ -164,15 +162,15 @@ cl::opt<unsigned> SetLicmMssaOptCap(
 // Experimentally, memory promotion carries less importance than sinking and
 // hoisting. Limit when we do promotion when using MemorySSA, in order to save
 // compile time.
-cl::opt<unsigned> SetLicmMssaNoAccForPromotionCap(
+cl::opt<unsigned> llvm::SetLicmMssaNoAccForPromotionCap(
     "licm-mssa-max-acc-promotion", cl::init(250), cl::Hidden,
     cl::desc("[LICM & MemorySSA] When MSSA in LICM is disabled, this has no "
              "effect. When MSSA in LICM is enabled, then this is the maximum "
              "number of accesses allowed to be present in a loop in order to "
              "enable memory promotion."));
 
+namespace llvm {
 extern cl::opt<bool> ProfcheckDisableMetadataFixes;
-
 } // end namespace llvm
 
 static bool inSubLoop(BasicBlock *BB, Loop *CurLoop, LoopInfo *LI);
@@ -1120,11 +1118,10 @@ static bool isLoadInvariantInLoop(LoadInst *LI, DominatorTree *DT,
   return false;
 }
 
-namespace {
 /// Return true if-and-only-if we know how to (mechanically) both hoist and
 /// sink a given instruction out of a loop.  Does not address legality
 /// concerns such as aliasing or speculation safety.
-bool isHoistableAndSinkableInst(Instruction &I) {
+static bool isHoistableAndSinkableInst(Instruction &I) {
   // Only these instructions are hoistable/sinkable.
   return (isa<LoadInst>(I) || isa<StoreInst>(I) || isa<CallInst>(I) ||
           isa<FenceInst>(I) || isa<CastInst>(I) || isa<UnaryOperator>(I) ||
@@ -1136,8 +1133,8 @@ bool isHoistableAndSinkableInst(Instruction &I) {
 }
 
 /// Return true if I is the only Instruction with a MemoryAccess in L.
-bool isOnlyMemoryAccess(const Instruction *I, const Loop *L,
-                        const MemorySSAUpdater &MSSAU) {
+static bool isOnlyMemoryAccess(const Instruction *I, const Loop *L,
+                               const MemorySSAUpdater &MSSAU) {
   for (auto *BB : L->getBlocks())
     if (auto *Accs = MSSAU.getMemorySSA()->getBlockAccesses(BB)) {
       int NotAPhi = 0;
@@ -1151,7 +1148,6 @@ bool isOnlyMemoryAccess(const Instruction *I, const Loop *L,
     }
   return true;
 }
-}
 
 static MemoryAccess *getClobberingMemoryAccess(MemorySSA &MSSA,
                                                BatchAAResults &BAA,
diff --git a/llvm/lib/Transforms/Scalar/LoopBoundSplit.cpp b/llvm/lib/Transforms/Scalar/LoopBoundSplit.cpp
index 73f1942..7706de8 100644
--- a/llvm/lib/Transforms/Scalar/LoopBoundSplit.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopBoundSplit.cpp
@@ -21,8 +21,7 @@
 
 #define DEBUG_TYPE "loop-bound-split"
 
-namespace llvm {
-
+using namespace llvm;
 using namespace PatternMatch;
 
 namespace {
@@ -358,8 +357,7 @@ static bool splitLoopBound(Loop &L, DominatorTree &DT, LoopInfo &LI,
   IRBuilder<> Builder(&PostLoopPreHeader->front());
 
   // Update phi nodes in header of post-loop.
-  bool isExitingLatch =
-      (L.getExitingBlock() == L.getLoopLatch()) ? true : false;
+  bool isExitingLatch = L.getExitingBlock() == L.getLoopLatch();
   Value *ExitingCondLCSSAPhi = nullptr;
   for (PHINode &PN : L.getHeader()->phis()) {
     // Create LCSSA phi node in preheader of post-loop.
@@ -472,8 +470,7 @@ static bool splitLoopBound(Loop &L, DominatorTree &DT, LoopInfo &LI,
 PreservedAnalyses LoopBoundSplitPass::run(Loop &L, LoopAnalysisManager &AM,
                                           LoopStandardAnalysisResults &AR,
                                           LPMUpdater &U) {
-  Function &F = *L.getHeader()->getParent();
-  (void)F;
+  [[maybe_unused]] Function &F = *L.getHeader()->getParent();
 
   LLVM_DEBUG(dbgs() << "Spliting bound of loop in " << F.getName() << ": " << L
                     << "\n");
@@ -486,5 +483,3 @@ PreservedAnalyses LoopBoundSplitPass::run(Loop &L, LoopAnalysisManager &AM,
 
   return getLoopPassPreservedAnalyses();
 }
-
-} // end namespace llvm
diff --git a/llvm/lib/Transforms/Scalar/LoopPassManager.cpp b/llvm/lib/Transforms/Scalar/LoopPassManager.cpp
index 32078b1..7da8586 100644
--- a/llvm/lib/Transforms/Scalar/LoopPassManager.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopPassManager.cpp
@@ -16,8 +16,6 @@
 
 using namespace llvm;
 
-namespace llvm {
-
 /// Explicitly specialize the pass manager's run method to handle loop nest
 /// structure updates.
 PreservedAnalyses
@@ -185,7 +183,6 @@ LoopPassManager::runWithoutLoopNestPasses(Loop &L, LoopAnalysisManager &AM,
   }
   return PA;
 }
-} // namespace llvm
 
 void FunctionToLoopPassAdaptor::printPipeline(
     raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
@@ -193,6 +190,7 @@ void FunctionToLoopPassAdaptor::printPipeline(
   Pass->printPipeline(OS, MapClassName2PassName);
   OS << ')';
 }
+
 PreservedAnalyses FunctionToLoopPassAdaptor::run(Function &F,
                                                  FunctionAnalysisManager &AM) {
   // Before we even compute any loop analyses, first run a miniature function
diff --git a/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp b/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp
index 448dc2b..f3e6cbf 100644
--- a/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp
@@ -540,8 +540,6 @@ bool LoopVersioningLICM::run(DominatorTree *DT) {
   return Changed;
 }
 
-namespace llvm {
-
 PreservedAnalyses LoopVersioningLICMPass::run(Loop &L, LoopAnalysisManager &AM,
                                               LoopStandardAnalysisResults &LAR,
                                               LPMUpdater &U) {
@@ -556,4 +554,3 @@ PreservedAnalyses LoopVersioningLICMPass::run(Loop &L, LoopAnalysisManager &AM,
     return PreservedAnalyses::all();
   return getLoopPassPreservedAnalyses();
 }
-} // namespace llvm
diff --git a/llvm/lib/Transforms/Scalar/NewGVN.cpp b/llvm/lib/Transforms/Scalar/NewGVN.cpp
index 80aa98d..5a8f18a 100644
--- a/llvm/lib/Transforms/Scalar/NewGVN.cpp
+++ b/llvm/lib/Transforms/Scalar/NewGVN.cpp
@@ -160,9 +160,6 @@ static cl::opt<bool> EnablePhiOfOps("enable-phi-of-ops", cl::init(true),
 //===----------------------------------------------------------------------===//
 
 // Anchor methods.
-namespace llvm {
-namespace GVNExpression {
-
 Expression::~Expression() = default;
 BasicExpression::~BasicExpression() = default;
 CallExpression::~CallExpression() = default;
@@ -171,9 +168,6 @@ StoreExpression::~StoreExpression() = default;
 AggregateValueExpression::~AggregateValueExpression() = default;
 PHIExpression::~PHIExpression() = default;
 
-} // end namespace GVNExpression
-} // end namespace llvm
-
 namespace {
 
 // Tarjan's SCC finding algorithm with Nuutila's improvements
diff --git a/llvm/lib/Transforms/Scalar/Reassociate.cpp b/llvm/lib/Transforms/Scalar/Reassociate.cpp
index ba58b8e..6d7ce36 100644
--- a/llvm/lib/Transforms/Scalar/Reassociate.cpp
+++ b/llvm/lib/Transforms/Scalar/Reassociate.cpp
@@ -2623,32 +2623,32 @@ PreservedAnalyses ReassociatePass::run(Function &F, FunctionAnalysisManager &) {
 
 namespace {
 
-  class ReassociateLegacyPass : public FunctionPass {
-    ReassociatePass Impl;
+class ReassociateLegacyPass : public FunctionPass {
+  ReassociatePass Impl;
 
-  public:
-    static char ID; // Pass identification, replacement for typeid
+public:
+  static char ID; // Pass identification, replacement for typeid
 
-    ReassociateLegacyPass() : FunctionPass(ID) {
-      initializeReassociateLegacyPassPass(*PassRegistry::getPassRegistry());
-    }
+  ReassociateLegacyPass() : FunctionPass(ID) {
+    initializeReassociateLegacyPassPass(*PassRegistry::getPassRegistry());
+  }
 
-    bool runOnFunction(Function &F) override {
-      if (skipFunction(F))
-        return false;
+  bool runOnFunction(Function &F) override {
+    if (skipFunction(F))
+      return false;
 
-      FunctionAnalysisManager DummyFAM;
-      auto PA = Impl.run(F, DummyFAM);
-      return !PA.areAllPreserved();
-    }
+    FunctionAnalysisManager DummyFAM;
+    auto PA = Impl.run(F, DummyFAM);
+    return !PA.areAllPreserved();
+  }
 
-    void getAnalysisUsage(AnalysisUsage &AU) const override {
-      AU.setPreservesCFG();
-      AU.addPreserved<AAResultsWrapperPass>();
-      AU.addPreserved<BasicAAWrapperPass>();
-      AU.addPreserved<GlobalsAAWrapperPass>();
-    }
-  };
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addPreserved<AAResultsWrapperPass>();
+    AU.addPreserved<BasicAAWrapperPass>();
+    AU.addPreserved<GlobalsAAWrapperPass>();
+  }
+};
 
 } // end anonymous namespace
 
diff --git a/llvm/lib/Transforms/Scalar/Scalarizer.cpp b/llvm/lib/Transforms/Scalar/Scalarizer.cpp
index aae5d60..25a531c 100644
--- a/llvm/lib/Transforms/Scalar/Scalarizer.cpp
+++ b/llvm/lib/Transforms/Scalar/Scalarizer.cpp
@@ -50,9 +50,7 @@ using namespace llvm;
 
 #define DEBUG_TYPE "scalarizer"
 
-namespace {
-
-BasicBlock::iterator skipPastPhiNodesAndDbg(BasicBlock::iterator Itr) {
+static BasicBlock::iterator skipPastPhiNodesAndDbg(BasicBlock::iterator Itr) {
   BasicBlock *BB = Itr->getParent();
   if (isa<PHINode>(Itr))
     Itr = BB->getFirstInsertionPt();
@@ -76,6 +74,8 @@ using ScatterMap = std::map<std::pair<Value *, Type *>, ValueVector>;
 // along with a pointer to their scattered forms.
 using GatherList = SmallVector<std::pair<Instruction *, ValueVector *>, 16>;
 
+namespace {
+
 struct VectorSplit {
   // The type of the vector.
   FixedVectorType *VecTy = nullptr;
@@ -196,6 +196,7 @@ struct VectorLayout {
   // The size of each (non-remainder) fragment in bytes.
   uint64_t SplitSize = 0;
 };
+} // namespace
 
 static bool isStructOfMatchingFixedVectors(Type *Ty) {
   if (!isa<StructType>(Ty))
@@ -268,6 +269,7 @@ static Value *concatenate(IRBuilder<> &Builder, ArrayRef<Value *> Fragments,
   return Res;
 }
 
+namespace {
 class ScalarizerVisitor : public InstVisitor<ScalarizerVisitor, bool> {
 public:
   ScalarizerVisitor(DominatorTree *DT, const TargetTransformInfo *TTI,
diff --git a/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp b/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
index ebcbd2b..fa66a03 100644
--- a/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
+++ b/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
@@ -149,8 +149,6 @@ bool SpeculativeExecutionLegacyPass::runOnFunction(Function &F) {
   return Impl.runImpl(F, TTI);
 }
 
-namespace llvm {
-
 bool SpeculativeExecutionPass::runImpl(Function &F, TargetTransformInfo *TTI) {
   if (OnlyIfDivergentTarget && !TTI->hasBranchDivergence(&F)) {
     LLVM_DEBUG(dbgs() << "Not running SpeculativeExecution because "
@@ -328,11 +326,11 @@ bool SpeculativeExecutionPass::considerHoistingFromTo(
   return true;
 }
 
-FunctionPass *createSpeculativeExecutionPass() {
+FunctionPass *llvm::createSpeculativeExecutionPass() {
   return new SpeculativeExecutionLegacyPass();
 }
 
-FunctionPass *createSpeculativeExecutionIfHasBranchDivergencePass() {
+FunctionPass *llvm::createSpeculativeExecutionIfHasBranchDivergencePass() {
   return new SpeculativeExecutionLegacyPass(/* OnlyIfDivergentTarget = */ true);
 }
 
@@ -362,4 +360,3 @@ void SpeculativeExecutionPass::printPipeline(
     OS << "only-if-divergent-target";
   OS << '>';
 }
-}  // namespace llvm
diff --git a/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
index 7d01709..e94ad19 100644
--- a/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
@@ -716,8 +716,6 @@ bool StraightLineStrengthReduce::runOnFunction(Function &F) {
   return Ret;
 }
 
-namespace llvm {
-
 PreservedAnalyses
 StraightLineStrengthReducePass::run(Function &F, FunctionAnalysisManager &AM) {
   const DataLayout *DL = &F.getDataLayout();
@@ -735,5 +733,3 @@ StraightLineStrengthReducePass::run(Function &F, FunctionAnalysisManager &AM) {
   PA.preserve<TargetIRAnalysis>();
   return PA;
 }
-
-} // namespace llvm
diff --git a/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp b/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
index 1d83ddc..89d41f3e 100644
--- a/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
@@ -192,7 +192,7 @@ struct AllocaDerivedValueTracker {
   SmallPtrSet<Instruction *, 32> AllocaUsers;
   SmallPtrSet<Instruction *, 32> EscapePoints;
 };
-}
+} // namespace
 
 static bool markTails(Function &F, OptimizationRemarkEmitter *ORE) {
   if (F.callsFunctionThatReturnsTwice())
@@ -967,7 +967,7 @@ struct TailCallElim : public FunctionPass {
         /*BFI=*/nullptr);
   }
 };
-}
+} // namespace
 
 char TailCallElim::ID = 0;
 INITIALIZE_PASS_BEGIN(TailCallElim, "tailcallelim", "Tail Call Elimination",
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 62a81ba..280eb20 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7957,9 +7957,9 @@ bool VPRecipeBuilder::getScaledReductions(
   auto CollectExtInfo = [this, &Exts, &ExtOpTypes,
                          &ExtKinds](SmallVectorImpl<Value *> &Ops) -> bool {
     for (const auto &[I, OpI] : enumerate(Ops)) {
-      auto *CI = dyn_cast<ConstantInt>(OpI);
-      if (I > 0 && CI &&
-          canConstantBeExtended(CI, ExtOpTypes[0], ExtKinds[0])) {
+      const APInt *C;
+      if (I > 0 && match(OpI, m_APInt(C)) &&
+          canConstantBeExtended(C, ExtOpTypes[0], ExtKinds[0])) {
         ExtOpTypes[I] = ExtOpTypes[0];
         ExtKinds[I] = ExtKinds[0];
         continue;
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 0101942..d167009 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -1753,14 +1753,14 @@ void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
 }
 #endif
 
-bool llvm::canConstantBeExtended(const ConstantInt *CI, Type *NarrowType,
+bool llvm::canConstantBeExtended(const APInt *C, Type *NarrowType,
                                  TTI::PartialReductionExtendKind ExtKind) {
-  APInt TruncatedVal = CI->getValue().trunc(NarrowType->getScalarSizeInBits());
-  unsigned WideSize = CI->getType()->getScalarSizeInBits();
+  APInt TruncatedVal = C->trunc(NarrowType->getScalarSizeInBits());
+  unsigned WideSize = C->getBitWidth();
   APInt ExtendedVal = ExtKind == TTI::PR_SignExtend
                           ? TruncatedVal.sext(WideSize)
                           : TruncatedVal.zext(WideSize);
-  return ExtendedVal == CI->getValue();
+  return ExtendedVal == *C;
 }
 
 TargetTransformInfo::OperandValueInfo
diff --git a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
index 1580a3b..2aaabd9 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
@@ -474,7 +474,7 @@ public:
 
 /// Check if a constant \p CI can be safely treated as having been extended
 /// from a narrower type with the given extension kind.
-bool canConstantBeExtended(const ConstantInt *CI, Type *NarrowType,
+bool canConstantBeExtended(const APInt *C, Type *NarrowType,
                            TTI::PartialReductionExtendKind ExtKind);
 } // end namespace llvm
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
index ff286f7..d8203e2 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -173,10 +173,10 @@ inline int_pred_ty<is_zero_int> m_ZeroInt() {
 /// For vectors, this includes constants with undefined elements.
 inline int_pred_ty<is_one> m_One() { return int_pred_ty<is_one>(); }
 
-struct bind_const_int {
-  uint64_t &Res;
+struct bind_apint {
+  const APInt *&Res;
 
-  bind_const_int(uint64_t &Res) : Res(Res) {}
+  bind_apint(const APInt *&Res) : Res(Res) {}
 
   bool match(VPValue *VPV) const {
     if (!VPV->isLiveIn())
@@ -188,7 +188,23 @@ struct bind_const_int {
     const auto *CI = dyn_cast<ConstantInt>(V);
     if (!CI)
       return false;
-    if (auto C = CI->getValue().tryZExtValue()) {
+    Res = &CI->getValue();
+    return true;
+  }
+};
+
+inline bind_apint m_APInt(const APInt *&C) { return C; }
+
+struct bind_const_int {
+  uint64_t &Res;
+
+  bind_const_int(uint64_t &Res) : Res(Res) {}
+
+  bool match(VPValue *VPV) const {
+    const APInt *APConst;
+    if (!bind_apint(APConst).match(VPV))
+      return false;
+    if (auto C = APConst->tryZExtValue()) {
       Res = *C;
       return true;
     }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 775837f..7a98c75 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -341,12 +341,12 @@ VPPartialReductionRecipe::computeCost(ElementCount VF,
     ExtAType = GetExtendKind(ExtAR);
     ExtBType = GetExtendKind(ExtBR);
 
-    if (!ExtBR && Widen->getOperand(1)->isLiveIn()) {
-      auto *CI = cast<ConstantInt>(Widen->getOperand(1)->getLiveInIRValue());
-      if (canConstantBeExtended(CI, InputTypeA, ExtAType)) {
-        InputTypeB = InputTypeA;
-        ExtBType = ExtAType;
-      }
+    using namespace VPlanPatternMatch;
+    const APInt *C;
+    if (!ExtBR && match(Widen->getOperand(1), m_APInt(C)) &&
+        canConstantBeExtended(C, InputTypeA, ExtAType)) {
+      InputTypeB = InputTypeA;
+      ExtBType = ExtAType;
     }
   };
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 8d76b2d8..cae9aee8 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2122,9 +2122,18 @@ static void licm(VPlan &Plan) {
   VPBasicBlock *Preheader = Plan.getVectorPreheader();
 
   // Return true if we do not know how to (mechanically) hoist a given recipe
-  // out of a loop region. Does not address legality concerns such as aliasing
-  // or speculation safety.
+  // out of a loop region.
   auto CannotHoistRecipe = [](VPRecipeBase &R) {
+    // Assumes don't alias anything or throw; as long as they're guaranteed to
+    // execute, they're safe to hoist.
+    if (match(&R, m_Intrinsic<Intrinsic::assume>()))
+      return false;
+
+    // TODO: Relax checks in the future, e.g. we could also hoist reads, if
+    // their memory location is not modified in the vector loop.
+    if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi())
+      return true;
+
     // Allocas cannot be hoisted.
     auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
     return RepR && RepR->getOpcode() == Instruction::Alloca;
@@ -2132,17 +2141,18 @@ static void licm(VPlan &Plan) {
 
   // Hoist any loop invariant recipes from the vector loop region to the
   // preheader. Preform a shallow traversal of the vector loop region, to
-  // exclude recipes in replicate regions.
+  // exclude recipes in replicate regions. Since the top-level blocks in the
+  // vector loop region are guaranteed to execute if the vector pre-header is,
+  // we don't need to check speculation safety.
   VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
+  assert(Preheader->getSingleSuccessor() == LoopRegion &&
+         "Expected vector prehader's successor to be the vector loop region");
   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
            vp_depth_first_shallow(LoopRegion->getEntry()))) {
     for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
       if (CannotHoistRecipe(R))
         continue;
-      // TODO: Relax checks in the future, e.g. we could also hoist reads, if
-      // their memory location is not modified in the vector loop.
-      if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi() ||
-          any_of(R.operands(), [](VPValue *Op) {
+      if (any_of(R.operands(), [](VPValue *Op) {
             return !Op->isDefinedOutsideLoopRegions();
           }))
         continue;