Diffstat (limited to 'llvm/lib')
23 files changed, 408 insertions, 228 deletions
diff --git a/llvm/lib/Analysis/CaptureTracking.cpp b/llvm/lib/Analysis/CaptureTracking.cpp index a0fe7f9..22229d9 100644 --- a/llvm/lib/Analysis/CaptureTracking.cpp +++ b/llvm/lib/Analysis/CaptureTracking.cpp @@ -320,8 +320,12 @@ UseCaptureInfo llvm::DetermineUseCaptureKind(const Use &U, const Value *Base) { return CaptureComponents::None; case Instruction::Store: // Stored the pointer - conservatively assume it may be captured. + if (U.getOperandNo() == 0) + return MDNode::toCaptureComponents( + I->getMetadata(LLVMContext::MD_captures)); + // Volatile stores make the address observable. - if (U.getOperandNo() == 0 || cast<StoreInst>(I)->isVolatile()) + if (cast<StoreInst>(I)->isVolatile()) return CaptureComponents::All; return CaptureComponents::None; case Instruction::AtomicRMW: { diff --git a/llvm/lib/Analysis/LazyValueInfo.cpp b/llvm/lib/Analysis/LazyValueInfo.cpp index 6fb2807..0e5bc48 100644 --- a/llvm/lib/Analysis/LazyValueInfo.cpp +++ b/llvm/lib/Analysis/LazyValueInfo.cpp @@ -1632,19 +1632,25 @@ LazyValueInfoImpl::getEdgeValueLocal(Value *Val, BasicBlock *BBFrom, *getValueFromCondition(Usr->getOperand(0), Condition, isTrueDest, /*UseBlockValue*/ false); - if (!OpLatticeVal.isConstantRange()) - return OpLatticeVal; + if (OpLatticeVal.isConstantRange()) { + const unsigned ResultBitWidth = + Usr->getType()->getScalarSizeInBits(); + if (auto *Trunc = dyn_cast<TruncInst>(Usr)) + return ValueLatticeElement::getRange( + OpLatticeVal.getConstantRange().truncate( + ResultBitWidth, Trunc->getNoWrapKind())); - const unsigned ResultBitWidth = - Usr->getType()->getScalarSizeInBits(); - if (auto *Trunc = dyn_cast<TruncInst>(Usr)) return ValueLatticeElement::getRange( - OpLatticeVal.getConstantRange().truncate( - ResultBitWidth, Trunc->getNoWrapKind())); - - return ValueLatticeElement::getRange( - OpLatticeVal.getConstantRange().castOp( - cast<CastInst>(Usr)->getOpcode(), ResultBitWidth)); + OpLatticeVal.getConstantRange().castOp( + cast<CastInst>(Usr)->getOpcode(), ResultBitWidth)); + } + if (OpLatticeVal.isConstant()) { + Constant *C = OpLatticeVal.getConstant(); + if (auto *CastC = ConstantFoldCastOperand( + cast<CastInst>(Usr)->getOpcode(), C, Usr->getType(), DL)) + return ValueLatticeElement::get(CastC); + } + return ValueLatticeElement::getOverdefined(); } else { // If one of Val's operand has an inferred value, we may be able to // infer the value of Val. 
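For context on the LazyValueInfo hunk above: when the operand's lattice value is a single Constant rather than a constant range, the new branch folds it through the user's cast and otherwise falls back to overdefined. The following is a minimal standalone sketch of just that constant case; the helper name is hypothetical and this is not code from the patch.

#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/ValueLattice.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/InstrTypes.h"

using namespace llvm;

// Hypothetical helper mirroring the new constant branch in getEdgeValueLocal().
static ValueLatticeElement castConstantLattice(const ValueLatticeElement &Op,
                                               const CastInst *CI,
                                               const DataLayout &DL) {
  if (!Op.isConstant())
    return ValueLatticeElement::getOverdefined();
  // Fold e.g. trunc/zext/sext of a plain Constant; give up (overdefined) if
  // the constant cannot be folded through this cast.
  if (Constant *Folded = ConstantFoldCastOperand(
          CI->getOpcode(), Op.getConstant(), CI->getType(), DL))
    return ValueLatticeElement::get(Folded);
  return ValueLatticeElement::getOverdefined();
}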
diff --git a/llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp b/llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp index 09ac0f1..f794780 100644 --- a/llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp @@ -599,8 +599,7 @@ Expected<std::unique_ptr<LinkGraph>> createLinkGraphFromMachOObject_arm64( } static Error applyPACSigningToModInitPointers(LinkGraph &G) { - assert(G.getTargetTriple().getSubArch() == Triple::AArch64SubArch_arm64e && - "PAC signing only valid for arm64e"); + assert(G.getTargetTriple().isArm64e() && "PAC signing only valid for arm64e"); if (auto *ModInitSec = G.findSectionByName("__DATA,__mod_init_func")) { for (auto *B : ModInitSec->blocks()) { diff --git a/llvm/lib/IR/Metadata.cpp b/llvm/lib/IR/Metadata.cpp index 9cfb0ff..1add0c7 100644 --- a/llvm/lib/IR/Metadata.cpp +++ b/llvm/lib/IR/Metadata.cpp @@ -48,6 +48,7 @@ #include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" +#include "llvm/Support/ModRef.h" #include <cassert> #include <cstddef> #include <cstdint> @@ -1435,6 +1436,40 @@ MDNode *MDNode::getMostGenericAlignmentOrDereferenceable(MDNode *A, MDNode *B) { return B; } +CaptureComponents MDNode::toCaptureComponents(const MDNode *MD) { + if (!MD) + return CaptureComponents::All; + + CaptureComponents CC = CaptureComponents::None; + for (Metadata *Op : MD->operands()) { + CaptureComponents Component = + StringSwitch<CaptureComponents>(cast<MDString>(Op)->getString()) + .Case("address", CaptureComponents::Address) + .Case("address_is_null", CaptureComponents::AddressIsNull) + .Case("provenance", CaptureComponents::Provenance) + .Case("read_provenance", CaptureComponents::ReadProvenance); + CC |= Component; + } + return CC; +} + +MDNode *MDNode::fromCaptureComponents(LLVMContext &Ctx, CaptureComponents CC) { + assert(!capturesNothing(CC) && "Can't encode captures(none)"); + if (capturesAll(CC)) + return nullptr; + + SmallVector<Metadata *> Components; + if (capturesAddressIsNullOnly(CC)) + Components.push_back(MDString::get(Ctx, "address_is_null")); + else if (capturesAddress(CC)) + Components.push_back(MDString::get(Ctx, "address")); + if (capturesReadProvenanceOnly(CC)) + Components.push_back(MDString::get(Ctx, "read_provenance")); + else if (capturesFullProvenance(CC)) + Components.push_back(MDString::get(Ctx, "provenance")); + return MDNode::get(Ctx, Components); +} + //===----------------------------------------------------------------------===// // NamedMDNode implementation. 
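As a usage note for the MDNode helpers added in the Metadata.cpp hunk above, here is a minimal sketch of how a capture set round-trips through !captures metadata. The function name is hypothetical and purely illustrative; only fromCaptureComponents/toCaptureComponents come from the patch.

#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/Support/ModRef.h"
#include <cassert>

using namespace llvm;

static void capturesRoundTrip() {
  LLVMContext Ctx;
  // Encode "only the address may be captured" as a !captures node; the node
  // carries a single MDString operand, "address".
  MDNode *MD = MDNode::fromCaptureComponents(Ctx, CaptureComponents::Address);
  // Decoding recovers the same components; a null node decodes to All,
  // i.e. no useful information.
  CaptureComponents CC = MDNode::toCaptureComponents(MD);
  assert(CC == CaptureComponents::Address);
  (void)CC;
}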
// diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 8c03d6f..6b3cd27 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -542,6 +542,7 @@ private: void visitAliasScopeMetadata(const MDNode *MD); void visitAliasScopeListMetadata(const MDNode *MD); void visitAccessGroupMetadata(const MDNode *MD); + void visitCapturesMetadata(Instruction &I, const MDNode *Captures); template <class Ty> bool isValidMetadataArray(const MDTuple &N); #define HANDLE_SPECIALIZED_MDNODE_LEAF(CLASS) void visit##CLASS(const CLASS &N); @@ -5373,6 +5374,27 @@ void Verifier::visitAccessGroupMetadata(const MDNode *MD) { } } +void Verifier::visitCapturesMetadata(Instruction &I, const MDNode *Captures) { + static const char *ValidArgs[] = {"address_is_null", "address", + "read_provenance", "provenance"}; + + auto *SI = dyn_cast<StoreInst>(&I); + Check(SI, "!captures metadata can only be applied to store instructions", &I); + Check(SI->getValueOperand()->getType()->isPointerTy(), + "!captures metadata can only be applied to store with value operand of " + "pointer type", + &I); + Check(Captures->getNumOperands() != 0, "!captures metadata cannot be empty", + &I); + + for (Metadata *Op : Captures->operands()) { + auto *Str = dyn_cast<MDString>(Op); + Check(Str, "!captures metadata must be a list of strings", &I); + Check(is_contained(ValidArgs, Str->getString()), + "invalid entry in !captures metadata", &I, Str); + } +} + /// verifyInstruction - Verify that an instruction is well formed. /// void Verifier::visitInstruction(Instruction &I) { @@ -5600,6 +5622,9 @@ void Verifier::visitInstruction(Instruction &I) { if (MDNode *Annotation = I.getMetadata(LLVMContext::MD_annotation)) visitAnnotationMetadata(Annotation); + if (MDNode *Captures = I.getMetadata(LLVMContext::MD_captures)) + visitCapturesMetadata(I, Captures); + if (MDNode *N = I.getDebugLoc().getAsMDNode()) { CheckDI(isa<DILocation>(N), "invalid !dbg metadata attachment", &I, N); visitMDNode(*N, AreDebugLocsAllowed::Yes); diff --git a/llvm/lib/Object/OffloadBundle.cpp b/llvm/lib/Object/OffloadBundle.cpp index a6a9628a..0dd378e 100644 --- a/llvm/lib/Object/OffloadBundle.cpp +++ b/llvm/lib/Object/OffloadBundle.cpp @@ -120,15 +120,14 @@ OffloadBundleFatBin::create(MemoryBufferRef Buf, uint64_t SectionOffset, if (identify_magic(Buf.getBuffer()) != file_magic::offload_bundle) return errorCodeToError(object_error::parse_failed); - std::unique_ptr<OffloadBundleFatBin> TheBundle( - new OffloadBundleFatBin(Buf, FileName)); + OffloadBundleFatBin *TheBundle = new OffloadBundleFatBin(Buf, FileName); // Read the Bundle Entries Error Err = TheBundle->readEntries(Buf.getBuffer(), SectionOffset); if (Err) return Err; - return TheBundle; + return std::unique_ptr<OffloadBundleFatBin>(TheBundle); } Error OffloadBundleFatBin::extractBundle(const ObjectFile &Source) { diff --git a/llvm/lib/Support/Mustache.cpp b/llvm/lib/Support/Mustache.cpp index 6275e5e..47860c0 100644 --- a/llvm/lib/Support/Mustache.cpp +++ b/llvm/lib/Support/Mustache.cpp @@ -329,6 +329,36 @@ struct Tag { size_t StartPosition = StringRef::npos; }; +[[maybe_unused]] static const char *tagKindToString(Tag::Kind K) { + switch (K) { + case Tag::Kind::None: + return "None"; + case Tag::Kind::Normal: + return "Normal"; + case Tag::Kind::Triple: + return "Triple"; + } + llvm_unreachable("Unknown Tag::Kind"); +} + +[[maybe_unused]] static const char *jsonKindToString(json::Value::Kind K) { + switch (K) { + case json::Value::Kind::Null: + return "JSON_KIND_NULL"; + case 
json::Value::Kind::Boolean: + return "JSON_KIND_BOOLEAN"; + case json::Value::Kind::Number: + return "JSON_KIND_NUMBER"; + case json::Value::Kind::String: + return "JSON_KIND_STRING"; + case json::Value::Kind::Array: + return "JSON_KIND_ARRAY"; + case json::Value::Kind::Object: + return "JSON_KIND_OBJECT"; + } + llvm_unreachable("Unknown json::Value::Kind"); +} + static Tag findNextTag(StringRef Template, size_t StartPos, StringRef Open, StringRef Close) { const StringLiteral TripleOpen("{{{"); @@ -373,11 +403,10 @@ static Tag findNextTag(StringRef Template, size_t StartPos, StringRef Open, static std::optional<std::pair<StringRef, StringRef>> processTag(const Tag &T, SmallVectorImpl<Token> &Tokens) { - LLVM_DEBUG(dbgs() << " Found tag: \"" << T.FullMatch << "\", Content: \"" - << T.Content << "\"\n"); + LLVM_DEBUG(dbgs() << "[Tag] " << T.FullMatch << ", Content: " << T.Content + << ", Kind: " << tagKindToString(T.TagKind) << "\n"); if (T.TagKind == Tag::Kind::Triple) { Tokens.emplace_back(T.FullMatch.str(), "&" + T.Content.str(), '&'); - LLVM_DEBUG(dbgs() << " Created UnescapeVariable token.\n"); return std::nullopt; } StringRef Interpolated = T.Content; @@ -385,7 +414,6 @@ processTag(const Tag &T, SmallVectorImpl<Token> &Tokens) { if (!Interpolated.trim().starts_with("=")) { char Front = Interpolated.empty() ? ' ' : Interpolated.trim().front(); Tokens.emplace_back(RawBody, Interpolated.str(), Front); - LLVM_DEBUG(dbgs() << " Created tag token of type '" << Front << "'\n"); return std::nullopt; } Tokens.emplace_back(RawBody, Interpolated.str(), '='); @@ -395,8 +423,8 @@ processTag(const Tag &T, SmallVectorImpl<Token> &Tokens) { DelimSpec = DelimSpec.trim(); std::pair<StringRef, StringRef> Ret = DelimSpec.split(' '); - LLVM_DEBUG(dbgs() << " Found Set Delimiter tag. NewOpen='" << Ret.first - << "', NewClose='" << Ret.second << "'\n"); + LLVM_DEBUG(dbgs() << "[Set Delimiter] NewOpen: " << Ret.first + << ", NewClose: " << Ret.second << "\n"); return Ret; } @@ -405,15 +433,15 @@ processTag(const Tag &T, SmallVectorImpl<Token> &Tokens) { // but we don't support that here. An unescape variable // is represented only by {{& variable}}. static SmallVector<Token> tokenize(StringRef Template) { - LLVM_DEBUG(dbgs() << "Tokenizing template: \"" << Template << "\"\n"); + LLVM_DEBUG(dbgs() << "[Tokenize Template] \"" << Template << "\"\n"); SmallVector<Token> Tokens; SmallString<8> Open("{{"); SmallString<8> Close("}}"); size_t Start = 0; while (Start < Template.size()) { - LLVM_DEBUG(dbgs() << "Loop start. 
Start=" << Start << ", Open='" << Open - << "', Close='" << Close << "'\n"); + LLVM_DEBUG(dbgs() << "[Tokenize Loop] Start:" << Start << ", Open:'" << Open + << "', Close:'" << Close << "'\n"); Tag T = findNextTag(Template, Start, Open, Close); if (T.TagKind == Tag::Kind::None) { @@ -428,7 +456,6 @@ static SmallVector<Token> tokenize(StringRef Template) { if (T.StartPosition > Start) { StringRef Text = Template.substr(Start, T.StartPosition - Start); Tokens.emplace_back(Text.str()); - LLVM_DEBUG(dbgs() << " Created Text token: \"" << Text << "\"\n"); } if (auto NewDelims = processTag(T, Tokens)) { @@ -479,7 +506,6 @@ static SmallVector<Token> tokenize(StringRef Template) { if ((!HasTextBehind && !HasTextAhead) || (!HasTextBehind && Idx == LastIdx)) stripTokenBefore(Tokens, Idx, CurrentToken, CurrentType); } - LLVM_DEBUG(dbgs() << "Tokenizing finished.\n"); return Tokens; } @@ -545,8 +571,8 @@ protected: Indent.resize(Indentation, ' '); for (char C : Data) { - LLVM_DEBUG(dbgs() << "IndentationStream: NeedsIndent=" << NeedsIndent - << ", C='" << C << "', Indentation=" << Indentation + LLVM_DEBUG(dbgs() << "[Indentation Stream] NeedsIndent:" << NeedsIndent + << ", C:'" << C << "', Indentation:" << Indentation << "\n"); if (NeedsIndent && C != '\n') { WrappedStream << Indent; @@ -654,7 +680,9 @@ void Parser::parseMustache(ASTNode *Parent) { } } static void toMustacheString(const json::Value &Data, raw_ostream &OS) { - LLVM_DEBUG(dbgs() << "toMustacheString: kind=" << (int)Data.kind() << "\n"); + LLVM_DEBUG(dbgs() << "[To Mustache String] Kind: " + << jsonKindToString(Data.kind()) << ", Data: " << Data + << "\n"); switch (Data.kind()) { case json::Value::Null: return; @@ -667,7 +695,6 @@ static void toMustacheString(const json::Value &Data, raw_ostream &OS) { } case json::Value::String: { auto Str = *Data.getAsString(); - LLVM_DEBUG(dbgs() << " --> writing string: \"" << Str << "\"\n"); OS << Str.str(); return; } @@ -696,8 +723,8 @@ void ASTNode::renderText(MustacheOutputStream &OS) { OS << Body; } void ASTNode::renderPartial(const json::Value &CurrentCtx, MustacheOutputStream &OS) { - LLVM_DEBUG(dbgs() << "renderPartial: Accessor=" << AccessorValue[0] - << ", Indentation=" << Indentation << "\n"); + LLVM_DEBUG(dbgs() << "[Render Partial] Accessor:" << AccessorValue[0] + << ", Indentation:" << Indentation << "\n"); auto Partial = Ctx.Partials.find(AccessorValue[0]); if (Partial != Ctx.Partials.end()) renderPartial(CurrentCtx, OS, Partial->getValue().get()); @@ -716,13 +743,12 @@ void ASTNode::renderVariable(const json::Value &CurrentCtx, void ASTNode::renderUnescapeVariable(const json::Value &CurrentCtx, MustacheOutputStream &OS) { - LLVM_DEBUG(dbgs() << "renderUnescapeVariable: Accessor=" << AccessorValue[0] + LLVM_DEBUG(dbgs() << "[Render UnescapeVariable] Accessor:" << AccessorValue[0] << "\n"); auto Lambda = Ctx.Lambdas.find(AccessorValue[0]); if (Lambda != Ctx.Lambdas.end()) { renderLambdas(CurrentCtx, OS, Lambda->getValue()); } else if (const json::Value *ContextPtr = findContext()) { - LLVM_DEBUG(dbgs() << " --> Found context value, writing to stream.\n"); OS.suspendIndentation(); toMustacheString(*ContextPtr, OS); OS.resumeIndentation(); @@ -792,8 +818,6 @@ void ASTNode::render(const llvm::json::Value &Data, MustacheOutputStream &OS) { } const json::Value *ASTNode::findContext() { - LLVM_DEBUG(dbgs() << "findContext: AccessorValue[0]=" << AccessorValue[0] - << "\n"); // The mustache spec allows for dot notation to access nested values // a single dot refers to the current context. 
// We attempt to find the JSON context in the current node, if it is not @@ -808,22 +832,12 @@ const json::Value *ASTNode::findContext() { StringRef CurrentAccessor = AccessorValue[0]; ASTNode *CurrentParent = Parent; - LLVM_DEBUG(dbgs() << "findContext: ParentContext: "; - if (ParentContext) ParentContext->print(dbgs()); - else dbgs() << "nullptr"; dbgs() << "\n"); - while (!CurrentContext || !CurrentContext->get(CurrentAccessor)) { - LLVM_DEBUG(dbgs() << "findContext: climbing parent\n"); if (CurrentParent->Ty != Root) { CurrentContext = CurrentParent->ParentContext->getAsObject(); CurrentParent = CurrentParent->Parent; - LLVM_DEBUG(dbgs() << "findContext: new ParentContext: "; - if (CurrentParent->ParentContext) - CurrentParent->ParentContext->print(dbgs()); - else dbgs() << "nullptr"; dbgs() << "\n"); continue; } - LLVM_DEBUG(dbgs() << "findContext: reached root, not found\n"); return nullptr; } const json::Value *Context = nullptr; @@ -839,9 +853,6 @@ const json::Value *ASTNode::findContext() { Context = CurrentValue; } } - LLVM_DEBUG(dbgs() << "findContext: found value: "; - if (Context) Context->print(dbgs()); else dbgs() << "nullptr"; - dbgs() << "\n"); return Context; } @@ -853,8 +864,7 @@ void ASTNode::renderChild(const json::Value &Contexts, void ASTNode::renderPartial(const json::Value &Contexts, MustacheOutputStream &OS, ASTNode *Partial) { - LLVM_DEBUG(dbgs() << "renderPartial (helper): Indentation=" << Indentation - << "\n"); + LLVM_DEBUG(dbgs() << "[Render Partial Indentation] Indentation: " << Indentation << "\n"); AddIndentationStringStream IS(OS, Indentation); Partial->render(Contexts, IS); } diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index f291191..3f9a1f4 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -495,13 +495,6 @@ public: bool isVMEMOrFlatVMEM(const MachineInstr &MI) const; bool run(MachineFunction &MF); - bool isForceEmitWaitcnt() const { - for (auto T : inst_counter_types()) - if (ForceEmitWaitcnt[T]) - return true; - return false; - } - void setForceEmitWaitcnt() { // For non-debug builds, ForceEmitWaitcnt has been initialized to false; // For debug builds, get the debug counter info and adjust if need be @@ -570,10 +563,6 @@ public: return VmemReadMapping[getVmemType(Inst)]; } - bool hasXcnt() const { return ST->hasWaitXCnt(); } - - bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const; - bool mayAccessLDSThroughFlat(const MachineInstr &MI) const; bool isVmemAccess(const MachineInstr &MI) const; bool generateWaitcntInstBefore(MachineInstr &MI, WaitcntBrackets &ScoreBrackets, @@ -591,7 +580,6 @@ public: WaitcntBrackets &ScoreBrackets); bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets); - static bool asynchronouslyWritesSCC(unsigned Opcode); }; // This objects maintains the current score brackets of each wait counter, and @@ -1109,7 +1097,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, setRegScore(FIRST_LDS_VGPR, T, CurrScore); } - if (Context->asynchronouslyWritesSCC(Inst.getOpcode())) { + if (SIInstrInfo::isSBarrierSCCWrite(Inst.getOpcode())) { setRegScore(SCC, T, CurrScore); PendingSCCWrite = &Inst; } @@ -1831,12 +1819,6 @@ bool WaitcntGeneratorGFX12Plus::createNewWaitcnt( return Modified; } -static bool readsVCCZ(const MachineInstr &MI) { - unsigned Opc = MI.getOpcode(); - return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) && - 
!MI.getOperand(1).isUndef(); -} - /// \returns true if the callee inserts an s_waitcnt 0 on function entry. static bool callWaitsOnFunctionEntry(const MachineInstr &MI) { // Currently all conventions wait, but this may not always be the case. @@ -1871,26 +1853,24 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, assert(!MI.isMetaInstruction()); AMDGPU::Waitcnt Wait; + const unsigned Opc = MI.getOpcode(); // FIXME: This should have already been handled by the memory legalizer. // Removing this currently doesn't affect any lit tests, but we need to // verify that nothing was relying on this. The number of buffer invalidates // being handled here should not be expanded. - if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 || - MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC || - MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL || - MI.getOpcode() == AMDGPU::BUFFER_GL0_INV || - MI.getOpcode() == AMDGPU::BUFFER_GL1_INV) { + if (Opc == AMDGPU::BUFFER_WBINVL1 || Opc == AMDGPU::BUFFER_WBINVL1_SC || + Opc == AMDGPU::BUFFER_WBINVL1_VOL || Opc == AMDGPU::BUFFER_GL0_INV || + Opc == AMDGPU::BUFFER_GL1_INV) { Wait.LoadCnt = 0; } // All waits must be resolved at call return. // NOTE: this could be improved with knowledge of all call sites or // with knowledge of the called routines. - if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG || - MI.getOpcode() == AMDGPU::SI_RETURN || - MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN || - MI.getOpcode() == AMDGPU::S_SETPC_B64_return || + if (Opc == AMDGPU::SI_RETURN_TO_EPILOG || Opc == AMDGPU::SI_RETURN || + Opc == AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN || + Opc == AMDGPU::S_SETPC_B64_return || (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) { Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false)); } @@ -1902,8 +1882,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, // send a message to explicitly release all VGPRs before the stores have // completed, but it is only safe to do this if there are no outstanding // scratch stores. - else if (MI.getOpcode() == AMDGPU::S_ENDPGM || - MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) { + else if (Opc == AMDGPU::S_ENDPGM || Opc == AMDGPU::S_ENDPGM_SAVED) { if (!WCG->isOptNone() && (MI.getMF()->getInfo<SIMachineFunctionInfo>()->isDynamicVGPREnabled() || (ST->getGeneration() >= AMDGPUSubtarget::GFX11 && @@ -1912,8 +1891,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, ReleaseVGPRInsts.insert(&MI); } // Resolve vm waits before gs-done. - else if ((MI.getOpcode() == AMDGPU::S_SENDMSG || - MI.getOpcode() == AMDGPU::S_SENDMSGHALT) && + else if ((Opc == AMDGPU::S_SENDMSG || Opc == AMDGPU::S_SENDMSGHALT) && ST->hasLegacyGeometry() && ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) == AMDGPU::SendMsg::ID_GS_DONE_PreGFX11)) { @@ -1938,7 +1916,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, // Wait for any pending GDS instruction to complete before any // "Always GDS" instruction. - if (TII->isAlwaysGDS(MI.getOpcode()) && ScoreBrackets.hasPendingGDS()) + if (TII->isAlwaysGDS(Opc) && ScoreBrackets.hasPendingGDS()) addWait(Wait, DS_CNT, ScoreBrackets.getPendingGDSWait()); if (MI.isCall() && callWaitsOnFunctionEntry(MI)) { @@ -1964,7 +1942,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, Wait); } } - } else if (MI.getOpcode() == AMDGPU::S_BARRIER_WAIT) { + } else if (Opc == AMDGPU::S_BARRIER_WAIT) { ScoreBrackets.tryClearSCCWriteEvent(&MI); } else { // FIXME: Should not be relying on memoperands. 
@@ -2061,7 +2039,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, ScoreBrackets.determineWait(SmemAccessCounter, Interval, Wait); } - if (hasXcnt() && Op.isDef()) + if (ST->hasWaitXCnt() && Op.isDef()) ScoreBrackets.determineWait(X_CNT, Interval, Wait); } } @@ -2079,18 +2057,17 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, // // In all other cases, ensure safety by ensuring that there are no outstanding // memory operations. - if (MI.getOpcode() == AMDGPU::S_BARRIER && - !ST->hasAutoWaitcntBeforeBarrier() && !ST->supportsBackOffBarrier()) { + if (Opc == AMDGPU::S_BARRIER && !ST->hasAutoWaitcntBeforeBarrier() && + !ST->supportsBackOffBarrier()) { Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/true)); } // TODO: Remove this work-around, enable the assert for Bug 457939 // after fixing the scheduler. Also, the Shader Compiler code is // independent of target. - if (readsVCCZ(MI) && ST->hasReadVCCZBug()) { - if (ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) { - Wait.DsCnt = 0; - } + if (SIInstrInfo::isCBranchVCCZRead(MI) && ST->hasReadVCCZBug() && + ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) { + Wait.DsCnt = 0; } // Verify that the wait is actually needed. @@ -2165,19 +2142,19 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait, } // XCnt may be already consumed by a load wait. - if (Wait.KmCnt == 0 && Wait.XCnt != ~0u && - !ScoreBrackets.hasPendingEvent(SMEM_GROUP)) - Wait.XCnt = ~0u; + if (Wait.XCnt != ~0u) { + if (Wait.KmCnt == 0 && !ScoreBrackets.hasPendingEvent(SMEM_GROUP)) + Wait.XCnt = ~0u; - if (Wait.LoadCnt == 0 && Wait.XCnt != ~0u && - !ScoreBrackets.hasPendingEvent(VMEM_GROUP)) - Wait.XCnt = ~0u; + if (Wait.LoadCnt == 0 && !ScoreBrackets.hasPendingEvent(VMEM_GROUP)) + Wait.XCnt = ~0u; - // Since the translation for VMEM addresses occur in-order, we can skip the - // XCnt if the current instruction is of VMEM type and has a memory dependency - // with another VMEM instruction in flight. - if (Wait.XCnt != ~0u && isVmemAccess(*It)) - Wait.XCnt = ~0u; + // Since the translation for VMEM addresses occur in-order, we can skip the + // XCnt if the current instruction is of VMEM type and has a memory + // dependency with another VMEM instruction in flight. + if (isVmemAccess(*It)) + Wait.XCnt = ~0u; + } if (WCG->createNewWaitcnt(Block, It, Wait)) Modified = true; @@ -2185,75 +2162,11 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait, return Modified; } -// This is a flat memory operation. Check to see if it has memory tokens other -// than LDS. Other address spaces supported by flat memory operations involve -// global memory. -bool SIInsertWaitcnts::mayAccessVMEMThroughFlat(const MachineInstr &MI) const { - assert(TII->isFLAT(MI)); - - // All flat instructions use the VMEM counter except prefetch. - if (!TII->usesVM_CNT(MI)) - return false; - - // If there are no memory operands then conservatively assume the flat - // operation may access VMEM. - if (MI.memoperands_empty()) - return true; - - // See if any memory operand specifies an address space that involves VMEM. - // Flat operations only supported FLAT, LOCAL (LDS), or address spaces - // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION - // (GDS) address space is not supported by flat operations. Therefore, simply - // return true unless only the LDS address space is found. 
- for (const MachineMemOperand *Memop : MI.memoperands()) { - unsigned AS = Memop->getAddrSpace(); - assert(AS != AMDGPUAS::REGION_ADDRESS); - if (AS != AMDGPUAS::LOCAL_ADDRESS) - return true; - } - - return false; -} - -// This is a flat memory operation. Check to see if it has memory tokens for -// either LDS or FLAT. -bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const { - assert(TII->isFLAT(MI)); - - // Flat instruction such as SCRATCH and GLOBAL do not use the lgkm counter. - if (!TII->usesLGKM_CNT(MI)) - return false; - - // If in tgsplit mode then there can be no use of LDS. - if (ST->isTgSplitEnabled()) - return false; - - // If there are no memory operands then conservatively assume the flat - // operation may access LDS. - if (MI.memoperands_empty()) - return true; - - // See if any memory operand specifies an address space that involves LDS. - for (const MachineMemOperand *Memop : MI.memoperands()) { - unsigned AS = Memop->getAddrSpace(); - if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) - return true; - } - - return false; -} - bool SIInsertWaitcnts::isVmemAccess(const MachineInstr &MI) const { - return (TII->isFLAT(MI) && mayAccessVMEMThroughFlat(MI)) || + return (TII->isFLAT(MI) && TII->mayAccessVMEMThroughFlat(MI)) || (TII->isVMEM(MI) && !AMDGPU::getMUBUFIsBufferInv(MI.getOpcode())); } -static bool isGFX12CacheInvOrWBInst(MachineInstr &Inst) { - auto Opc = Inst.getOpcode(); - return Opc == AMDGPU::GLOBAL_INV || Opc == AMDGPU::GLOBAL_WB || - Opc == AMDGPU::GLOBAL_WBINV; -} - // Return true if the next instruction is S_ENDPGM, following fallthrough // blocks if necessary. bool SIInsertWaitcnts::isNextENDPGM(MachineBasicBlock::instr_iterator It, @@ -2331,7 +2244,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst); } } else if (TII->isFLAT(Inst)) { - if (isGFX12CacheInvOrWBInst(Inst)) { + if (SIInstrInfo::isGFX12CacheInvOrWBInst(Inst.getOpcode())) { ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst), Inst); return; @@ -2341,14 +2254,14 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, int FlatASCount = 0; - if (mayAccessVMEMThroughFlat(Inst)) { + if (TII->mayAccessVMEMThroughFlat(Inst)) { ++FlatASCount; IsVMEMAccess = true; ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst), Inst); } - if (mayAccessLDSThroughFlat(Inst)) { + if (TII->mayAccessLDSThroughFlat(Inst)) { ++FlatASCount; ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst); } @@ -2394,7 +2307,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst); else ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst); - } else if (asynchronouslyWritesSCC(Inst.getOpcode())) { + } else if (SIInstrInfo::isSBarrierSCCWrite(Inst.getOpcode())) { ScoreBrackets->updateByEvent(TII, TRI, MRI, SCC_WRITE, Inst); } else { switch (Inst.getOpcode()) { @@ -2413,7 +2326,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, } } - if (!hasXcnt()) + if (!ST->hasWaitXCnt()) return; if (IsVMEMAccess) @@ -2478,9 +2391,8 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) { unsigned OldEventsHasSCCWrite = OldEvents & (1 << SCC_WRITE); if (!OldEventsHasSCCWrite) { PendingSCCWrite = Other.PendingSCCWrite; - } else { - if (PendingSCCWrite != Other.PendingSCCWrite) - PendingSCCWrite = nullptr; + } else if (PendingSCCWrite != Other.PendingSCCWrite) { 
+ PendingSCCWrite = nullptr; } } } @@ -2516,12 +2428,6 @@ static bool isWaitInstr(MachineInstr &Inst) { counterTypeForInstr(Opcode).has_value(); } -bool SIInsertWaitcnts::asynchronouslyWritesSCC(unsigned Opcode) { - return Opcode == AMDGPU::S_BARRIER_LEAVE || - Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM || - Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0; -} - // Generate s_waitcnt instructions where needed. bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block, @@ -2578,7 +2484,7 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, OldWaitcntInstr = nullptr; // Restore vccz if it's not known to be correct already. - bool RestoreVCCZ = !VCCZCorrect && readsVCCZ(Inst); + bool RestoreVCCZ = !VCCZCorrect && SIInstrInfo::isCBranchVCCZRead(Inst); // Don't examine operands unless we need to track vccz correctness. if (ST->hasReadVCCZBug() || !ST->partialVCCWritesUpdateVCCZ()) { @@ -2701,7 +2607,7 @@ bool SIInsertWaitcnts::isPreheaderToFlush( bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const { if (SIInstrInfo::isFLAT(MI)) - return mayAccessVMEMThroughFlat(MI); + return TII->mayAccessVMEMThroughFlat(MI); return SIInstrInfo::isVMEM(MI); } @@ -2724,11 +2630,10 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML, for (MachineBasicBlock *MBB : ML->blocks()) { for (MachineInstr &MI : *MBB) { if (isVMEMOrFlatVMEM(MI)) { - if (MI.mayLoad()) - HasVMemLoad = true; - if (MI.mayStore()) - HasVMemStore = true; + HasVMemLoad |= MI.mayLoad(); + HasVMemStore |= MI.mayStore(); } + for (const MachineOperand &Op : MI.all_uses()) { if (Op.isDebug() || !TRI->isVectorRegister(*MRI, Op.getReg())) continue; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 044ea86..56435a5 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -4344,6 +4344,59 @@ bool SIInstrInfo::mayAccessScratchThroughFlat(const MachineInstr &MI) const { }); } +bool SIInstrInfo::mayAccessVMEMThroughFlat(const MachineInstr &MI) const { + assert(isFLAT(MI)); + + // All flat instructions use the VMEM counter except prefetch. + if (!usesVM_CNT(MI)) + return false; + + // If there are no memory operands then conservatively assume the flat + // operation may access VMEM. + if (MI.memoperands_empty()) + return true; + + // See if any memory operand specifies an address space that involves VMEM. + // Flat operations only supported FLAT, LOCAL (LDS), or address spaces + // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION + // (GDS) address space is not supported by flat operations. Therefore, simply + // return true unless only the LDS address space is found. + for (const MachineMemOperand *Memop : MI.memoperands()) { + unsigned AS = Memop->getAddrSpace(); + assert(AS != AMDGPUAS::REGION_ADDRESS); + if (AS != AMDGPUAS::LOCAL_ADDRESS) + return true; + } + + return false; +} + +bool SIInstrInfo::mayAccessLDSThroughFlat(const MachineInstr &MI) const { + assert(isFLAT(MI)); + + // Flat instruction such as SCRATCH and GLOBAL do not use the lgkm counter. + if (!usesLGKM_CNT(MI)) + return false; + + // If in tgsplit mode then there can be no use of LDS. + if (ST.isTgSplitEnabled()) + return false; + + // If there are no memory operands then conservatively assume the flat + // operation may access LDS. + if (MI.memoperands_empty()) + return true; + + // See if any memory operand specifies an address space that involves LDS. 
+  for (const MachineMemOperand *Memop : MI.memoperands()) { +    unsigned AS = Memop->getAddrSpace(); +    if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) +      return true; +  } + +  return false; +} + bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) { // Skip the full operand and register alias search modifiesRegister // does. There's only a handful of instructions that touch this, it's only an diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index c2252af..754f52a 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -688,6 +688,12 @@ public: /// to not hit scratch. bool mayAccessScratchThroughFlat(const MachineInstr &MI) const; + /// \returns true for FLAT instructions that can access VMEM. + bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const; + + /// \returns true for FLAT instructions that can access LDS. + bool mayAccessLDSThroughFlat(const MachineInstr &MI) const; + static bool isBlockLoadStore(uint16_t Opcode) { switch (Opcode) { case AMDGPU::SI_BLOCK_SPILL_V1024_SAVE: @@ -748,6 +754,18 @@ public: return isLDSDMA(MI) && MI.getOpcode() != AMDGPU::BUFFER_STORE_LDS_DWORD; } + static bool isSBarrierSCCWrite(unsigned Opcode) { + return Opcode == AMDGPU::S_BARRIER_LEAVE || + Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM || + Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0; + } + + static bool isCBranchVCCZRead(const MachineInstr &MI) { + unsigned Opc = MI.getOpcode(); + return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) && + !MI.getOperand(1).isUndef(); + } + static bool isWQM(const MachineInstr &MI) { return MI.getDesc().TSFlags & SIInstrFlags::WQM; } @@ -1010,6 +1028,11 @@ public: Opcode == AMDGPU::DS_GWS_BARRIER; } + static bool isGFX12CacheInvOrWBInst(unsigned Opc) { + return Opc == AMDGPU::GLOBAL_INV || Opc == AMDGPU::GLOBAL_WB || + Opc == AMDGPU::GLOBAL_WBINV; + } + static bool isF16PseudoScalarTrans(unsigned Opcode) { return Opcode == AMDGPU::V_S_EXP_F16_e64 || Opcode == AMDGPU::V_S_LOG_F16_e64 || diff --git a/llvm/lib/Target/ARM/ARMSubtarget.cpp b/llvm/lib/Target/ARM/ARMSubtarget.cpp index 3329bea..58bc338 100644 --- a/llvm/lib/Target/ARM/ARMSubtarget.cpp +++ b/llvm/lib/Target/ARM/ARMSubtarget.cpp @@ -225,7 +225,11 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { (isTargetDarwin() || DM == DenormalMode::getPreserveSign())) HasNEONForFP = true; - if (isRWPI()) + const ARM::ArchKind Arch = ARM::parseArch(TargetTriple.getArchName()); + if (isRWPI() || + (isTargetIOS() && + (Arch == ARM::ArchKind::ARMV6K || Arch == ARM::ArchKind::ARMV6) && + TargetTriple.isOSVersionLT(3, 0))) ReserveR9 = true; // If MVEVectorCostFactor is still 0 (has not been set to anything else), default it to 2 diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index 70b6c7e..1e6b04f8 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -3793,6 +3793,11 @@ bool RISCVInstrInfo::findCommutedOpIndices(const MachineInstr &MI, return false; // Operands 1 and 2 are commutable, if we switch the opcode. 
return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1, 2); + case RISCV::QC_SELECTIEQ: + case RISCV::QC_SELECTINE: + case RISCV::QC_SELECTIIEQ: + case RISCV::QC_SELECTIINE: + return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1, 2); case RISCV::QC_MVEQ: case RISCV::QC_MVNE: case RISCV::QC_MVLT: @@ -4018,6 +4023,11 @@ MachineInstr *RISCVInstrInfo::commuteInstructionImpl(MachineInstr &MI, return TargetInstrInfo::commuteInstructionImpl(WorkingMI, false, OpIdx1, OpIdx2); } + case RISCV::QC_SELECTIEQ: + case RISCV::QC_SELECTINE: + case RISCV::QC_SELECTIIEQ: + case RISCV::QC_SELECTIINE: + return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); case RISCV::QC_MVEQ: case RISCV::QC_MVNE: case RISCV::QC_MVLT: diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td index 4eb9a3be..d998316 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td @@ -345,7 +345,7 @@ defset list<VTypeInfo> AllVectors = { } } - defset list<VTypeInfo> AllFloatAndBFloatVectors = { + defset list<VTypeInfo> AllFloatAndBF16Vectors = { defset list<VTypeInfo> AllFloatVectors = { defset list<VTypeInfo> NoGroupFloatVectors = { defset list<VTypeInfo> FractionalGroupFloatVectors = { @@ -382,16 +382,16 @@ defset list<VTypeInfo> AllVectors = { } } - defset list<VTypeInfo> AllBFloatVectors = { - defset list<VTypeInfo> NoGroupBFloatVectors = { - defset list<VTypeInfo> FractionalGroupBFloatVectors = { + defset list<VTypeInfo> AllBF16Vectors = { + defset list<VTypeInfo> NoGroupBF16Vectors = { + defset list<VTypeInfo> FractionalGroupBF16Vectors = { def VBF16MF4: VTypeInfo<vbfloat16mf4_t, vbool64_t, 16, V_MF4, bf16, FPR16>; def VBF16MF2: VTypeInfo<vbfloat16mf2_t, vbool32_t, 16, V_MF2, bf16, FPR16>; } def VBF16M1: VTypeInfo<vbfloat16m1_t, vbool16_t, 16, V_M1, bf16, FPR16>; } - defset list<GroupVTypeInfo> GroupBFloatVectors = { + defset list<GroupVTypeInfo> GroupBF16Vectors = { def VBF16M2: GroupVTypeInfo<vbfloat16m2_t, vbfloat16m1_t, vbool8_t, 16, V_M2, bf16, FPR16>; def VBF16M4: GroupVTypeInfo<vbfloat16m4_t, vbfloat16m1_t, vbool4_t, 16, @@ -542,7 +542,7 @@ defset list<VTypeInfoToWide> AllWidenableIntToFloatVectors = { def : VTypeInfoToWide<VI32M4, VF64M8>; } -defset list<VTypeInfoToWide> AllWidenableBFloatToFloatVectors = { +defset list<VTypeInfoToWide> AllWidenableBF16ToFloatVectors = { def : VTypeInfoToWide<VBF16MF4, VF32MF2>; def : VTypeInfoToWide<VBF16MF2, VF32M1>; def : VTypeInfoToWide<VBF16M1, VF32M2>; @@ -5870,7 +5870,7 @@ multiclass VPatConversionWF_VF<string intrinsic, string instruction, multiclass VPatConversionWF_VF_BF<string intrinsic, string instruction, bit isSEWAware = 0> { - foreach fvtiToFWti = AllWidenableBFloatToFloatVectors in + foreach fvtiToFWti = AllWidenableBF16ToFloatVectors in { defvar fvti = fvtiToFWti.Vti; defvar fwti = fvtiToFWti.Wti; @@ -5977,7 +5977,7 @@ multiclass VPatConversionVF_WF_RTZ<string intrinsic, string instruction, multiclass VPatConversionVF_WF_BF_RM<string intrinsic, string instruction, bit isSEWAware = 0> { - foreach fvtiToFWti = AllWidenableBFloatToFloatVectors in { + foreach fvtiToFWti = AllWidenableBF16ToFloatVectors in { defvar fvti = fvtiToFWti.Vti; defvar fwti = fvtiToFWti.Wti; let Predicates = !listconcat(GetVTypePredicates<fvti>.Predicates, @@ -7154,7 +7154,7 @@ defm : VPatConversionVI_VF<"int_riscv_vfclass", "PseudoVFCLASS">; // We can use vmerge.vvm to support vector-vector vfmerge. 
// NOTE: Clang previously used int_riscv_vfmerge for vector-vector, but now uses // int_riscv_vmerge. Support both for compatibility. -foreach vti = AllFloatAndBFloatVectors in { +foreach vti = AllFloatAndBF16Vectors in { let Predicates = GetVTypeMinimalPredicates<vti>.Predicates in defm : VPatBinaryCarryInTAIL<"int_riscv_vmerge", "PseudoVMERGE", "VVM", vti.Vector, diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td index dc61361..139ff92 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td @@ -1388,7 +1388,7 @@ defm : VPatFPSetCCSDNode_VV_VF_FV<SETOLE, "PseudoVMFLE", "PseudoVMFGE">; // Floating-point vselects: // 11.15. Vector Integer Merge Instructions // 13.15. Vector Floating-Point Merge Instruction -foreach fvti = AllFloatAndBFloatVectors in { +foreach fvti = AllFloatAndBF16Vectors in { defvar ivti = GetIntVTypeInfo<fvti>.Vti; let Predicates = GetVTypePredicates<ivti>.Predicates in { def : Pat<(fvti.Vector (vselect (fvti.Mask VMV0:$vm), fvti.RegClass:$rs1, diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td index 1511f1b..cf904ea 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td @@ -2426,7 +2426,7 @@ foreach vti = AllFloatVectors in { // Floating-point vselects: // 11.15. Vector Integer Merge Instructions // 13.15. Vector Floating-Point Merge Instruction -foreach fvti = AllFloatAndBFloatVectors in { +foreach fvti = AllFloatAndBF16Vectors in { defvar ivti = GetIntVTypeInfo<fvti>.Vti; let Predicates = GetVTypePredicates<ivti>.Predicates in { def : Pat<(fvti.Vector (riscv_vmerge_vl (fvti.Mask VMV0:$vm), @@ -2770,7 +2770,7 @@ foreach vti = NoGroupFloatVectors in { } } -foreach vti = AllFloatAndBFloatVectors in { +foreach vti = AllFloatAndBF16Vectors in { defvar ivti = GetIntVTypeInfo<vti>.Vti; let Predicates = GetVTypePredicates<ivti>.Predicates in { def : Pat<(vti.Vector diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td index 9835c03..b683e89 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td @@ -560,7 +560,7 @@ multiclass VPseudoVNCVT_BF16_S { } multiclass VPatConversionS_BF16<string intrinsic, string instruction> { - foreach fvtiToFWti = AllWidenableBFloatToFloatVectors in { + foreach fvtiToFWti = AllWidenableBF16ToFloatVectors in { defvar fvti = fvtiToFWti.Vti; defvar fwti = fvtiToFWti.Wti; let Predicates = [HasVendorXAndesVBFHCvt] in @@ -572,7 +572,7 @@ multiclass VPatConversionS_BF16<string intrinsic, string instruction> { } multiclass VPatConversionBF16_S<string intrinsic, string instruction> { - foreach fvtiToFWti = AllWidenableBFloatToFloatVectors in { + foreach fvtiToFWti = AllWidenableBF16ToFloatVectors in { defvar fvti = fvtiToFWti.Vti; defvar fwti = fvtiToFWti.Wti; let Predicates = [HasVendorXAndesVBFHCvt] in diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td index b546339..557d873 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td @@ -770,7 +770,7 @@ multiclass VPatVQMACCQOQ<string intrinsic, string instruction, string kind> : VPatVMACC<intrinsic, instruction, kind, VQMACCQOQInfoPairs, vint8m1_t>; multiclass VPatVFWMACC<string intrinsic, string instruction, string kind> - : VPatVMACC<intrinsic, 
instruction, kind, AllWidenableBFloatToFloatVectors, + : VPatVMACC<intrinsic, instruction, kind, AllWidenableBF16ToFloatVectors, vbfloat16m1_t>; defset list<VTypeInfoToWide> VFNRCLIPInfoPairs = { diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td index ff4a040..5407868 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td @@ -524,7 +524,7 @@ class QCIRVInstRI<bits<1> funct1, DAGOperand InTyImm11, let Inst{30-20} = imm11; } -let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in +let hasSideEffects = 0, mayLoad = 0, mayStore = 0, isCommutable = 1 in class QCISELECTIICC<bits<3> funct3, string opcodestr> : RVInstR4<0b00, funct3, OPC_CUSTOM_2, (outs GPRNoX0:$rd_wb), (ins GPRNoX0:$rd, GPRNoX0:$rs1, simm5:$simm1, simm5:$simm2), @@ -537,7 +537,7 @@ class QCISELECTIICC<bits<3> funct3, string opcodestr> let rs2 = simm1; } -let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in +let hasSideEffects = 0, mayLoad = 0, mayStore = 0, isCommutable = 1 in class QCISELECTICC<bits<3> funct3, string opcodestr> : RVInstR4<0b01, funct3, OPC_CUSTOM_2, (outs GPRNoX0:$rd_wb), (ins GPRNoX0:$rd, GPRNoX0:$rs1, GPRNoX0:$rs2, simm5:$simm2), diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td index 6d8672b..0be9eab 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td @@ -53,7 +53,7 @@ let Predicates = [HasStdExtZvfbfminOrZvfofp8min] in { defm : VPatConversionVF_WF_BF_RM<"int_riscv_vfncvtbf16_f_f_w", "PseudoVFNCVTBF16_F_F", isSEWAware=1>; - foreach fvtiToFWti = AllWidenableBFloatToFloatVectors in { + foreach fvtiToFWti = AllWidenableBF16ToFloatVectors in { defvar fvti = fvtiToFWti.Vti; defvar fwti = fvtiToFWti.Wti; let Predicates = [HasVInstructionsBF16Minimal] in @@ -91,9 +91,9 @@ let Predicates = [HasStdExtZvfbfminOrZvfofp8min] in { let Predicates = [HasStdExtZvfbfwma] in { defm : VPatTernaryW_VV_VX_RM<"int_riscv_vfwmaccbf16", "PseudoVFWMACCBF16", - AllWidenableBFloatToFloatVectors, isSEWAware=1>; + AllWidenableBF16ToFloatVectors, isSEWAware=1>; defm : VPatWidenFPMulAccVL_VV_VF_RM<riscv_vfwmadd_vl, "PseudoVFWMACCBF16", - AllWidenableBFloatToFloatVectors>; + AllWidenableBF16ToFloatVectors>; defm : VPatWidenFPMulAccSDNode_VV_VF_RM<"PseudoVFWMACCBF16", - AllWidenableBFloatToFloatVectors>; + AllWidenableBF16ToFloatVectors>; } diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td index 95f8a87..17a7948 100644 --- a/llvm/lib/Target/RISCV/RISCVProcessors.td +++ b/llvm/lib/Target/RISCV/RISCVProcessors.td @@ -347,16 +347,58 @@ defvar SiFiveP400TuneFeatures = [TuneNoDefaultUnroll, TunePostRAScheduler]; def SIFIVE_P450 : RISCVProcessorModel<"sifive-p450", SiFiveP400Model, - !listconcat(RVA22U64Features, - [FeatureStdExtZifencei, + [Feature64Bit, + FeatureStdExtI, + FeatureStdExtM, + FeatureStdExtA, + FeatureStdExtF, + FeatureStdExtD, + FeatureStdExtC, + FeatureStdExtZicsr, + FeatureStdExtZiccif, + FeatureStdExtZiccrse, + FeatureStdExtZiccamoa, + FeatureStdExtZicclsm, + FeatureStdExtZa64rs, + FeatureStdExtZihpm, + FeatureStdExtZihintpause, + FeatureStdExtB, + FeatureStdExtZic64b, + FeatureStdExtZicbom, + FeatureStdExtZicbop, + FeatureStdExtZicboz, + FeatureStdExtZfhmin, + FeatureStdExtZkt, + FeatureStdExtZifencei, FeatureStdExtZihintntl, FeatureUnalignedScalarMem, - FeatureUnalignedVectorMem]), + FeatureUnalignedVectorMem], SiFiveP400TuneFeatures>; def SIFIVE_P470 : 
RISCVProcessorModel<"sifive-p470", SiFiveP400Model, - !listconcat(RVA22U64Features, - [FeatureStdExtV, + [Feature64Bit, + FeatureStdExtI, + FeatureStdExtM, + FeatureStdExtA, + FeatureStdExtF, + FeatureStdExtD, + FeatureStdExtC, + FeatureStdExtZicsr, + FeatureStdExtZiccif, + FeatureStdExtZiccrse, + FeatureStdExtZiccamoa, + FeatureStdExtZicclsm, + FeatureStdExtZa64rs, + FeatureStdExtZihpm, + FeatureStdExtZihintpause, + FeatureStdExtB, + FeatureStdExtZic64b, + FeatureStdExtZicbom, + FeatureStdExtZicbop, + FeatureStdExtZicboz, + FeatureStdExtZfhmin, + FeatureStdExtZkt, + FeatureStdExtV, FeatureStdExtZifencei, FeatureStdExtZihintntl, FeatureStdExtZvl128b, @@ -368,7 +410,7 @@ def SIFIVE_P470 : RISCVProcessorModel<"sifive-p470", SiFiveP400Model, FeatureVendorXSiFivecdiscarddlone, FeatureVendorXSiFivecflushdlone, FeatureUnalignedScalarMem, - FeatureUnalignedVectorMem]), + FeatureUnalignedVectorMem], !listconcat(SiFiveP400TuneFeatures, [TuneNoSinkSplatOperands, TuneVXRMPipelineFlush])>; @@ -397,8 +439,29 @@ def SIFIVE_P550 : RISCVProcessorModel<"sifive-p550", SiFiveP500Model, } def SIFIVE_P670 : RISCVProcessorModel<"sifive-p670", SiFiveP600Model, - !listconcat(RVA22U64Features, - [FeatureStdExtV, + [Feature64Bit, + FeatureStdExtI, + FeatureStdExtM, + FeatureStdExtA, + FeatureStdExtF, + FeatureStdExtD, + FeatureStdExtC, + FeatureStdExtZicsr, + FeatureStdExtZiccif, + FeatureStdExtZiccrse, + FeatureStdExtZiccamoa, + FeatureStdExtZicclsm, + FeatureStdExtZa64rs, + FeatureStdExtZihpm, + FeatureStdExtZihintpause, + FeatureStdExtB, + FeatureStdExtZic64b, + FeatureStdExtZicbom, + FeatureStdExtZicbop, + FeatureStdExtZicboz, + FeatureStdExtZfhmin, + FeatureStdExtZkt, + FeatureStdExtV, FeatureStdExtZifencei, FeatureStdExtZihintntl, FeatureStdExtZvl128b, @@ -408,7 +471,7 @@ def SIFIVE_P670 : RISCVProcessorModel<"sifive-p670", SiFiveP600Model, FeatureStdExtZvksc, FeatureStdExtZvksg, FeatureUnalignedScalarMem, - FeatureUnalignedVectorMem]), + FeatureUnalignedVectorMem], [TuneNoDefaultUnroll, TuneConditionalCompressedMoveFusion, TuneLUIADDIFusion, diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp index f88d51f..99c4982 100644 --- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -1680,7 +1680,9 @@ processGlobal(GlobalValue &GV, /// FastCC. 
static void ChangeCalleesToFastCall(Function *F) { for (User *U : F->users()) - cast<CallBase>(U)->setCallingConv(CallingConv::Fast); + if (auto *Call = dyn_cast<CallBase>(U)) + if (Call->getCalledOperand() == F) + Call->setCallingConv(CallingConv::Fast); } static AttributeList StripAttr(LLVMContext &C, AttributeList Attrs, @@ -1766,10 +1768,12 @@ isValidCandidateForColdCC(Function &F, return false; for (User *U : F.users()) { - CallBase &CB = cast<CallBase>(*U); - Function *CallerFunc = CB.getParent()->getParent(); + CallBase *CB = dyn_cast<CallBase>(U); + if (!CB || CB->getCalledOperand() != &F) + continue; + Function *CallerFunc = CB->getParent()->getParent(); BlockFrequencyInfo &CallerBFI = GetBFI(*CallerFunc); - if (!isColdCallSite(CB, CallerBFI)) + if (!isColdCallSite(*CB, CallerBFI)) return false; if (!llvm::is_contained(AllCallsCold, CallerFunc)) return false; @@ -1779,7 +1783,9 @@ isValidCandidateForColdCC(Function &F, static void changeCallSitesToColdCC(Function *F) { for (User *U : F->users()) - cast<CallBase>(U)->setCallingConv(CallingConv::Cold); + if (auto *Call = dyn_cast<CallBase>(U)) + if (Call->getCalledOperand() == F) + Call->setCallingConv(CallingConv::Cold); } // This function iterates over all the call instructions in the input Function diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 8fbaf68..ff063f9 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -5169,6 +5169,7 @@ Instruction *InstCombinerImpl::visitFreeze(FreezeInst &I) { // - or: pick -1 // - select's condition: if the true value is constant, choose it by making // the condition true. + // - phi: pick the common constant across operands // - default: pick 0 // // Note that this transform is intentionally done here rather than @@ -5179,9 +5180,32 @@ Instruction *InstCombinerImpl::visitFreeze(FreezeInst &I) { // TODO: This could use getBinopAbsorber() / getBinopIdentity() to avoid // duplicating logic for binops at least. auto getUndefReplacement = [&](Type *Ty) { - Value *BestValue = nullptr; + auto pickCommonConstantFromPHI = [](PHINode &PN) -> Value * { + // phi(freeze(undef), C, C). Choose C for freeze so the PHI can be + // removed. 
+ Constant *BestValue = nullptr; + for (Value *V : PN.incoming_values()) { + if (match(V, m_Freeze(m_Undef()))) + continue; + + Constant *C = dyn_cast<Constant>(V); + if (!C) + return nullptr; + + if (!isGuaranteedNotToBeUndefOrPoison(C)) + return nullptr; + + if (BestValue && BestValue != C) + return nullptr; + + BestValue = C; + } + return BestValue; + }; + Value *NullValue = Constant::getNullValue(Ty); - for (const auto *U : I.users()) { + Value *BestValue = nullptr; + for (auto *U : I.users()) { Value *V = NullValue; if (match(U, m_Or(m_Value(), m_Value()))) V = ConstantInt::getAllOnesValue(Ty); @@ -5190,6 +5214,9 @@ Instruction *InstCombinerImpl::visitFreeze(FreezeInst &I) { else if (match(U, m_c_Select(m_Specific(&I), m_Value(V)))) { if (!isGuaranteedNotToBeUndefOrPoison(V, &AC, &I, &DT)) V = NullValue; + } else if (auto *PHI = dyn_cast<PHINode>(U)) { + if (Value *MaybeV = pickCommonConstantFromPHI(*PHI)) + V = MaybeV; } if (!BestValue) diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index 123881e..21b2652 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -3025,6 +3025,12 @@ static void combineMetadata(Instruction *K, const Instruction *J, // Preserve !nosanitize if both K and J have it. K->setMetadata(Kind, JMD); break; + case LLVMContext::MD_captures: + K->setMetadata( + Kind, MDNode::fromCaptureComponents( + K->getContext(), MDNode::toCaptureComponents(JMD) | + MDNode::toCaptureComponents(KMD))); + break; } } // Set !invariant.group from J if J has it. If both instructions have it |
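A closing note on the Local.cpp hunk: when two stores are combined, the new MD_captures case keeps the union of both capture sets, which is the conservative choice. Below is a rough standalone sketch of that merge step; the function name is hypothetical, and the real logic lives inside combineMetadata() as shown above.

#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/Support/ModRef.h"

using namespace llvm;

// Hypothetical helper: merge the !captures nodes of two stores being combined.
static MDNode *mergeCapturesMetadata(LLVMContext &Ctx, MDNode *JMD,
                                     MDNode *KMD) {
  // A missing node decodes to CaptureComponents::All, so the union is always
  // a safe over-approximation for the surviving store.
  CaptureComponents Merged =
      MDNode::toCaptureComponents(JMD) | MDNode::toCaptureComponents(KMD);
  // fromCaptureComponents() returns nullptr for All, which simply drops the
  // metadata when nothing better can be said.
  return MDNode::fromCaptureComponents(Ctx, Merged);
}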