281 files changed, 12114 insertions, 5686 deletions
diff --git a/bolt/include/bolt/Core/MCInstUtils.h b/bolt/include/bolt/Core/MCInstUtils.h new file mode 100644 index 0000000..eb56629 --- /dev/null +++ b/bolt/include/bolt/Core/MCInstUtils.h @@ -0,0 +1,181 @@ +//===- bolt/Core/MCInstUtils.h ----------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef BOLT_CORE_MCINSTUTILS_H +#define BOLT_CORE_MCINSTUTILS_H + +#include "bolt/Core/BinaryBasicBlock.h" +#include <map> +#include <variant> + +namespace llvm { +class MCCodeEmitter; +} + +namespace llvm { +namespace bolt { + +class BinaryFunction; + +/// MCInstReference represents a reference to a constant MCInst as stored either +/// in a BinaryFunction (i.e. before a CFG is created), or in a BinaryBasicBlock +/// (after a CFG is created). +/// +/// The reference may be invalidated when the function containing the referenced +/// instruction is modified. +class MCInstReference { +public: + using nocfg_const_iterator = std::map<uint32_t, MCInst>::const_iterator; + + /// Constructs an empty reference. + MCInstReference() : Reference(RefInBB(nullptr, /*Index=*/0)) {} + + /// Constructs a reference to the instruction inside the basic block. + MCInstReference(const BinaryBasicBlock &BB, const MCInst &Inst) + : Reference(RefInBB(&BB, getInstIndexInBB(BB, Inst))) {} + /// Constructs a reference to the instruction inside the basic block. + MCInstReference(const BinaryBasicBlock &BB, unsigned Index) + : Reference(RefInBB(&BB, Index)) {} + + /// Constructs a reference to the instruction inside the function without + /// CFG information. + MCInstReference(const BinaryFunction &BF, nocfg_const_iterator It) + : Reference(RefInBF(&BF, It)) {} + + /// Locates an instruction inside a function and returns a reference. + static MCInstReference get(const MCInst &Inst, const BinaryFunction &BF); + + bool operator==(const MCInstReference &Other) const { + return Reference == Other.Reference; + } + + const MCInst &getMCInst() const { + assert(!empty() && "Empty reference"); + if (auto *Ref = tryGetRefInBB()) { + [[maybe_unused]] unsigned NumInstructions = Ref->BB->size(); + assert(Ref->Index < NumInstructions && "Invalid reference"); + return Ref->BB->getInstructionAtIndex(Ref->Index); + } + return getRefInBF().It->second; + } + + operator const MCInst &() const { return getMCInst(); } + + bool empty() const { + if (auto *Ref = tryGetRefInBB()) + return Ref->BB == nullptr; + return getRefInBF().BF == nullptr; + } + + bool hasCFG() const { return !empty() && tryGetRefInBB() != nullptr; } + + const BinaryFunction *getFunction() const { + assert(!empty() && "Empty reference"); + if (auto *Ref = tryGetRefInBB()) + return Ref->BB->getFunction(); + return getRefInBF().BF; + } + + const BinaryBasicBlock *getBasicBlock() const { + assert(!empty() && "Empty reference"); + if (auto *Ref = tryGetRefInBB()) + return Ref->BB; + return nullptr; + } + + /// Computes the original address of the instruction (or offset from base + /// for PIC), assuming the containing function was not modified. + /// + /// This function is intended for the use cases like debug printing, as it + /// is only as precise as BinaryContext::computeCodeSize() is and requires + /// iterating over the prefix of the basic block (when CFG is available). 
+ /// + /// MCCodeEmitter is not thread safe and the default instance from + /// BinaryContext is used by default, thus pass an instance explicitly if + /// this function may be called from multithreaded code. + uint64_t computeAddress(const MCCodeEmitter *Emitter = nullptr) const; + + raw_ostream &print(raw_ostream &OS) const; + +private: + static unsigned getInstIndexInBB(const BinaryBasicBlock &BB, + const MCInst &Inst) { + // Usage of pointer arithmetic assumes the instructions are stored in a + // vector, see BasicBlockStorageIsVector in MCInstUtils.cpp. + const MCInst *FirstInstInBB = &*BB.begin(); + return &Inst - FirstInstInBB; + } + + // Two cases are possible: + // * functions with CFG reconstructed - a function stores a collection of + // basic blocks, each basic block stores a contiguous vector of MCInst + // * functions without CFG - there are no basic blocks created, + // the instructions are directly stored in std::map in BinaryFunction + // + // In both cases, the direct parent of MCInst is stored together with an + // index or iterator pointing to the instruction. + + // Helper struct: CFG is available, the direct parent is a basic block. + struct RefInBB { + RefInBB(const BinaryBasicBlock *BB, unsigned Index) + : BB(BB), Index(Index) {} + RefInBB(const RefInBB &Other) = default; + RefInBB &operator=(const RefInBB &Other) = default; + + const BinaryBasicBlock *BB; + unsigned Index; + + bool operator==(const RefInBB &Other) const { + return BB == Other.BB && Index == Other.Index; + } + }; + + // Helper struct: CFG is *not* available, the direct parent is a function, + // iterator's type is std::map<uint32_t, MCInst>::iterator (the mapped value + // is an instruction's offset). + struct RefInBF { + RefInBF(const BinaryFunction *BF, nocfg_const_iterator It) + : BF(BF), It(It) {} + RefInBF(const RefInBF &Other) = default; + RefInBF &operator=(const RefInBF &Other) = default; + + const BinaryFunction *BF; + nocfg_const_iterator It; + + bool operator==(const RefInBF &Other) const { + return BF == Other.BF && It->first == Other.It->first; + } + }; + + std::variant<RefInBB, RefInBF> Reference; + + // Utility methods to be used like this: + // + // if (auto *Ref = tryGetRefInBB()) + // return Ref->doSomething(...); + // return getRefInBF().doSomethingElse(...); + const RefInBB *tryGetRefInBB() const { + assert(std::get_if<RefInBB>(&Reference) || + std::get_if<RefInBF>(&Reference)); + return std::get_if<RefInBB>(&Reference); + } + const RefInBF &getRefInBF() const { + assert(std::get_if<RefInBF>(&Reference)); + return *std::get_if<RefInBF>(&Reference); + } +}; + +static inline raw_ostream &operator<<(raw_ostream &OS, + const MCInstReference &Ref) { + return Ref.print(OS); +} + +} // namespace bolt +} // namespace llvm + +#endif diff --git a/bolt/include/bolt/Passes/PAuthGadgetScanner.h b/bolt/include/bolt/Passes/PAuthGadgetScanner.h index 721fd66..cb865a7 100644 --- a/bolt/include/bolt/Passes/PAuthGadgetScanner.h +++ b/bolt/include/bolt/Passes/PAuthGadgetScanner.h @@ -11,187 +11,13 @@ #include "bolt/Core/BinaryContext.h" #include "bolt/Core/BinaryFunction.h" +#include "bolt/Core/MCInstUtils.h" #include "bolt/Passes/BinaryPasses.h" #include "llvm/Support/raw_ostream.h" #include <memory> namespace llvm { namespace bolt { - -/// @brief MCInstReference represents a reference to an MCInst as stored either -/// in a BinaryFunction (i.e. before a CFG is created), or in a BinaryBasicBlock -/// (after a CFG is created). 
It aims to store the necessary information to be -/// able to find the specific MCInst in either the BinaryFunction or -/// BinaryBasicBlock data structures later, so that e.g. the InputAddress of -/// the corresponding instruction can be computed. - -struct MCInstInBBReference { - BinaryBasicBlock *BB; - int64_t BBIndex; - MCInstInBBReference(BinaryBasicBlock *BB, int64_t BBIndex) - : BB(BB), BBIndex(BBIndex) {} - MCInstInBBReference() : BB(nullptr), BBIndex(0) {} - static MCInstInBBReference get(const MCInst *Inst, BinaryFunction &BF) { - for (BinaryBasicBlock &BB : BF) - for (size_t I = 0; I < BB.size(); ++I) - if (Inst == &BB.getInstructionAtIndex(I)) - return MCInstInBBReference(&BB, I); - return {}; - } - bool operator==(const MCInstInBBReference &RHS) const { - return BB == RHS.BB && BBIndex == RHS.BBIndex; - } - bool operator<(const MCInstInBBReference &RHS) const { - return std::tie(BB, BBIndex) < std::tie(RHS.BB, RHS.BBIndex); - } - operator MCInst &() const { - assert(BB != nullptr); - return BB->getInstructionAtIndex(BBIndex); - } - uint64_t getAddress() const { - // 4 bytes per instruction on AArch64. - // FIXME: the assumption of 4 byte per instruction needs to be fixed before - // this method gets used on any non-AArch64 binaries (but should be fine for - // pac-ret analysis, as that is an AArch64-specific feature). - return BB->getFunction()->getAddress() + BB->getOffset() + BBIndex * 4; - } -}; - -raw_ostream &operator<<(raw_ostream &OS, const MCInstInBBReference &); - -struct MCInstInBFReference { - BinaryFunction *BF; - uint64_t Offset; - MCInstInBFReference(BinaryFunction *BF, uint64_t Offset) - : BF(BF), Offset(Offset) {} - - static MCInstInBFReference get(const MCInst *Inst, BinaryFunction &BF) { - for (auto &I : BF.instrs()) - if (Inst == &I.second) - return MCInstInBFReference(&BF, I.first); - return {}; - } - - MCInstInBFReference() : BF(nullptr), Offset(0) {} - bool operator==(const MCInstInBFReference &RHS) const { - return BF == RHS.BF && Offset == RHS.Offset; - } - bool operator<(const MCInstInBFReference &RHS) const { - return std::tie(BF, Offset) < std::tie(RHS.BF, RHS.Offset); - } - operator MCInst &() const { - assert(BF != nullptr); - return *BF->getInstructionAtOffset(Offset); - } - - uint64_t getOffset() const { return Offset; } - - uint64_t getAddress() const { return BF->getAddress() + getOffset(); } -}; - -raw_ostream &operator<<(raw_ostream &OS, const MCInstInBFReference &); - -struct MCInstReference { - enum Kind { FunctionParent, BasicBlockParent }; - Kind ParentKind; - union U { - MCInstInBBReference BBRef; - MCInstInBFReference BFRef; - U(MCInstInBBReference BBRef) : BBRef(BBRef) {} - U(MCInstInBFReference BFRef) : BFRef(BFRef) {} - } U; - MCInstReference(MCInstInBBReference BBRef) - : ParentKind(BasicBlockParent), U(BBRef) {} - MCInstReference(MCInstInBFReference BFRef) - : ParentKind(FunctionParent), U(BFRef) {} - MCInstReference(BinaryBasicBlock *BB, int64_t BBIndex) - : MCInstReference(MCInstInBBReference(BB, BBIndex)) {} - MCInstReference(BinaryFunction *BF, uint32_t Offset) - : MCInstReference(MCInstInBFReference(BF, Offset)) {} - - static MCInstReference get(const MCInst *Inst, BinaryFunction &BF) { - if (BF.hasCFG()) - return MCInstInBBReference::get(Inst, BF); - return MCInstInBFReference::get(Inst, BF); - } - - bool operator<(const MCInstReference &RHS) const { - if (ParentKind != RHS.ParentKind) - return ParentKind < RHS.ParentKind; - switch (ParentKind) { - case BasicBlockParent: - return U.BBRef < RHS.U.BBRef; - case FunctionParent: 
- return U.BFRef < RHS.U.BFRef; - } - llvm_unreachable(""); - } - - bool operator==(const MCInstReference &RHS) const { - if (ParentKind != RHS.ParentKind) - return false; - switch (ParentKind) { - case BasicBlockParent: - return U.BBRef == RHS.U.BBRef; - case FunctionParent: - return U.BFRef == RHS.U.BFRef; - } - llvm_unreachable(""); - } - - operator MCInst &() const { - switch (ParentKind) { - case BasicBlockParent: - return U.BBRef; - case FunctionParent: - return U.BFRef; - } - llvm_unreachable(""); - } - - operator bool() const { - switch (ParentKind) { - case BasicBlockParent: - return U.BBRef.BB != nullptr; - case FunctionParent: - return U.BFRef.BF != nullptr; - } - llvm_unreachable(""); - } - - uint64_t getAddress() const { - switch (ParentKind) { - case BasicBlockParent: - return U.BBRef.getAddress(); - case FunctionParent: - return U.BFRef.getAddress(); - } - llvm_unreachable(""); - } - - BinaryFunction *getFunction() const { - switch (ParentKind) { - case FunctionParent: - return U.BFRef.BF; - case BasicBlockParent: - return U.BBRef.BB->getFunction(); - } - llvm_unreachable(""); - } - - BinaryBasicBlock *getBasicBlock() const { - switch (ParentKind) { - case FunctionParent: - return nullptr; - case BasicBlockParent: - return U.BBRef.BB; - } - llvm_unreachable(""); - } -}; - -raw_ostream &operator<<(raw_ostream &OS, const MCInstReference &); - namespace PAuthGadgetScanner { // The report classes are designed to be used in an immutable manner. diff --git a/bolt/lib/Core/CMakeLists.txt b/bolt/lib/Core/CMakeLists.txt index fc72dc0..58cfcab 100644 --- a/bolt/lib/Core/CMakeLists.txt +++ b/bolt/lib/Core/CMakeLists.txt @@ -32,6 +32,7 @@ add_llvm_library(LLVMBOLTCore GDBIndex.cpp HashUtilities.cpp JumpTable.cpp + MCInstUtils.cpp MCPlusBuilder.cpp ParallelUtilities.cpp Relocation.cpp diff --git a/bolt/lib/Core/MCInstUtils.cpp b/bolt/lib/Core/MCInstUtils.cpp new file mode 100644 index 0000000..f505bf7 --- /dev/null +++ b/bolt/lib/Core/MCInstUtils.cpp @@ -0,0 +1,86 @@ +//===- bolt/Core/MCInstUtils.cpp ------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "bolt/Core/MCInstUtils.h" +#include "bolt/Core/BinaryBasicBlock.h" +#include "bolt/Core/BinaryFunction.h" + +#include <type_traits> + +using namespace llvm; +using namespace llvm::bolt; + +// It is assumed in a few places that BinaryBasicBlock stores its instructions +// in a contiguous vector. 
+using BasicBlockStorageIsVector = + std::is_same<BinaryBasicBlock::const_iterator, + std::vector<MCInst>::const_iterator>; +static_assert(BasicBlockStorageIsVector::value); + +MCInstReference MCInstReference::get(const MCInst &Inst, + const BinaryFunction &BF) { + if (BF.hasCFG()) { + for (BinaryBasicBlock &BB : BF) { + for (MCInst &MI : BB) + if (&MI == &Inst) + return MCInstReference(BB, Inst); + } + llvm_unreachable("Inst is not contained in BF"); + } + + for (auto I = BF.instrs().begin(), E = BF.instrs().end(); I != E; ++I) { + if (&I->second == &Inst) + return MCInstReference(BF, I); + } + llvm_unreachable("Inst is not contained in BF"); +} + +uint64_t MCInstReference::computeAddress(const MCCodeEmitter *Emitter) const { + assert(!empty() && "Taking instruction address by empty reference"); + + const BinaryContext &BC = getFunction()->getBinaryContext(); + if (auto *Ref = tryGetRefInBB()) { + const uint64_t AddressOfBB = + getFunction()->getAddress() + Ref->BB->getOffset(); + const MCInst *FirstInstInBB = &*Ref->BB->begin(); + const MCInst *ThisInst = &getMCInst(); + + // Usage of plain 'const MCInst *' as iterators assumes the instructions + // are stored in a vector, see BasicBlockStorageIsVector. + const uint64_t OffsetInBB = + BC.computeCodeSize(FirstInstInBB, ThisInst, Emitter); + + return AddressOfBB + OffsetInBB; + } + + auto &Ref = getRefInBF(); + const uint64_t OffsetInBF = Ref.It->first; + + return getFunction()->getAddress() + OffsetInBF; +} + +raw_ostream &MCInstReference::print(raw_ostream &OS) const { + if (const RefInBB *Ref = tryGetRefInBB()) { + OS << "MCInstBBRef<"; + if (Ref->BB == nullptr) + OS << "BB:(null)"; + else + OS << "BB:" << Ref->BB->getName() << ":" << Ref->Index; + OS << ">"; + return OS; + } + + const RefInBF &Ref = getRefInBF(); + OS << "MCInstBFRef<"; + if (Ref.BF == nullptr) + OS << "BF:(null)"; + else + OS << "BF:" << Ref.BF->getPrintName() << ":" << Ref.It->first; + OS << ">"; + return OS; +} diff --git a/bolt/lib/Passes/PAuthGadgetScanner.cpp b/bolt/lib/Passes/PAuthGadgetScanner.cpp index 65c84eb..cfe4b6b 100644 --- a/bolt/lib/Passes/PAuthGadgetScanner.cpp +++ b/bolt/lib/Passes/PAuthGadgetScanner.cpp @@ -24,39 +24,6 @@ namespace llvm { namespace bolt { - -raw_ostream &operator<<(raw_ostream &OS, const MCInstInBBReference &Ref) { - OS << "MCInstBBRef<"; - if (Ref.BB == nullptr) - OS << "BB:(null)"; - else - OS << "BB:" << Ref.BB->getName() << ":" << Ref.BBIndex; - OS << ">"; - return OS; -} - -raw_ostream &operator<<(raw_ostream &OS, const MCInstInBFReference &Ref) { - OS << "MCInstBFRef<"; - if (Ref.BF == nullptr) - OS << "BF:(null)"; - else - OS << "BF:" << Ref.BF->getPrintName() << ":" << Ref.getOffset(); - OS << ">"; - return OS; -} - -raw_ostream &operator<<(raw_ostream &OS, const MCInstReference &Ref) { - switch (Ref.ParentKind) { - case MCInstReference::BasicBlockParent: - OS << Ref.U.BBRef; - return OS; - case MCInstReference::FunctionParent: - OS << Ref.U.BFRef; - return OS; - } - llvm_unreachable(""); -} - namespace PAuthGadgetScanner { [[maybe_unused]] static void traceInst(const BinaryContext &BC, StringRef Label, @@ -91,10 +58,10 @@ template <typename T> static void iterateOverInstrs(BinaryFunction &BF, T Fn) { if (BF.hasCFG()) { for (BinaryBasicBlock &BB : BF) for (int64_t I = 0, E = BB.size(); I < E; ++I) - Fn(MCInstInBBReference(&BB, I)); + Fn(MCInstReference(BB, I)); } else { - for (auto I : BF.instrs()) - Fn(MCInstInBFReference(&BF, I.first)); + for (auto I = BF.instrs().begin(), E = BF.instrs().end(); I != E; ++I) + 
Fn(MCInstReference(BF, I)); } } @@ -564,11 +531,8 @@ public: const SrcState &S = getStateBefore(Inst); std::vector<MCInstReference> Result; - for (const MCInst *Inst : lastWritingInsts(S, ClobberedReg)) { - MCInstReference Ref = MCInstReference::get(Inst, BF); - assert(Ref && "Expected Inst to be found"); - Result.push_back(Ref); - } + for (const MCInst *Inst : lastWritingInsts(S, ClobberedReg)) + Result.push_back(MCInstReference::get(*Inst, BF)); return Result; } }; @@ -1136,11 +1100,8 @@ public: const DstState &S = getStateAfter(Inst); std::vector<MCInstReference> Result; - for (const MCInst *Inst : firstLeakingInsts(S, LeakedReg)) { - MCInstReference Ref = MCInstReference::get(Inst, BF); - assert(Ref && "Expected Inst to be found"); - Result.push_back(Ref); - } + for (const MCInst *Inst : firstLeakingInsts(S, LeakedReg)) + Result.push_back(MCInstReference::get(*Inst, BF)); return Result; } }; @@ -1345,8 +1306,7 @@ static bool shouldAnalyzeTailCallInst(const BinaryContext &BC, // (such as isBranch at the time of writing this comment), some don't (such // as isCall). For that reason, call MCInstrDesc's methods explicitly when // it is important. - const MCInstrDesc &Desc = - BC.MII->get(static_cast<const MCInst &>(Inst).getOpcode()); + const MCInstrDesc &Desc = BC.MII->get(Inst.getMCInst().getOpcode()); // Tail call should be a branch (but not necessarily an indirect one). if (!Desc.isBranch()) return false; @@ -1541,7 +1501,7 @@ void FunctionAnalysisContext::findUnsafeUses( // This is printed as "[message] in function [name], basic block ..., // at address ..." when the issue is reported to the user. Reports.push_back(make_generic_report( - MCInstReference::get(FirstInst, BF), + MCInstReference(BB, *FirstInst), "Warning: possibly imprecise CFG, the analysis quality may be " "degraded in this function. According to BOLT, unreachable code is " "found" /* in function [name]... */)); @@ -1705,48 +1665,44 @@ void Analysis::runOnFunction(BinaryFunction &BF, } } -static void printBB(const BinaryContext &BC, const BinaryBasicBlock *BB, +static void printBB(const BinaryContext &BC, const BinaryBasicBlock &BB, size_t StartIndex = 0, size_t EndIndex = -1) { if (EndIndex == (size_t)-1) - EndIndex = BB->size() - 1; - const BinaryFunction *BF = BB->getFunction(); + EndIndex = BB.size() - 1; + const BinaryFunction *BF = BB.getFunction(); for (unsigned I = StartIndex; I <= EndIndex; ++I) { - // FIXME: this assumes all instructions are 4 bytes in size. This is true - // for AArch64, but it might be good to extract this function so it can be - // used elsewhere and for other targets too. 
- uint64_t Address = BB->getOffset() + BF->getAddress() + 4 * I; - const MCInst &Inst = BB->getInstructionAtIndex(I); + MCInstReference Inst(BB, I); if (BC.MIB->isCFI(Inst)) continue; - BC.printInstruction(outs(), Inst, Address, BF); + BC.printInstruction(outs(), Inst, Inst.computeAddress(), BF); } } static void reportFoundGadgetInSingleBBSingleRelatedInst( raw_ostream &OS, const BinaryContext &BC, const MCInstReference RelatedInst, const MCInstReference Location) { - BinaryBasicBlock *BB = Location.getBasicBlock(); - assert(RelatedInst.ParentKind == MCInstReference::BasicBlockParent); - assert(Location.ParentKind == MCInstReference::BasicBlockParent); - MCInstInBBReference RelatedInstBB = RelatedInst.U.BBRef; - if (BB == RelatedInstBB.BB) { + const BinaryBasicBlock *BB = Location.getBasicBlock(); + assert(RelatedInst.hasCFG()); + assert(Location.hasCFG()); + if (BB == RelatedInst.getBasicBlock()) { OS << " This happens in the following basic block:\n"; - printBB(BC, BB); + printBB(BC, *BB); } } void Diagnostic::printBasicInfo(raw_ostream &OS, const BinaryContext &BC, StringRef IssueKind) const { - BinaryFunction *BF = Location.getFunction(); - BinaryBasicBlock *BB = Location.getBasicBlock(); + const BinaryBasicBlock *BB = Location.getBasicBlock(); + const BinaryFunction *BF = Location.getFunction(); + const uint64_t Address = Location.computeAddress(); OS << "\nGS-PAUTH: " << IssueKind; OS << " in function " << BF->getPrintName(); if (BB) OS << ", basic block " << BB->getName(); - OS << ", at address " << llvm::format("%x", Location.getAddress()) << "\n"; + OS << ", at address " << llvm::format("%x", Address) << "\n"; OS << " The instruction is "; - BC.printInstruction(OS, Location, Location.getAddress(), BF); + BC.printInstruction(OS, Location, Address, BF); } void GadgetDiagnostic::generateReport(raw_ostream &OS, @@ -1760,21 +1716,23 @@ static void printRelatedInstrs(raw_ostream &OS, const MCInstReference Location, const BinaryContext &BC = BF.getBinaryContext(); // Sort by address to ensure output is deterministic. - SmallVector<MCInstReference> RI(RelatedInstrs); - llvm::sort(RI, [](const MCInstReference &A, const MCInstReference &B) { - return A.getAddress() < B.getAddress(); - }); + SmallVector<std::pair<uint64_t, MCInstReference>> RI; + for (auto &InstRef : RelatedInstrs) + RI.push_back(std::make_pair(InstRef.computeAddress(), InstRef)); + llvm::sort(RI, [](auto A, auto B) { return A.first < B.first; }); + for (unsigned I = 0; I < RI.size(); ++I) { - MCInstReference InstRef = RI[I]; + auto [Address, InstRef] = RI[I]; OS << " " << (I + 1) << ". "; - BC.printInstruction(OS, InstRef, InstRef.getAddress(), &BF); + BC.printInstruction(OS, InstRef, Address, &BF); }; + if (RelatedInstrs.size() == 1) { const MCInstReference RelatedInst = RelatedInstrs[0]; // Printing the details for the MCInstReference::FunctionParent case // is not implemented not to overcomplicate the code, as most functions // are expected to have CFG information. 
- if (RelatedInst.ParentKind == MCInstReference::BasicBlockParent) + if (RelatedInst.hasCFG()) reportFoundGadgetInSingleBBSingleRelatedInst(OS, BC, RelatedInst, Location); } diff --git a/clang-tools-extra/clang-tidy/bugprone/InvalidEnumDefaultInitializationCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/InvalidEnumDefaultInitializationCheck.cpp index 1e65788..4fc1b3b 100644 --- a/clang-tools-extra/clang-tidy/bugprone/InvalidEnumDefaultInitializationCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/InvalidEnumDefaultInitializationCheck.cpp @@ -7,6 +7,8 @@ //===----------------------------------------------------------------------===// #include "InvalidEnumDefaultInitializationCheck.h" +#include "../utils/Matchers.h" +#include "../utils/OptionsUtils.h" #include "clang/AST/ASTContext.h" #include "clang/AST/TypeVisitor.h" #include "clang/ASTMatchers/ASTMatchFinder.h" @@ -88,12 +90,24 @@ public: InvalidEnumDefaultInitializationCheck::InvalidEnumDefaultInitializationCheck( StringRef Name, ClangTidyContext *Context) - : ClangTidyCheck(Name, Context) {} + : ClangTidyCheck(Name, Context), + IgnoredEnums( + utils::options::parseStringList(Options.get("IgnoredEnums", ""))) { + IgnoredEnums.emplace_back("::std::errc"); +} + +void InvalidEnumDefaultInitializationCheck::storeOptions( + ClangTidyOptions::OptionMap &Opts) { + Options.store(Opts, "IgnoredEnums", + utils::options::serializeStringList(IgnoredEnums)); +} void InvalidEnumDefaultInitializationCheck::registerMatchers( MatchFinder *Finder) { - auto EnumWithoutZeroValue = enumType( - hasDeclaration(enumDecl(isCompleteAndHasNoZeroValue()).bind("enum"))); + auto EnumWithoutZeroValue = enumType(hasDeclaration( + enumDecl(isCompleteAndHasNoZeroValue(), + unless(matchers::matchesAnyListedName(IgnoredEnums))) + .bind("enum"))); auto EnumOrArrayOfEnum = qualType(hasUnqualifiedDesugaredType( anyOf(EnumWithoutZeroValue, arrayType(hasElementType(qualType( diff --git a/clang-tools-extra/clang-tidy/bugprone/InvalidEnumDefaultInitializationCheck.h b/clang-tools-extra/clang-tidy/bugprone/InvalidEnumDefaultInitializationCheck.h index 4f1a4a2..5e2662f 100644 --- a/clang-tools-extra/clang-tidy/bugprone/InvalidEnumDefaultInitializationCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/InvalidEnumDefaultInitializationCheck.h @@ -24,6 +24,10 @@ public: ClangTidyContext *Context); void registerMatchers(ast_matchers::MatchFinder *Finder) override; void check(const ast_matchers::MatchFinder::MatchResult &Result) override; + void storeOptions(ClangTidyOptions::OptionMap &Opts) override; + +private: + std::vector<StringRef> IgnoredEnums; }; } // namespace clang::tidy::bugprone diff --git a/clang-tools-extra/clangd/CompileCommands.cpp b/clang-tools-extra/clangd/CompileCommands.cpp index 80391fe..c9da98e 100644 --- a/clang-tools-extra/clangd/CompileCommands.cpp +++ b/clang-tools-extra/clangd/CompileCommands.cpp @@ -270,7 +270,8 @@ void CommandMangler::operator()(tooling::CompileCommand &Command, if (auto *DashDash = ArgList.getLastArgNoClaim(driver::options::OPT__DASH_DASH)) { auto DashDashIndex = DashDash->getIndex() + 1; // +1 accounts for Cmd[0] - for (unsigned I = DashDashIndex; I < Cmd.size(); ++I) + // Another +1 so we don't treat the `--` itself as an input. 
+ for (unsigned I = DashDashIndex + 1; I < Cmd.size(); ++I) SawInput(Cmd[I]); Cmd.resize(DashDashIndex); } diff --git a/clang-tools-extra/clangd/unittests/CompileCommandsTests.cpp b/clang-tools-extra/clangd/unittests/CompileCommandsTests.cpp index 2ce2975..660540a 100644 --- a/clang-tools-extra/clangd/unittests/CompileCommandsTests.cpp +++ b/clang-tools-extra/clangd/unittests/CompileCommandsTests.cpp @@ -526,6 +526,25 @@ TEST(CommandMangler, RespectsOriginalSysroot) { Not(HasSubstr(testPath("fake/sysroot")))); } } + +TEST(CommandMangler, StdLatestFlag) { + const auto Mangler = CommandMangler::forTests(); + tooling::CompileCommand Cmd; + Cmd.CommandLine = {"clang-cl", "/std:c++latest", "--", "/Users/foo.cc"}; + Mangler(Cmd, "/Users/foo.cc"); + // Check that the /std:c++latest flag is not dropped + EXPECT_THAT(llvm::join(Cmd.CommandLine, " "), HasSubstr("/std:c++latest")); +} + +TEST(CommandMangler, StdLatestFlag_Inference) { + const auto Mangler = CommandMangler::forTests(); + tooling::CompileCommand Cmd; + Cmd.CommandLine = {"clang-cl", "/std:c++latest", "--", "/Users/foo.cc"}; + Mangler(Cmd, "/Users/foo.hpp"); + // Check that the /std:c++latest flag is not dropped during inference + EXPECT_THAT(llvm::join(Cmd.CommandLine, " "), HasSubstr("/std:c++latest")); +} + } // namespace } // namespace clangd } // namespace clang diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index c3a6d2f..62e1987 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -253,6 +253,10 @@ Changes in existing checks <clang-tidy/checks/bugprone/infinite-loop>` check by adding detection for variables introduced by structured bindings. +- Improved :doc:`bugprone-invalid-enum-default-initialization + <clang-tidy/checks/bugprone/invalid-enum-default-initialization>` with new + `IgnoredEnums` option to ignore specified enums during analysis. + - Improved :doc:`bugprone-narrowing-conversions <clang-tidy/checks/bugprone/narrowing-conversions>` check by fixing false positive from analysis of a conditional expression in C. diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/invalid-enum-default-initialization.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/invalid-enum-default-initialization.rst index a3bd2b6..45cb878 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/invalid-enum-default-initialization.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/invalid-enum-default-initialization.rst @@ -19,6 +19,9 @@ The check emits a warning only if an ``enum`` variable is default-initialized value of 0. The type can be a scoped or non-scoped ``enum``. Unions are not handled by the check (if it contains a member of enumeration type). +Note that the ``enum`` ``std::errc`` is always ignored because it is expected to +be default initialized, despite not defining an enumerator with the value 0. + .. code-block:: c++ enum class Enum1: int { @@ -70,3 +73,12 @@ enum type) are set to 0. enum Enum1 Array3[2][2] = {{Enum1_A, Enum1_A}}; // warn: elements of second array are initialized to 0 struct Struct1 S1 = {1}; // warn: element 'b' is initialized to 0 + + +Options +------- + +.. option:: IgnoredEnums + + Semicolon-separated list of regexes specifying enums for which this check won't be + enforced. Default is `::std::errc`. 
diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/invalid-enum-default-initialization.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/invalid-enum-default-initialization.cpp index eb3d563..85ff481 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/invalid-enum-default-initialization.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/invalid-enum-default-initialization.cpp @@ -1,4 +1,5 @@ -// RUN: %check_clang_tidy -std=c++17 %s bugprone-invalid-enum-default-initialization %t +// RUN: %check_clang_tidy -check-suffixes=,DEFAULT -std=c++17-or-later %s bugprone-invalid-enum-default-initialization %t +// RUN: %check_clang_tidy -std=c++17-or-later %s bugprone-invalid-enum-default-initialization %t -- -config="{CheckOptions: {bugprone-invalid-enum-default-initialization.IgnoredEnums: '::MyEnum'}}" enum class Enum0: int { A = 0, @@ -24,10 +25,10 @@ Enum0 E0_6{Enum0::B}; Enum1 E1_1{}; // CHECK-NOTES: :[[@LINE-1]]:11: warning: enum value of type 'Enum1' initialized with invalid value of 0, enum doesn't have a zero-value enumerator -// CHECK-NOTES: :8:12: note: enum is defined here +// CHECK-NOTES: :9:12: note: enum is defined here Enum1 E1_2 = Enum1(); // CHECK-NOTES: :[[@LINE-1]]:14: warning: enum value of type 'Enum1' initialized with invalid value of 0, enum doesn't have a zero-value enumerator -// CHECK-NOTES: :8:12: note: enum is defined here +// CHECK-NOTES: :9:12: note: enum is defined here Enum1 E1_3; Enum1 E1_4{0}; Enum1 E1_5{Enum1::A}; @@ -35,44 +36,44 @@ Enum1 E1_6{Enum1::B}; Enum2 E2_1{}; // CHECK-NOTES: :[[@LINE-1]]:11: warning: enum value of type 'Enum2' initialized with invalid value of 0, enum doesn't have a zero-value enumerator -// CHECK-NOTES: :13:6: note: enum is defined here +// CHECK-NOTES: :14:6: note: enum is defined here Enum2 E2_2 = Enum2(); // CHECK-NOTES: :[[@LINE-1]]:14: warning: enum value of type 'Enum2' initialized with invalid value of 0, enum doesn't have a zero-value enumerator -// CHECK-NOTES: :13:6: note: enum is defined here +// CHECK-NOTES: :14:6: note: enum is defined here void f1() { static Enum1 S; // FIMXE: warn for this? Enum1 A; Enum1 B = Enum1(); // CHECK-NOTES: :[[@LINE-1]]:13: warning: enum value of type 'Enum1' initialized with invalid value of 0, enum doesn't have a zero-value enumerator - // CHECK-NOTES: :8:12: note: enum is defined here + // CHECK-NOTES: :9:12: note: enum is defined here int C = int(); } void f2() { Enum1 A{}; // CHECK-NOTES: :[[@LINE-1]]:10: warning: enum value of type 'Enum1' initialized with invalid value of 0, enum doesn't have a zero-value enumerator - // CHECK-NOTES: :8:12: note: enum is defined here + // CHECK-NOTES: :9:12: note: enum is defined here Enum1 B = Enum1(); // CHECK-NOTES: :[[@LINE-1]]:13: warning: enum value of type 'Enum1' initialized with invalid value of 0, enum doesn't have a zero-value enumerator - // CHECK-NOTES: :8:12: note: enum is defined here + // CHECK-NOTES: :9:12: note: enum is defined here Enum1 C[5] = {{}}; // CHECK-NOTES: :[[@LINE-1]]:16: warning: enum value of type 'Enum1' initialized with invalid value of 0, enum doesn't have a zero-value enumerator - // CHECK-NOTES: :8:12: note: enum is defined here + // CHECK-NOTES: :9:12: note: enum is defined here // CHECK-NOTES: :[[@LINE-3]]:17: warning: enum value of type 'Enum1' initialized with invalid value of 0, enum doesn't have a zero-value enumerator - // CHECK-NOTES: :8:12: note: enum is defined here + // CHECK-NOTES: :9:12: note: enum is defined here Enum1 D[5] = {}; // FIMXE: warn for this? 
// CHECK-NOTES: :[[@LINE-1]]:16: warning: enum value of type 'Enum1' initialized with invalid value of 0, enum doesn't have a zero-value enumerator - // CHECK-NOTES: :8:12: note: enum is defined here + // CHECK-NOTES: :9:12: note: enum is defined here } struct S1 { Enum1 E_1{}; // CHECK-NOTES: :[[@LINE-1]]:12: warning: enum value of type 'Enum1' initialized with invalid value of 0, enum doesn't have a zero-value enumerator - // CHECK-NOTES: :8:12: note: enum is defined here + // CHECK-NOTES: :9:12: note: enum is defined here Enum1 E_2 = Enum1(); // CHECK-NOTES: :[[@LINE-1]]:15: warning: enum value of type 'Enum1' initialized with invalid value of 0, enum doesn't have a zero-value enumerator - // CHECK-NOTES: :8:12: note: enum is defined here + // CHECK-NOTES: :9:12: note: enum is defined here Enum1 E_3; Enum1 E_4; Enum1 E_5; @@ -80,10 +81,10 @@ struct S1 { S1() : E_3{}, // CHECK-NOTES: :[[@LINE-1]]:8: warning: enum value of type 'Enum1' initialized with invalid value of 0, enum doesn't have a zero-value enumerator - // CHECK-NOTES: :8:12: note: enum is defined here + // CHECK-NOTES: :9:12: note: enum is defined here E_4(), // CHECK-NOTES: :[[@LINE-1]]:8: warning: enum value of type 'Enum1' initialized with invalid value of 0, enum doesn't have a zero-value enumerator - // CHECK-NOTES: :8:12: note: enum is defined here + // CHECK-NOTES: :9:12: note: enum is defined here E_5{Enum1::B} {} }; @@ -110,22 +111,22 @@ struct S5 { S2 VarS2{}; // CHECK-NOTES: :[[@LINE-1]]:9: warning: enum value of type 'Enum1' initialized with invalid value of 0 -// CHECK-NOTES: :8:12: note: enum is defined here +// CHECK-NOTES: :9:12: note: enum is defined here // CHECK-NOTES: :[[@LINE-3]]:9: warning: enum value of type 'Enum2' initialized with invalid value of 0 -// CHECK-NOTES: :13:6: note: enum is defined here +// CHECK-NOTES: :14:6: note: enum is defined here S3 VarS3{}; // CHECK-NOTES: :[[@LINE-1]]:10: warning: enum value of type 'Enum1' initialized with invalid value of 0 -// CHECK-NOTES: :8:12: note: enum is defined here +// CHECK-NOTES: :9:12: note: enum is defined here // CHECK-NOTES: :[[@LINE-3]]:10: warning: enum value of type 'Enum2' initialized with invalid value of 0 -// CHECK-NOTES: :13:6: note: enum is defined here +// CHECK-NOTES: :14:6: note: enum is defined here S4 VarS4{}; // CHECK-NOTES: :[[@LINE-1]]:10: warning: enum value of type 'Enum1' initialized with invalid value of 0 -// CHECK-NOTES: :8:12: note: enum is defined here +// CHECK-NOTES: :9:12: note: enum is defined here // CHECK-NOTES: :[[@LINE-3]]:10: warning: enum value of type 'Enum2' initialized with invalid value of 0 -// CHECK-NOTES: :13:6: note: enum is defined here +// CHECK-NOTES: :14:6: note: enum is defined here S5 VarS5{}; // CHECK-NOTES: :[[@LINE-1]]:10: warning: enum value of type 'Enum1' initialized with invalid value of 0 -// CHECK-NOTES: :8:12: note: enum is defined here +// CHECK-NOTES: :9:12: note: enum is defined here enum class EnumFwd; @@ -139,7 +140,25 @@ template<typename T> struct Templ { T Mem1{}; // CHECK-NOTES: :[[@LINE-1]]:9: warning: enum value of type 'Enum1' initialized with invalid value of 0 - // CHECK-NOTES: :8:12: note: enum is defined here + // CHECK-NOTES: :9:12: note: enum is defined here }; Templ<Enum1> TemplVar; + +enum MyEnum { + A = 1, + B +}; + +MyEnum MyEnumVar{}; +// CHECK-NOTES-DEFAULT: :[[@LINE-1]]:17: warning: enum value of type 'MyEnum' initialized with invalid value of 0, enum doesn't have a zero-value enumerator +// CHECK-NOTES-DEFAULT: :148:6: note: enum is defined here + +namespace std 
{ + enum errc { + A = 1, + B + }; +} + +std::errc err{}; diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst index b503283..6bb99c7 100644 --- a/clang/docs/LanguageExtensions.rst +++ b/clang/docs/LanguageExtensions.rst @@ -2065,9 +2065,9 @@ The following type trait primitives are supported by Clang. Those traits marked Returns true if a reference ``T`` can be copy-initialized from a temporary of type a non-cv-qualified ``U``. * ``__underlying_type`` (C++, GNU, Microsoft) -* ``__builtin_lt_synthesises_from_spaceship``, ``__builtin_gt_synthesises_from_spaceship``, - ``__builtin_le_synthesises_from_spaceship``, ``__builtin_ge_synthesises_from_spaceship`` (Clang): - These builtins can be used to determine whether the corresponding operator is synthesised from a spaceship operator. +* ``__builtin_lt_synthesizes_from_spaceship``, ``__builtin_gt_synthesizes_from_spaceship``, + ``__builtin_le_synthesizes_from_spaceship``, ``__builtin_ge_synthesizes_from_spaceship`` (Clang): + These builtins can be used to determine whether the corresponding operator is synthesized from a spaceship operator. In addition, the following expression traits are supported: diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 79dc0b2..3b269cc 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -142,8 +142,8 @@ What's New in Clang |release|? C++ Language Changes -------------------- -- A new family of builtins ``__builtin_*_synthesises_from_spaceship`` has been added. These can be queried to know - whether the ``<`` (``lt``), ``>`` (``gt``), ``<=`` (``le``), or ``>=`` (``ge``) operators are synthesised from a +- A new family of builtins ``__builtin_*_synthesizes_from_spaceship`` has been added. These can be queried to know + whether the ``<`` (``lt``), ``>`` (``gt``), ``<=`` (``le``), or ``>=`` (``ge``) operators are synthesized from a ``<=>``. This makes it possible to optimize certain facilities by using the ``<=>`` operation directly instead of doing multiple comparisons. diff --git a/clang/include/clang/Basic/BuiltinsPPC.def b/clang/include/clang/Basic/BuiltinsPPC.def index db71efc..cf8bdd2 100644 --- a/clang/include/clang/Basic/BuiltinsPPC.def +++ b/clang/include/clang/Basic/BuiltinsPPC.def @@ -1105,6 +1105,13 @@ UNALIASED_CUSTOM_BUILTIN(mma_disassemble_dmr, "vv*W1024*", false, UNALIASED_CUSTOM_BUILTIN(mma_build_dmr, "vW1024*VVVVVVVV", false, "mma,isa-future-instructions") +UNALIASED_CUSTOM_BUILTIN(mma_dmsha2hash, "vW1024*W1024*Ii", true, + "mma,isa-future-instructions") +UNALIASED_CUSTOM_BUILTIN(mma_dmsha3hash, "vW2048*Ii", true, + "mma,isa-future-instructions") +UNALIASED_CUSTOM_BUILTIN(mma_dmxxshapad, "vW1024*VIiIiIi", true, + "mma,isa-future-instructions") + // MMA builtins with positive/negative multiply/accumulate. 
UNALIASED_CUSTOM_MMA_BUILTIN(mma_xvf16ger2, "vW512*VV", "mma,paired-vector-memops") diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td index 77e5995..e98bee2 100644 --- a/clang/include/clang/Basic/BuiltinsX86.td +++ b/clang/include/clang/Basic/BuiltinsX86.td @@ -1109,51 +1109,51 @@ let Features = "avx512vnni", Attributes = [NoThrow, Const, RequiredVectorWidth<5 } let Features = "avxvnniint8|avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { - def vpdpbssd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">; + def vpdpbssd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<16, char>, _Vector<16, char>)">; } let Features = "avxvnniint8|avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { - def vpdpbssd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">; + def vpdpbssd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<32, char>, _Vector<32, char>)">; } let Features = "avxvnniint8|avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { - def vpdpbssds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">; + def vpdpbssds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<16, char>, _Vector<16, char>)">; } let Features = "avxvnniint8|avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { - def vpdpbssds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">; + def vpdpbssds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<32, char>, _Vector<32, char>)">; } let Features = "avxvnniint8|avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { - def vpdpbsud128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">; + def vpdpbsud128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<16, char>, _Vector<16, unsigned char>)">; } let Features = "avxvnniint8|avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { - def vpdpbsud256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">; + def vpdpbsud256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<32, char>, _Vector<32, unsigned char>)">; } let Features = "avxvnniint8|avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { - def vpdpbsuds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">; + def vpdpbsuds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<16, char>, _Vector<16, unsigned char>)">; } let Features = "avxvnniint8|avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { - def vpdpbsuds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">; + def vpdpbsuds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<32, char>, _Vector<32, unsigned char>)">; } let Features = "avxvnniint8|avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { - def vpdpbuud128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">; + def vpdpbuud128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<16, unsigned char>, _Vector<16, unsigned char>)">; } let Features = "avxvnniint8|avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { - def vpdpbuud256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">; + def vpdpbuud256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<32, unsigned char>, 
_Vector<32, unsigned char>)">; } let Features = "avxvnniint8|avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { - def vpdpbuuds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">; + def vpdpbuuds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<16, unsigned char>, _Vector<16, unsigned char>)">; } let Features = "avxvnniint8|avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { - def vpdpbuuds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">; + def vpdpbuuds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<32, unsigned char>, _Vector<32, unsigned char>)">; } let Features = "movrs", Attributes = [NoThrow, Const] in { @@ -4282,12 +4282,12 @@ let Features = "avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<256> let Features = "avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { def vdpphps512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<32, _Float16>, _Vector<32, _Float16>)">; - def vpdpbssd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">; - def vpdpbssds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">; - def vpdpbsud512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">; - def vpdpbsuds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">; - def vpdpbuud512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">; - def vpdpbuuds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">; + def vpdpbssd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<64, char>, _Vector<64, char>)">; + def vpdpbssds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<64, char>, _Vector<64, char>)">; + def vpdpbsud512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<64, char>, _Vector<64, unsigned char>)">; + def vpdpbsuds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<64, char>, _Vector<64, unsigned char>)">; + def vpdpbuud512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<64, unsigned char>, _Vector<64, unsigned char>)">; + def vpdpbuuds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<64, unsigned char>, _Vector<64, unsigned char>)">; } let Features = "avx10.2", Attributes = [NoThrow, RequiredVectorWidth<512>] in { diff --git a/clang/include/clang/Basic/Diagnostic.h b/clang/include/clang/Basic/Diagnostic.h index af26a04..e540040 100644 --- a/clang/include/clang/Basic/Diagnostic.h +++ b/clang/include/clang/Basic/Diagnostic.h @@ -25,6 +25,7 @@ #include "llvm/ADT/IntrusiveRefCntPtr.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/iterator_range.h" #include "llvm/Support/Compiler.h" #include <cassert> @@ -1367,6 +1368,22 @@ inline const StreamingDiagnostic &operator<<(const StreamingDiagnostic &DB, } inline const StreamingDiagnostic &operator<<(const StreamingDiagnostic &DB, + const llvm::APSInt &Int) { + DB.AddString(toString(Int, /*Radix=*/10, Int.isSigned(), + /*formatAsCLiteral=*/false, + /*UpperCase=*/true, /*InsertSeparators=*/true)); + return DB; +} + +inline const StreamingDiagnostic &operator<<(const StreamingDiagnostic &DB, + const llvm::APInt &Int) { + DB.AddString(toString(Int, /*Radix=*/10, /*Signed=*/false, + /*formatAsCLiteral=*/false, + /*UpperCase=*/true, 
/*InsertSeparators=*/true)); + return DB; +} + +inline const StreamingDiagnostic &operator<<(const StreamingDiagnostic &DB, int I) { DB.AddTaggedVal(I, DiagnosticsEngine::ak_sint); return DB; diff --git a/clang/include/clang/Basic/PPCTypes.def b/clang/include/clang/Basic/PPCTypes.def index fc4155c..9c0fa91 100644 --- a/clang/include/clang/Basic/PPCTypes.def +++ b/clang/include/clang/Basic/PPCTypes.def @@ -30,6 +30,7 @@ #endif +PPC_VECTOR_MMA_TYPE(__dmr2048, DMR2048, 2048) PPC_VECTOR_MMA_TYPE(__dmr1024, DMR1024, 1024) PPC_VECTOR_MMA_TYPE(__vector_quad, VectorQuad, 512) PPC_VECTOR_VSX_TYPE(__vector_pair, VectorPair, 256) diff --git a/clang/include/clang/Basic/TokenKinds.def b/clang/include/clang/Basic/TokenKinds.def index 9d1a23d..564d601 100644 --- a/clang/include/clang/Basic/TokenKinds.def +++ b/clang/include/clang/Basic/TokenKinds.def @@ -552,10 +552,10 @@ TYPE_TRAIT_1(__can_pass_in_regs, CanPassInRegs, KEYCXX) TYPE_TRAIT_2(__reference_binds_to_temporary, ReferenceBindsToTemporary, KEYCXX) TYPE_TRAIT_2(__reference_constructs_from_temporary, ReferenceConstructsFromTemporary, KEYCXX) TYPE_TRAIT_2(__reference_converts_from_temporary, ReferenceConvertsFromTemporary, KEYCXX) -TYPE_TRAIT_2(__builtin_lt_synthesises_from_spaceship, LtSynthesisesFromSpaceship, KEYCXX) -TYPE_TRAIT_2(__builtin_le_synthesises_from_spaceship, LeSynthesisesFromSpaceship, KEYCXX) -TYPE_TRAIT_2(__builtin_gt_synthesises_from_spaceship, GtSynthesisesFromSpaceship, KEYCXX) -TYPE_TRAIT_2(__builtin_ge_synthesises_from_spaceship, GeSynthesisesFromSpaceship, KEYCXX) +TYPE_TRAIT_2(__builtin_lt_synthesizes_from_spaceship, LtSynthesizesFromSpaceship, KEYCXX) +TYPE_TRAIT_2(__builtin_le_synthesizes_from_spaceship, LeSynthesizesFromSpaceship, KEYCXX) +TYPE_TRAIT_2(__builtin_gt_synthesizes_from_spaceship, GtSynthesizesFromSpaceship, KEYCXX) +TYPE_TRAIT_2(__builtin_ge_synthesizes_from_spaceship, GeSynthesizesFromSpaceship, KEYCXX) // IsDeducible is only used internally by clang for CTAD implementation and // is not exposed to users. TYPE_TRAIT_2(/*EmptySpellingName*/, IsDeducible, KEYCXX) diff --git a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h index a3f167e..3f83c30 100644 --- a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h +++ b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h @@ -243,6 +243,13 @@ public: return cir::AllocaOp::create(*this, loc, addrType, type, name, alignment); } + mlir::Value createAlloca(mlir::Location loc, cir::PointerType addrType, + mlir::Type type, llvm::StringRef name, + clang::CharUnits alignment) { + mlir::IntegerAttr alignmentAttr = getAlignmentAttr(alignment); + return createAlloca(loc, addrType, type, name, alignmentAttr); + } + /// Get constant address of a global variable as an MLIR attribute. /// This wrapper infers the attribute type through the global op. cir::GlobalViewAttr getGlobalViewAttr(cir::GlobalOp globalOp, diff --git a/clang/include/clang/CodeGen/ModuleBuilder.h b/clang/include/clang/CodeGen/ModuleBuilder.h index 59b9840..f1b8229 100644 --- a/clang/include/clang/CodeGen/ModuleBuilder.h +++ b/clang/include/clang/CodeGen/ModuleBuilder.h @@ -52,6 +52,12 @@ namespace CodeGen { class CodeGenerator : public ASTConsumer { virtual void anchor(); +protected: + /// True if we've finished generating IR. This prevents us from generating + /// additional LLVM IR after emitting output in HandleTranslationUnit. This + /// can happen when Clang plugins trigger additional AST deserialization. 
+ bool IRGenFinished = false; + public: /// Return an opaque reference to the CodeGenModule object, which can /// be used in various secondary APIs. It is valid as long as the diff --git a/clang/include/clang/Serialization/ASTBitCodes.h b/clang/include/clang/Serialization/ASTBitCodes.h index 99864c7..5d09d55 100644 --- a/clang/include/clang/Serialization/ASTBitCodes.h +++ b/clang/include/clang/Serialization/ASTBitCodes.h @@ -1160,7 +1160,7 @@ enum PredefinedTypeIDs { /// /// Type IDs for non-predefined types will start at /// NUM_PREDEF_TYPE_IDs. -const unsigned NUM_PREDEF_TYPE_IDS = 513; +const unsigned NUM_PREDEF_TYPE_IDS = 514; // Ensure we do not overrun the predefined types we reserved // in the enum PredefinedTypeIDs above. diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index 0fd0e7e..056bfe3 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -3501,6 +3501,7 @@ static void encodeTypeForFunctionPointerAuth(const ASTContext &Ctx, case BuiltinType::VectorQuad: case BuiltinType::VectorPair: case BuiltinType::DMR1024: + case BuiltinType::DMR2048: OS << "?"; return; diff --git a/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp b/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp index 4a8aac90..5596499 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp @@ -131,9 +131,7 @@ public: std::string("AggExprEmitter::VisitStmt: ") + s->getStmtClassName()); } - void VisitParenExpr(ParenExpr *pe) { - cgf.cgm.errorNYI(pe->getSourceRange(), "AggExprEmitter: VisitParenExpr"); - } + void VisitParenExpr(ParenExpr *pe) { Visit(pe->getSubExpr()); } void VisitGenericSelectionExpr(GenericSelectionExpr *ge) { cgf.cgm.errorNYI(ge->getSourceRange(), "AggExprEmitter: VisitGenericSelectionExpr"); diff --git a/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.cpp b/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.cpp index a4c2641..e41c2d85 100644 --- a/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.cpp @@ -10,6 +10,8 @@ // //===----------------------------------------------------------------------===// +#include <numeric> + #include "CIRGenOpenACCRecipe.h" namespace clang::CIRGen { @@ -35,6 +37,110 @@ mlir::Block *OpenACCRecipeBuilderBase::createRecipeBlock(mlir::Region ®ion, return builder.createBlock(®ion, region.end(), types, locs); } +mlir::Value OpenACCRecipeBuilderBase::makeBoundsAlloca( + mlir::Block *block, SourceRange exprRange, mlir::Location loc, + std::string_view allocaName, size_t numBounds, + llvm::ArrayRef<QualType> boundTypes) { + mlir::OpBuilder::InsertionGuard guardCase(builder); + + // Get the range of bounds arguments, which are all but the 1st arg. + llvm::ArrayRef<mlir::BlockArgument> boundsRange = + block->getArguments().drop_front(1); + + // boundTypes contains the before and after of each bounds, so it ends up + // having 1 extra. Assert this is the case to ensure we don't call this in the + // wrong 'block'. 
+ assert(boundsRange.size() + 1 == boundTypes.size()); + + mlir::Type itrTy = cgf.cgm.convertType(cgf.getContext().UnsignedLongLongTy); + auto idxType = mlir::IndexType::get(&cgf.getMLIRContext()); + + auto getUpperBound = [&](mlir::Value bound) { + auto upperBoundVal = + mlir::acc::GetUpperboundOp::create(builder, loc, idxType, bound); + return mlir::UnrealizedConversionCastOp::create(builder, loc, itrTy, + upperBoundVal.getResult()) + .getResult(0); + }; + + auto isArrayTy = [&](QualType ty) { + if (ty->isArrayType() && !ty->isConstantArrayType()) + cgf.cgm.errorNYI(exprRange, "OpenACC recipe init for VLAs"); + return ty->isConstantArrayType(); + }; + + mlir::Type topLevelTy = cgf.convertType(boundTypes.back()); + cir::PointerType topLevelTyPtr = builder.getPointerTo(topLevelTy); + // Do an alloca for the 'top' level type without bounds. + mlir::Value initialAlloca = builder.createAlloca( + loc, topLevelTyPtr, topLevelTy, allocaName, + cgf.getContext().getTypeAlignInChars(boundTypes.back())); + + bool lastBoundWasArray = isArrayTy(boundTypes.back()); + + // Since we're iterating the types in reverse, this sets up for each index + // corresponding to the boundsRange to be the 'after application of the + // bounds. + llvm::ArrayRef<QualType> boundResults = boundTypes.drop_back(1); + + // Collect the 'do we have any allocas needed after this type' list. + llvm::SmallVector<bool> allocasLeftArr; + llvm::ArrayRef<QualType> resultTypes = boundTypes.drop_front(); + std::transform_inclusive_scan( + resultTypes.begin(), resultTypes.end(), + std::back_inserter(allocasLeftArr), std::plus<bool>{}, + [](QualType ty) { return !ty->isConstantArrayType(); }); + + // Keep track of the number of 'elements' that we're allocating. Individual + // allocas should multiply this by the size of its current allocation. + mlir::Value cumulativeElts; + for (auto [bound, resultType, allocasLeft] : llvm::reverse( + llvm::zip_equal(boundsRange, boundResults, allocasLeftArr))) { + + // if there is no further 'alloca' operation we need to do, we can skip + // creating the UB/multiplications/etc. + if (!allocasLeft) + break; + + // First: figure out the number of elements in the current 'bound' list. + mlir::Value eltsPerSubArray = getUpperBound(bound); + mlir::Value eltsToAlloca; + + // IF we are in a sub-bounds, the total number of elements to alloca is + // the product of that one and the current 'bounds' size. That is, + // arr[5][5], we would need 25 elements, not just 5. Else it is just the + // current number of elements. + if (cumulativeElts) + eltsToAlloca = builder.createMul(loc, eltsPerSubArray, cumulativeElts); + else + eltsToAlloca = eltsPerSubArray; + + if (!lastBoundWasArray) { + // If we have to do an allocation, figure out the size of the + // allocation. alloca takes the number of bytes, not elements. + TypeInfoChars eltInfo = cgf.getContext().getTypeInfoInChars(resultType); + cir::ConstantOp eltSize = builder.getConstInt( + loc, itrTy, eltInfo.Width.alignTo(eltInfo.Align).getQuantity()); + mlir::Value curSize = builder.createMul(loc, eltsToAlloca, eltSize); + + mlir::Type eltTy = cgf.convertType(resultType); + cir::PointerType ptrTy = builder.getPointerTo(eltTy); + builder.createAlloca(loc, ptrTy, eltTy, "openacc.init.bounds", + cgf.getContext().getTypeAlignInChars(resultType), + curSize); + + // TODO: OpenACC : At this point we should be copying the addresses of + // each element of this to the last allocation. At the moment, that is + // not yet implemented. 
+ cgf.cgm.errorNYI(exprRange, "OpenACC recipe alloca copying"); + } + + cumulativeElts = eltsToAlloca; + lastBoundWasArray = isArrayTy(resultType); + } + return initialAlloca; +} + mlir::Value OpenACCRecipeBuilderBase::createBoundsLoop(mlir::Value subscriptedValue, mlir::Value bound, @@ -258,7 +364,11 @@ void OpenACCRecipeBuilderBase::createPrivateInitRecipe( cgf.emitAutoVarAlloca(*allocaDecl, builder.saveInsertionPoint()); cgf.emitAutoVarInit(tempDeclEmission); } else { - cgf.cgm.errorNYI(exprRange, "private-init with bounds"); + makeBoundsAlloca(block, exprRange, loc, "openacc.private.init", numBounds, + boundTypes); + + if (initExpr) + cgf.cgm.errorNYI(exprRange, "private-init with bounds initialization"); } mlir::acc::YieldOp::create(builder, locEnd); diff --git a/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.h b/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.h index 978c671..acd187b 100644 --- a/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.h +++ b/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.h @@ -24,6 +24,13 @@ namespace clang::CIRGen { class OpenACCRecipeBuilderBase { + // This function generates the required alloca, similar to + // 'emitAutoVarAlloca', except for the OpenACC array/pointer types. + mlir::Value makeBoundsAlloca(mlir::Block *block, SourceRange exprRange, + mlir::Location loc, std::string_view allocaName, + size_t numBounds, + llvm::ArrayRef<QualType> boundTypes); + protected: CIRGen::CIRGenFunction &cgf; CIRGen::CIRGenBuilderTy &builder; @@ -165,28 +172,9 @@ class OpenACCRecipeBuilder : OpenACCRecipeBuilderBase { cgf.emitAutoVarAlloca(*varRecipe, builder.saveInsertionPoint()); // 'firstprivate' doesn't do its initialization in the 'init' section, - // instead does it in the 'copy' section. SO only do init here. - // 'reduction' appears to use it too (rather than a 'copy' section), so - // we probably have to do it here too, but we can do that when we get to - // reduction implementation. - if constexpr (std::is_same_v<RecipeTy, mlir::acc::PrivateRecipeOp>) { - // We are OK with no init for builtins, arrays of builtins, or pointers, - // else we should NYI so we know to go look for these. - if (cgf.getContext().getLangOpts().CPlusPlus && - !varRecipe->getType() - ->getPointeeOrArrayElementType() - ->isBuiltinType() && - !varRecipe->getType()->isPointerType() && !varRecipe->getInit()) { - // If we don't have any initialization recipe, we failed during Sema to - // initialize this correctly. If we disable the - // Sema::TentativeAnalysisScopes in SemaOpenACC::CreateInitRecipe, it'll - // emit an error to tell us. However, emitting those errors during - // production is a violation of the standard, so we cannot do them. - cgf.cgm.errorNYI(exprRange, "private default-init recipe"); - } - cgf.emitAutoVarInit(tempDeclEmission); - } else if constexpr (std::is_same_v<RecipeTy, - mlir::acc::ReductionRecipeOp>) { + // instead it does it in the 'copy' section. SO, only do 'init' here for + // reduction. + if constexpr (std::is_same_v<RecipeTy, mlir::acc::ReductionRecipeOp>) { // Unlike Private, the recipe here is always required as it has to do // init, not just 'default' init. if (!varRecipe->getInit()) diff --git a/clang/lib/CodeGen/BackendConsumer.h b/clang/lib/CodeGen/BackendConsumer.h index ad3adfc..b7bbb81 100644 --- a/clang/lib/CodeGen/BackendConsumer.h +++ b/clang/lib/CodeGen/BackendConsumer.h @@ -40,11 +40,6 @@ class BackendConsumer : public ASTConsumer { llvm::Timer LLVMIRGeneration; unsigned LLVMIRGenerationRefCount = 0; - /// True if we've finished generating IR. 
This prevents us from generating - /// additional LLVM IR after emitting output in HandleTranslationUnit. This - /// can happen when Clang plugins trigger additional AST deserialization. - bool IRGenFinished = false; - bool TimerIsEnabled = false; BackendAction Action; diff --git a/clang/lib/CodeGen/CodeGenAction.cpp b/clang/lib/CodeGen/CodeGenAction.cpp index 9286f1f2..60d6b7f 100644 --- a/clang/lib/CodeGen/CodeGenAction.cpp +++ b/clang/lib/CodeGen/CodeGenAction.cpp @@ -190,9 +190,7 @@ void BackendConsumer::HandleInlineFunctionDefinition(FunctionDecl *D) { } void BackendConsumer::HandleInterestingDecl(DeclGroupRef D) { - // Ignore interesting decls from the AST reader after IRGen is finished. - if (!IRGenFinished) - HandleTopLevelDecl(D); + HandleTopLevelDecl(D); } // Links each entry in LinkModules into our module. Returns true on error. @@ -243,8 +241,6 @@ void BackendConsumer::HandleTranslationUnit(ASTContext &C) { if (TimerIsEnabled && !--LLVMIRGenerationRefCount) LLVMIRGeneration.yieldTo(CI.getFrontendTimer()); - - IRGenFinished = true; } // Silently ignore if we weren't initialized for some reason. diff --git a/clang/lib/CodeGen/ModuleBuilder.cpp b/clang/lib/CodeGen/ModuleBuilder.cpp index 8c1fee8..96f3f62 100644 --- a/clang/lib/CodeGen/ModuleBuilder.cpp +++ b/clang/lib/CodeGen/ModuleBuilder.cpp @@ -138,6 +138,8 @@ namespace { assert(!M && "Replacing existing Module?"); M.reset(new llvm::Module(ExpandModuleName(ModuleName, CodeGenOpts), C)); + IRGenFinished = false; + std::unique_ptr<CodeGenModule> OldBuilder = std::move(Builder); Initialize(*Ctx); @@ -179,6 +181,10 @@ namespace { } bool HandleTopLevelDecl(DeclGroupRef DG) override { + // Ignore interesting decls from the AST reader after IRGen is finished. + if (IRGenFinished) + return true; // We can't CodeGen more but pass to other consumers. + // FIXME: Why not return false and abort parsing? 
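[Editorial aside, not part of the patch: the hunks above move the IRGenFinished flag from BackendConsumer into the code generator itself. The intended behaviour, sketched below with hypothetical helpers (emitDeclGroup and finalizeModule are stand-ins, not the real CodeGeneratorImpl members), is that declarations deserialized after HandleTranslationUnit has finished, for example by plugins triggering extra AST reads, are acknowledged but no longer lowered to IR:

    #include "clang/AST/ASTConsumer.h"
    #include "clang/AST/DeclGroup.h"

    class GuardedGenerator : public clang::ASTConsumer {
      bool IRGenFinished = false;

    public:
      bool HandleTopLevelDecl(clang::DeclGroupRef DG) override {
        if (IRGenFinished)
          return true; // Can't codegen any more, but keep parsing for others.
        emitDeclGroup(DG);
        return true;
      }

      void HandleTranslationUnit(clang::ASTContext &) override {
        finalizeModule();
        IRGenFinished = true; // Anything arriving later is ignored above.
      }

    private:
      void emitDeclGroup(clang::DeclGroupRef) {} // hypothetical helper
      void finalizeModule() {}                   // hypothetical helper
    };

Clearing the flag again when a new module is started, as the first ModuleBuilder.cpp hunk does, keeps incremental consumers such as the interpreter working. End of aside.]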
if (Diags.hasUnrecoverableErrorOccurred()) return true; @@ -292,8 +298,9 @@ namespace { if (Builder) Builder->clear(); M.reset(); - return; } + + IRGenFinished = true; } void AssignInheritanceModel(CXXRecordDecl *RD) override { diff --git a/clang/lib/CodeGen/TargetBuiltins/PPC.cpp b/clang/lib/CodeGen/TargetBuiltins/PPC.cpp index ba65cf1..e71dc9e 100644 --- a/clang/lib/CodeGen/TargetBuiltins/PPC.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/PPC.cpp @@ -1153,7 +1153,8 @@ Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID, } if (BuiltinID == PPC::BI__builtin_mma_dmmr || BuiltinID == PPC::BI__builtin_mma_dmxor || - BuiltinID == PPC::BI__builtin_mma_disassemble_dmr) { + BuiltinID == PPC::BI__builtin_mma_disassemble_dmr || + BuiltinID == PPC::BI__builtin_mma_dmsha2hash) { Address Addr = EmitPointerWithAlignment(E->getArg(1)); Ops[1] = Builder.CreateLoad(Addr); } diff --git a/clang/lib/Format/FormatToken.h b/clang/lib/Format/FormatToken.h index e04b0e7..a28446a 100644 --- a/clang/lib/Format/FormatToken.h +++ b/clang/lib/Format/FormatToken.h @@ -55,7 +55,7 @@ namespace format { TYPE(ConflictAlternative) \ TYPE(ConflictEnd) \ TYPE(ConflictStart) \ - /* l_brace of if/for/while */ \ + /* l_brace of if/for/while/switch/catch */ \ TYPE(ControlStatementLBrace) \ TYPE(ControlStatementRBrace) \ TYPE(CppCastLParen) \ diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index 67066a1..0c9c88a 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -4021,29 +4021,28 @@ void TokenAnnotator::calculateFormattingInformation(AnnotatedLine &Line) const { } } - if (IsCpp && - (LineIsFunctionDeclaration || - (FirstNonComment && FirstNonComment->is(TT_CtorDtorDeclName))) && - Line.endsWith(tok::semi, tok::r_brace)) { - auto *Tok = Line.Last->Previous; - while (Tok->isNot(tok::r_brace)) - Tok = Tok->Previous; - if (auto *LBrace = Tok->MatchingParen; LBrace && LBrace->is(TT_Unknown)) { - assert(LBrace->is(tok::l_brace)); - Tok->setBlockKind(BK_Block); - LBrace->setBlockKind(BK_Block); - LBrace->setFinalizedType(TT_FunctionLBrace); + if (IsCpp) { + if ((LineIsFunctionDeclaration || + (FirstNonComment && FirstNonComment->is(TT_CtorDtorDeclName))) && + Line.endsWith(tok::semi, tok::r_brace)) { + auto *Tok = Line.Last->Previous; + while (Tok->isNot(tok::r_brace)) + Tok = Tok->Previous; + if (auto *LBrace = Tok->MatchingParen; LBrace && LBrace->is(TT_Unknown)) { + assert(LBrace->is(tok::l_brace)); + Tok->setBlockKind(BK_Block); + LBrace->setBlockKind(BK_Block); + LBrace->setFinalizedType(TT_FunctionLBrace); + } } - } - if (IsCpp && SeenName && AfterLastAttribute && - mustBreakAfterAttributes(*AfterLastAttribute, Style)) { - AfterLastAttribute->MustBreakBefore = true; - if (LineIsFunctionDeclaration) - Line.ReturnTypeWrapped = true; - } + if (SeenName && AfterLastAttribute && + mustBreakAfterAttributes(*AfterLastAttribute, Style)) { + AfterLastAttribute->MustBreakBefore = true; + if (LineIsFunctionDeclaration) + Line.ReturnTypeWrapped = true; + } - if (IsCpp) { if (!LineIsFunctionDeclaration) { // Annotate */&/&& in `operator` function calls as binary operators. 
for (const auto *Tok = FirstNonComment; Tok; Tok = Tok->Next) { @@ -4089,6 +4088,11 @@ void TokenAnnotator::calculateFormattingInformation(AnnotatedLine &Line) const { } } + if (First->is(TT_ElseLBrace)) { + First->CanBreakBefore = true; + First->MustBreakBefore = true; + } + bool InFunctionDecl = Line.MightBeFunctionDecl; bool InParameterList = false; for (auto *Current = First->Next; Current; Current = Current->Next) { diff --git a/clang/lib/Frontend/FrontendActions.cpp b/clang/lib/Frontend/FrontendActions.cpp index 7424958..d7d56b8 100644 --- a/clang/lib/Frontend/FrontendActions.cpp +++ b/clang/lib/Frontend/FrontendActions.cpp @@ -971,14 +971,17 @@ void DumpModuleInfoAction::ExecuteAction() { // Emit the macro definitions in the module file so that we can know how // much definitions in the module file quickly. // TODO: Emit the macro definition bodies completely. - if (auto FilteredMacros = llvm::make_filter_range( - R->getPreprocessor().macros(), - [](const auto &Macro) { return Macro.first->isFromAST(); }); - !FilteredMacros.empty()) { - Out << " Macro Definitions:\n"; - for (/*<IdentifierInfo *, MacroState> pair*/ const auto &Macro : - FilteredMacros) - Out << " " << Macro.first->getName() << "\n"; + { + std::vector<StringRef> MacroNames; + for (const auto &M : R->getPreprocessor().macros()) { + if (M.first->isFromAST()) + MacroNames.push_back(M.first->getName()); + } + llvm::sort(MacroNames); + if (!MacroNames.empty()) + Out << " Macro Definitions:\n"; + for (StringRef Name : MacroNames) + Out << " " << Name << "\n"; } // Now let's print out any modules we did not see as part of the Primary. diff --git a/clang/lib/Headers/avx10_2_512niintrin.h b/clang/lib/Headers/avx10_2_512niintrin.h index 67679fc..fdb57c7 100644 --- a/clang/lib/Headers/avx10_2_512niintrin.h +++ b/clang/lib/Headers/avx10_2_512niintrin.h @@ -64,8 +64,8 @@ static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_maskz_dpph_ps(__mmask16 __U, static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpbssd_epi32(__m512i __W, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_vpdpbssd512((__v16si)__W, (__v16si)__A, - (__v16si)__B); + return (__m512i)__builtin_ia32_vpdpbssd512((__v16si)__W, (__v64qi)__A, + (__v64qi)__B); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -84,8 +84,8 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbssd_epi32( static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpbssds_epi32(__m512i __W, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_vpdpbssds512((__v16si)__W, (__v16si)__A, - (__v16si)__B); + return (__m512i)__builtin_ia32_vpdpbssds512((__v16si)__W, (__v64qi)__A, + (__v64qi)__B); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpbssds_epi32( @@ -104,8 +104,8 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbssds_epi32( static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpbsud_epi32(__m512i __W, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_vpdpbsud512((__v16si)__W, (__v16si)__A, - (__v16si)__B); + return (__m512i)__builtin_ia32_vpdpbsud512((__v16si)__W, (__v64qi)__A, + (__v64qu)__B); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -124,8 +124,8 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbsud_epi32( static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpbsuds_epi32(__m512i __W, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_vpdpbsuds512((__v16si)__W, (__v16si)__A, - (__v16si)__B); + return (__m512i)__builtin_ia32_vpdpbsuds512((__v16si)__W, (__v64qi)__A, + (__v64qu)__B); } static __inline__ 
__m512i __DEFAULT_FN_ATTRS _mm512_mask_dpbsuds_epi32( @@ -144,8 +144,8 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbsuds_epi32( static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpbuud_epi32(__m512i __W, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_vpdpbuud512((__v16si)__W, (__v16si)__A, - (__v16si)__B); + return (__m512i)__builtin_ia32_vpdpbuud512((__v16si)__W, (__v64qu)__A, + (__v64qu)__B); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -164,8 +164,8 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbuud_epi32( static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpbuuds_epi32(__m512i __W, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_vpdpbuuds512((__v16si)__W, (__v16si)__A, - (__v16si)__B); + return (__m512i)__builtin_ia32_vpdpbuuds512((__v16si)__W, (__v64qu)__A, + (__v64qu)__B); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpbuuds_epi32( diff --git a/clang/lib/Headers/avxvnniint8intrin.h b/clang/lib/Headers/avxvnniint8intrin.h index c211620..858b66b 100644 --- a/clang/lib/Headers/avxvnniint8intrin.h +++ b/clang/lib/Headers/avxvnniint8intrin.h @@ -14,6 +14,7 @@ #ifndef __AVXVNNIINT8INTRIN_H #define __AVXVNNIINT8INTRIN_H +// clang-format off /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with /// corresponding signed 8-bit integers in \a __B, producing 4 intermediate /// signed 16-bit results. Sum these 4 results with the corresponding @@ -44,10 +45,12 @@ /// ENDFOR /// dst[MAX:128] := 0 /// \endcode +// clang-format on #define _mm_dpbssd_epi32(__W, __A, __B) \ - ((__m128i)__builtin_ia32_vpdpbssd128((__v4si)(__W), (__v4si)(__A), \ - (__v4si)(__B))) + ((__m128i)__builtin_ia32_vpdpbssd128((__v4si)(__W), (__v16qi)(__A), \ + (__v16qi)(__B))) +// clang-format off /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with /// corresponding signed 8-bit integers in \a __B, producing 4 intermediate /// signed 16-bit results. Sum these 4 results with the corresponding @@ -78,10 +81,12 @@ /// ENDFOR /// dst[MAX:256] := 0 /// \endcode +// clang-format on #define _mm256_dpbssd_epi32(__W, __A, __B) \ - ((__m256i)__builtin_ia32_vpdpbssd256((__v8si)(__W), (__v8si)(__A), \ - (__v8si)(__B))) + ((__m256i)__builtin_ia32_vpdpbssd256((__v8si)(__W), (__v32qi)(__A), \ + (__v32qi)(__B))) +// clang-format off /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with /// corresponding signed 8-bit integers in \a __B, producing 4 intermediate /// signed 16-bit results. Sum these 4 results with the corresponding @@ -94,7 +99,7 @@ /// _mm_dpbssds_epi32( __m128i __W, __m128i __A, __m128i __B); /// \endcode /// -/// This intrinsic corresponds to the \c VPDPBSSD instruction. +/// This intrinsic corresponds to the \c VPDPBSSDS instruction. /// /// \param __A /// A 128-bit vector of [16 x char]. @@ -113,10 +118,12 @@ /// ENDFOR /// dst[MAX:128] := 0 /// \endcode +// clang-format on #define _mm_dpbssds_epi32(__W, __A, __B) \ - ((__m128i)__builtin_ia32_vpdpbssds128((__v4si)(__W), (__v4si)(__A), \ - (__v4si)(__B))) + ((__m128i)__builtin_ia32_vpdpbssds128((__v4si)(__W), (__v16qi)(__A), \ + (__v16qi)(__B))) +// clang-format off /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with /// corresponding signed 8-bit integers in \a __B, producing 4 intermediate /// signed 16-bit results. 
Sum these 4 results with the corresponding @@ -129,7 +136,7 @@ /// _mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B); /// \endcode /// -/// This intrinsic corresponds to the \c VPDPBSSD instruction. +/// This intrinsic corresponds to the \c VPDPBSSDS instruction. /// /// \param __A /// A 256-bit vector of [32 x char]. @@ -148,10 +155,12 @@ /// ENDFOR /// dst[MAX:256] := 0 /// \endcode +// clang-format on #define _mm256_dpbssds_epi32(__W, __A, __B) \ - ((__m256i)__builtin_ia32_vpdpbssds256((__v8si)(__W), (__v8si)(__A), \ - (__v8si)(__B))) + ((__m256i)__builtin_ia32_vpdpbssds256((__v8si)(__W), (__v32qi)(__A), \ + (__v32qi)(__B))) +// clang-format off /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate /// signed 16-bit results. Sum these 4 results with the corresponding @@ -163,7 +172,7 @@ /// _mm_dpbsud_epi32(__m128i __W, __m128i __A, __m128i __B); /// \endcode /// -/// This intrinsic corresponds to the \c VPDPBSSD instruction. +/// This intrinsic corresponds to the \c VPDPBSUD instruction. /// /// \param __A /// A 128-bit vector of [16 x char]. @@ -182,10 +191,12 @@ /// ENDFOR /// dst[MAX:128] := 0 /// \endcode +// clang-format on #define _mm_dpbsud_epi32(__W, __A, __B) \ - ((__m128i)__builtin_ia32_vpdpbsud128((__v4si)(__W), (__v4si)(__A), \ - (__v4si)(__B))) + ((__m128i)__builtin_ia32_vpdpbsud128((__v4si)(__W), (__v16qi)(__A), \ + (__v16qu)(__B))) +// clang-format off /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate /// signed 16-bit results. Sum these 4 results with the corresponding @@ -197,7 +208,7 @@ /// _mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B); /// \endcode /// -/// This intrinsic corresponds to the \c VPDPBSSD instruction. +/// This intrinsic corresponds to the \c VPDPBSUD instruction. /// /// \param __A /// A 256-bit vector of [32 x char]. @@ -216,10 +227,12 @@ /// ENDFOR /// dst[MAX:256] := 0 /// \endcode +// clang-format on #define _mm256_dpbsud_epi32(__W, __A, __B) \ - ((__m256i)__builtin_ia32_vpdpbsud256((__v8si)(__W), (__v8si)(__A), \ - (__v8si)(__B))) + ((__m256i)__builtin_ia32_vpdpbsud256((__v8si)(__W), (__v32qi)(__A), \ + (__v32qu)(__B))) +// clang-format off /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate /// signed 16-bit results. Sum these 4 results with the corresponding @@ -232,7 +245,7 @@ /// _mm_dpbsuds_epi32( __m128i __W, __m128i __A, __m128i __B); /// \endcode /// -/// This intrinsic corresponds to the \c VPDPBSSD instruction. +/// This intrinsic corresponds to the \c VPDPBSUDS instruction. /// /// \param __A /// A 128-bit vector of [16 x char]. @@ -251,10 +264,12 @@ /// ENDFOR /// dst[MAX:128] := 0 /// \endcode +// clang-format on #define _mm_dpbsuds_epi32(__W, __A, __B) \ - ((__m128i)__builtin_ia32_vpdpbsuds128((__v4si)(__W), (__v4si)(__A), \ - (__v4si)(__B))) + ((__m128i)__builtin_ia32_vpdpbsuds128((__v4si)(__W), (__v16qi)(__A), \ + (__v16qu)(__B))) +// clang-format off /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate /// signed 16-bit results. 
Sum these 4 results with the corresponding @@ -267,7 +282,7 @@ /// _mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B); /// \endcode /// -/// This intrinsic corresponds to the \c VPDPBSSD instruction. +/// This intrinsic corresponds to the \c VPDPBSUDS instruction. /// /// \param __A /// A 256-bit vector of [32 x char]. @@ -286,10 +301,12 @@ /// ENDFOR /// dst[MAX:256] := 0 /// \endcode +// clang-format on #define _mm256_dpbsuds_epi32(__W, __A, __B) \ - ((__m256i)__builtin_ia32_vpdpbsuds256((__v8si)(__W), (__v8si)(__A), \ - (__v8si)(__B))) + ((__m256i)__builtin_ia32_vpdpbsuds256((__v8si)(__W), (__v32qi)(__A), \ + (__v32qu)(__B))) +// clang-format off /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate /// signed 16-bit results. Sum these 4 results with the corresponding @@ -301,7 +318,7 @@ /// _mm_dpbuud_epi32(__m128i __W, __m128i __A, __m128i __B); /// \endcode /// -/// This intrinsic corresponds to the \c VPDPBSSD instruction. +/// This intrinsic corresponds to the \c VPDPBUUD instruction. /// /// \param __A /// A 128-bit vector of [16 x unsigned char]. @@ -320,10 +337,12 @@ /// ENDFOR /// dst[MAX:128] := 0 /// \endcode +// clang-format on #define _mm_dpbuud_epi32(__W, __A, __B) \ - ((__m128i)__builtin_ia32_vpdpbuud128((__v4si)(__W), (__v4si)(__A), \ - (__v4si)(__B))) + ((__m128i)__builtin_ia32_vpdpbuud128((__v4si)(__W), (__v16qu)(__A), \ + (__v16qu)(__B))) +// clang-format off /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate /// signed 16-bit results. Sum these 4 results with the corresponding @@ -335,7 +354,7 @@ /// _mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B); /// \endcode /// -/// This intrinsic corresponds to the \c VPDPBSSD instruction. +/// This intrinsic corresponds to the \c VPDPBUUD instruction. /// /// \param __A /// A 256-bit vector of [32 x unsigned char]. @@ -354,10 +373,12 @@ /// ENDFOR /// dst[MAX:256] := 0 /// \endcode +// clang-format on #define _mm256_dpbuud_epi32(__W, __A, __B) \ - ((__m256i)__builtin_ia32_vpdpbuud256((__v8si)(__W), (__v8si)(__A), \ - (__v8si)(__B))) + ((__m256i)__builtin_ia32_vpdpbuud256((__v8si)(__W), (__v32qu)(__A), \ + (__v32qu)(__B))) +// clang-format off /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate /// signed 16-bit results. Sum these 4 results with the corresponding @@ -389,10 +410,12 @@ /// ENDFOR /// dst[MAX:128] := 0 /// \endcode +// clang-format on #define _mm_dpbuuds_epi32(__W, __A, __B) \ - ((__m128i)__builtin_ia32_vpdpbuuds128((__v4si)(__W), (__v4si)(__A), \ - (__v4si)(__B))) + ((__m128i)__builtin_ia32_vpdpbuuds128((__v4si)(__W), (__v16qu)(__A), \ + (__v16qu)(__B))) +// clang-format off /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate /// signed 16-bit results. 
Sum these 4 results with the corresponding /// 32-bit integer in \a __W with signed saturation, and store the packed @@ -423,8 +446,9 @@ /// ENDFOR /// dst[MAX:256] := 0 /// \endcode +// clang-format on #define _mm256_dpbuuds_epi32(__W, __A, __B) \ - ((__m256i)__builtin_ia32_vpdpbuuds256((__v8si)(__W), (__v8si)(__A), \ - (__v8si)(__B))) + ((__m256i)__builtin_ia32_vpdpbuuds256((__v8si)(__W), (__v32qu)(__A), \ + (__v32qu)(__B))) #endif // __AVXVNNIINT8INTRIN_H diff --git a/clang/lib/Interpreter/IncrementalAction.cpp b/clang/lib/Interpreter/IncrementalAction.cpp index 4d1bc4c..3d489fc 100644 --- a/clang/lib/Interpreter/IncrementalAction.cpp +++ b/clang/lib/Interpreter/IncrementalAction.cpp @@ -106,7 +106,8 @@ std::unique_ptr<llvm::Module> IncrementalAction::GenModule() { // around we created an empty module to make CodeGen happy. We should make // sure it always stays empty. assert(((!CachedInCodeGenModule || - !CI.getPreprocessorOpts().Includes.empty()) || + !CI.getPreprocessorOpts().Includes.empty() || + !CI.getPreprocessorOpts().ImplicitPCHInclude.empty()) || (CachedInCodeGenModule->empty() && CachedInCodeGenModule->global_empty() && CachedInCodeGenModule->alias_empty() && diff --git a/clang/lib/Interpreter/IncrementalParser.cpp b/clang/lib/Interpreter/IncrementalParser.cpp index 32d1663..bf08911 100644 --- a/clang/lib/Interpreter/IncrementalParser.cpp +++ b/clang/lib/Interpreter/IncrementalParser.cpp @@ -37,6 +37,10 @@ IncrementalParser::IncrementalParser(CompilerInstance &Instance, llvm::ErrorAsOutParameter EAO(&Err); Consumer = &S.getASTConsumer(); P.reset(new Parser(S.getPreprocessor(), S, /*SkipBodies=*/false)); + + if (ExternalASTSource *External = S.getASTContext().getExternalSource()) + External->StartTranslationUnit(Consumer); + P->Initialize(); } diff --git a/clang/lib/Interpreter/Interpreter.cpp b/clang/lib/Interpreter/Interpreter.cpp index 9cc1c45..b05cb5a 100644 --- a/clang/lib/Interpreter/Interpreter.cpp +++ b/clang/lib/Interpreter/Interpreter.cpp @@ -278,9 +278,10 @@ Interpreter::Interpreter(std::unique_ptr<CompilerInstance> Instance, if (Act->getCodeGen()) { Act->CacheCodeGenModule(); - // The initial PTU is filled by `-include` or by CUDA includes - // automatically. - if (!CI->getPreprocessorOpts().Includes.empty()) { + // The initial PTU is filled by `-include`/`-include-pch` or by CUDA + // includes automatically. + if (!CI->getPreprocessorOpts().Includes.empty() || + !CI->getPreprocessorOpts().ImplicitPCHInclude.empty()) { // We can't really directly pass the CachedInCodeGenModule to the Jit // because it will steal it, causing dangling references as explained in // Interpreter::Execute diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index 9ef7a26..0069b08 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -18909,8 +18909,7 @@ ExprResult Sema::VerifyBitField(SourceLocation FieldLoc, // 'bool'. if (BitfieldIsOverwide && !FieldTy->isBooleanType() && FieldName) { Diag(FieldLoc, diag::warn_bitfield_width_exceeds_type_width) - << FieldName << toString(Value, 10) - << (unsigned)TypeWidth; + << FieldName << Value << (unsigned)TypeWidth; } } diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 3302bfc..06b2529 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -16791,12 +16791,11 @@ ExprResult Sema::BuildVAArgExpr(SourceLocation BuiltinLoc, Expr *OrigExpr = E; bool IsMS = false; - // CUDA device code does not support varargs. 
+ // CUDA device global function does not support varargs. if (getLangOpts().CUDA && getLangOpts().CUDAIsDevice) { if (const FunctionDecl *F = dyn_cast<FunctionDecl>(CurContext)) { CUDAFunctionTarget T = CUDA().IdentifyTarget(F); - if (T == CUDAFunctionTarget::Global || T == CUDAFunctionTarget::Device || - T == CUDAFunctionTarget::HostDevice) + if (T == CUDAFunctionTarget::Global) return ExprError(Diag(E->getBeginLoc(), diag::err_va_arg_in_device)); } } diff --git a/clang/lib/Sema/SemaTypeTraits.cpp b/clang/lib/Sema/SemaTypeTraits.cpp index 6c798d6..3e34675 100644 --- a/clang/lib/Sema/SemaTypeTraits.cpp +++ b/clang/lib/Sema/SemaTypeTraits.cpp @@ -1830,10 +1830,10 @@ static bool EvaluateBinaryTypeTrait(Sema &Self, TypeTrait BTT, return Self.HLSL().IsScalarizedLayoutCompatible(LhsT, RhsT); } - case BTT_LtSynthesisesFromSpaceship: - case BTT_LeSynthesisesFromSpaceship: - case BTT_GtSynthesisesFromSpaceship: - case BTT_GeSynthesisesFromSpaceship: { + case BTT_LtSynthesizesFromSpaceship: + case BTT_LeSynthesizesFromSpaceship: + case BTT_GtSynthesizesFromSpaceship: + case BTT_GeSynthesizesFromSpaceship: { EnterExpressionEvaluationContext UnevaluatedContext( Self, Sema::ExpressionEvaluationContext::Unevaluated); Sema::SFINAETrap SFINAE(Self, /*ForValidityCheck=*/true); @@ -1852,13 +1852,13 @@ static bool EvaluateBinaryTypeTrait(Sema &Self, TypeTrait BTT, auto OpKind = [&] { switch (BTT) { - case BTT_LtSynthesisesFromSpaceship: + case BTT_LtSynthesizesFromSpaceship: return BinaryOperatorKind::BO_LT; - case BTT_LeSynthesisesFromSpaceship: + case BTT_LeSynthesizesFromSpaceship: return BinaryOperatorKind::BO_LE; - case BTT_GtSynthesisesFromSpaceship: + case BTT_GtSynthesizesFromSpaceship: return BinaryOperatorKind::BO_GT; - case BTT_GeSynthesisesFromSpaceship: + case BTT_GeSynthesizesFromSpaceship: return BinaryOperatorKind::BO_GE; default: llvm_unreachable("Trying to Synthesize non-comparison operator?"); diff --git a/clang/lib/StaticAnalyzer/Checkers/AnalysisOrderChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/AnalysisOrderChecker.cpp index e64153d..309e3d2 100644 --- a/clang/lib/StaticAnalyzer/Checkers/AnalysisOrderChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/AnalysisOrderChecker.cpp @@ -129,7 +129,8 @@ public: llvm::errs() << " {argno: " << Call.getNumArgs() << '}'; llvm::errs() << " [" << Call.getKindAsString() << ']'; llvm::errs() << '\n'; - return true; + // We can't return `true` from this callback without binding the return + // value. Let's just fallthrough here and return `false`. } return false; } diff --git a/clang/lib/StaticAnalyzer/Checkers/CheckerDocumentation.cpp b/clang/lib/StaticAnalyzer/Checkers/CheckerDocumentation.cpp index 392c7ee..c716235 100644 --- a/clang/lib/StaticAnalyzer/Checkers/CheckerDocumentation.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/CheckerDocumentation.cpp @@ -262,6 +262,15 @@ public: /// state. This callback allows a checker to provide domain specific knowledge /// about the particular functions it knows about. /// + /// Note that to evaluate a call, the handler MUST bind the return value if + /// its a non-void function. Invalidate the arguments if necessary. + /// + /// Note that in general, user-provided functions should not be eval-called + /// because the checker can't predict the exact semantics/contract of the + /// callee, and by having the eval::Call callback, we also prevent it from + /// getting inlined, potentially regressing analysis quality. + /// Consider using check::PreCall or check::PostCall to allow inlining. 
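[Editorial aside, not part of the patch: a minimal checker sketch of the contract documented above; an eval::Call handler that claims a non-void call must bind a return value itself. The checker name and the modeled function `int get_count(void)` are hypothetical:

    #include "clang/StaticAnalyzer/Core/Checker.h"
    #include "clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h"
    #include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h"
    #include "clang/StaticAnalyzer/Core/PathSensitive/SValBuilder.h"

    using namespace clang;
    using namespace ento;

    // Models a hypothetical library function `int get_count(void)`.
    class ModelGetCountChecker : public Checker<eval::Call> {
    public:
      bool evalCall(const CallEvent &Call, CheckerContext &C) const {
        const IdentifierInfo *II = Call.getCalleeIdentifier();
        if (!II || II->getName() != "get_count")
          return false;

        const auto *CE = dyn_cast_or_null<CallExpr>(Call.getOriginExpr());
        if (!CE)
          return false;

        // Claiming the call means this checker must bind the return value;
        // leaving a non-void call unbound is a bug in the checker.
        SValBuilder &SVB = C.getSValBuilder();
        SVal Ret = SVB.makeIntVal(0, CE->getType());
        ProgramStateRef State =
            C.getState()->BindExpr(CE, C.getLocationContext(), Ret);
        C.addTransition(State);
        return true;
      }
    };

Registration and package plumbing are omitted; for user-provided functions the documentation above recommends check::PreCall/check::PostCall instead. End of aside.]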
+ /// /// \returns true if the call has been successfully evaluated /// and false otherwise. Note, that only one checker can evaluate a call. If /// more than one checker claims that they can evaluate the same call the diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngineCXX.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngineCXX.cpp index dee34e3..75d7e26 100644 --- a/clang/lib/StaticAnalyzer/Core/ExprEngineCXX.cpp +++ b/clang/lib/StaticAnalyzer/Core/ExprEngineCXX.cpp @@ -909,7 +909,14 @@ void ExprEngine::VisitCXXNewAllocatorCall(const CXXNewExpr *CNE, ExplodedNodeSet DstPostCall; StmtNodeBuilder CallBldr(DstPreCall, DstPostCall, *currBldrCtx); for (ExplodedNode *I : DstPreCall) { - // FIXME: Provide evalCall for checkers? + // Operator new calls (CXXNewExpr) are intentionally not eval-called, + // because it does not make sense to eval-call user-provided functions. + // 1) If the new operator can be inlined, then don't prevent it from + // inlining by having an eval-call of that operator. + // 2) If it can't be inlined, then the default conservative modeling + // is what we want anyway. + // So the best is to not allow eval-calling CXXNewExprs from checkers. + // Checkers can provide their pre/post-call callbacks if needed. defaultEvalCall(CallBldr, I, *Call); } // If the call is inlined, DstPostCall will be empty and we bail out now. @@ -1110,6 +1117,10 @@ void ExprEngine::VisitCXXDeleteExpr(const CXXDeleteExpr *CDE, if (AMgr.getAnalyzerOptions().MayInlineCXXAllocator) { StmtNodeBuilder Bldr(DstPreCall, DstPostCall, *currBldrCtx); for (ExplodedNode *I : DstPreCall) { + // Intentionally either inline or conservative eval-call the operator + // delete, but avoid triggering an eval-call event for checkers. + // As detailed at handling CXXNewExprs, in short, because it does not + // really make sense to eval-call user-provided functions. defaultEvalCall(Bldr, I, *Call); } } else { diff --git a/clang/lib/Tooling/InterpolatingCompilationDatabase.cpp b/clang/lib/Tooling/InterpolatingCompilationDatabase.cpp index 995019c..2856842 100644 --- a/clang/lib/Tooling/InterpolatingCompilationDatabase.cpp +++ b/clang/lib/Tooling/InterpolatingCompilationDatabase.cpp @@ -123,6 +123,15 @@ static types::ID foldType(types::ID Lang) { } } +// Return the language standard that's activated by the /std:c++latest +// flag in clang-CL mode. +static LangStandard::Kind latestLangStandard() { + // FIXME: Have a single source of truth for the mapping from + // c++latest --> c++26 that's shared by the driver code + // (clang/lib/Driver/ToolChains/Clang.cpp) and this file. + return LangStandard::lang_cxx26; +} + // A CompileCommand that can be applied to another file. struct TransferableCommand { // Flags that should not apply to all files are stripped from CommandLine. @@ -237,9 +246,16 @@ struct TransferableCommand { // --std flag may only be transferred if the language is the same. // We may consider "translating" these, e.g. c++11 -> c11. if (Std != LangStandard::lang_unspecified && foldType(TargetType) == Type) { - Result.CommandLine.emplace_back(( - llvm::Twine(ClangCLMode ? "/std:" : "-std=") + - LangStandard::getLangStandardForKind(Std).getName()).str()); + const char *Spelling = + LangStandard::getLangStandardForKind(Std).getName(); + // In clang-cl mode, the latest standard is spelled 'c++latest' rather + // than e.g. 'c++26', and the driver does not accept the latter, so emit + // the spelling that the driver does accept. 
+ if (ClangCLMode && Std == latestLangStandard()) { + Spelling = "c++latest"; + } + Result.CommandLine.emplace_back( + (llvm::Twine(ClangCLMode ? "/std:" : "-std=") + Spelling).str()); } Result.CommandLine.push_back("--"); Result.CommandLine.push_back(std::string(Filename)); @@ -296,8 +312,14 @@ private: // Try to interpret the argument as '-std='. std::optional<LangStandard::Kind> tryParseStdArg(const llvm::opt::Arg &Arg) { using namespace driver::options; - if (Arg.getOption().matches(ClangCLMode ? OPT__SLASH_std : OPT_std_EQ)) + if (Arg.getOption().matches(ClangCLMode ? OPT__SLASH_std : OPT_std_EQ)) { + // "c++latest" is not a recognized LangStandard, but it's accepted by + // the clang driver in CL mode. + if (ClangCLMode && StringRef(Arg.getValue()) == "c++latest") { + return latestLangStandard(); + } return LangStandard::getLangKind(Arg.getValue()); + } return std::nullopt; } }; diff --git a/clang/test/AST/ast-dump-ppc-types.c b/clang/test/AST/ast-dump-ppc-types.c index 1c860c2..6112af5 100644 --- a/clang/test/AST/ast-dump-ppc-types.c +++ b/clang/test/AST/ast-dump-ppc-types.c @@ -17,6 +17,8 @@ // are correctly defined. We also added checks on a couple of other targets to // ensure the types are target-dependent. +// CHECK: TypedefDecl {{.*}} implicit __dmr2048 '__dmr2048' +// CHECK: `-BuiltinType {{.*}} '__dmr2048' // CHECK: TypedefDecl {{.*}} implicit __dmr1024 '__dmr1024' // CHECK: `-BuiltinType {{.*}} '__dmr1024' // CHECK: TypedefDecl {{.*}} implicit __vector_quad '__vector_quad' diff --git a/clang/test/Analysis/cxxctr-evalcall-analysis-order.cpp b/clang/test/Analysis/cxxctr-evalcall-analysis-order.cpp index 0e1ec2f9..743c5ad 100644 --- a/clang/test/Analysis/cxxctr-evalcall-analysis-order.cpp +++ b/clang/test/Analysis/cxxctr-evalcall-analysis-order.cpp @@ -18,16 +18,33 @@ void foo() { C C0; C C1(42); C *C2 = new C{2, 3}; + delete C2; } // CHECK: PreCall (C::C) [CXXConstructorCall] // CHECK-NEXT: EvalCall (C::C) {argno: 0} [CXXConstructorCall] // CHECK-NEXT: PostCall (C::C) [CXXConstructorCall] + // CHECK-NEXT: PreCall (C::C) [CXXConstructorCall] // CHECK-NEXT: EvalCall (C::C) {argno: 1} [CXXConstructorCall] // CHECK-NEXT: PostCall (C::C) [CXXConstructorCall] + // CHECK-NEXT: PreCall (operator new) [CXXAllocatorCall] +// COMMENT: Operator new calls (CXXNewExpr) are intentionally not eval-called, +// COMMENT: because it does not make sense to eval call user-provided functions. +// COMMENT: 1) If the new operator can be inlined, then don't prevent it from +// COMMENT: inlining by having an eval-call of that operator. +// COMMENT: 2) If it can't be inlined, then the default conservative modeling +// COMMENT: is what we anyways want anyway. +// COMMENT: So the EvalCall event will not be triggered for operator new calls. +// CHECK-NOT: EvalCall // CHECK-NEXT: PostCall (operator new) [CXXAllocatorCall] + // CHECK-NEXT: PreCall (C::C) [CXXConstructorCall] // CHECK-NEXT: EvalCall (C::C) {argno: 2} [CXXConstructorCall] // CHECK-NEXT: PostCall (C::C) [CXXConstructorCall] + +// CHECK-NEXT: PreCall (operator delete) [CXXDeallocatorCall] +// COMMENT: Same reasoning as for CXXNewExprs above. 
+// CHECK-NOT: EvalCall +// CHECK-NEXT: PostCall (operator delete) [CXXDeallocatorCall] diff --git a/clang/test/Analysis/diagnostics/Inputs/expected-sarif/sarif-multi-file-diagnostics.c.sarif b/clang/test/Analysis/diagnostics/Inputs/expected-sarif/sarif-multi-file-diagnostics.c.sarif index 85e710f..501d27c 100644 --- a/clang/test/Analysis/diagnostics/Inputs/expected-sarif/sarif-multi-file-diagnostics.c.sarif +++ b/clang/test/Analysis/diagnostics/Inputs/expected-sarif/sarif-multi-file-diagnostics.c.sarif @@ -141,4 +141,4 @@ } ], "version": "[SARIF version]" -}
\ No newline at end of file +} diff --git a/clang/test/Analysis/lit.local.cfg b/clang/test/Analysis/lit.local.cfg index 3d60a16..03ab418 100644 --- a/clang/test/Analysis/lit.local.cfg +++ b/clang/test/Analysis/lit.local.cfg @@ -17,11 +17,13 @@ config.substitutions.append( ) ) +sed_cmd = "/opt/freeware/bin/sed" if "system-aix" in config.available_features else "sed" + # Filtering command for testing SARIF output against reference output. config.substitutions.append( ( "%normalize_sarif", - "sed -r '%s;%s;%s;%s'" + f"{sed_cmd} -r '%s;%s;%s;%s'" % ( # Replace version strings that are likely to change. r's/"version": ".* version .*"/"version": "[clang version]"/', diff --git a/clang/test/CIR/CodeGen/struct.cpp b/clang/test/CIR/CodeGen/struct.cpp index ee6c4cab..7537428 100644 --- a/clang/test/CIR/CodeGen/struct.cpp +++ b/clang/test/CIR/CodeGen/struct.cpp @@ -93,3 +93,39 @@ void f3() { // OGCG: %[[O:.*]] = alloca %struct.Outer, align 4 // OGCG: %[[O_I:.*]] = getelementptr inbounds nuw %struct.Outer, ptr %[[O]], i32 0, i32 0 // OGCG: %[[O_I_N:.*]] = getelementptr inbounds nuw %struct.Inner, ptr %[[O_I]], i32 0, i32 0 + +void paren_expr() { + struct Point { + int x; + int y; + }; + + Point a = (Point{}); + Point b = (a); +} + +// CIR: cir.func{{.*}} @_Z10paren_exprv() +// CIR: %[[A_ADDR:.*]] = cir.alloca !rec_Point, !cir.ptr<!rec_Point>, ["a", init] +// CIR: %[[B_ADDR:.*]] = cir.alloca !rec_Point, !cir.ptr<!rec_Point>, ["b", init] +// CIR: %[[X_ELEM_PTR:.*]] = cir.get_member %[[A_ADDR]][0] {name = "x"} : !cir.ptr<!rec_Point> -> !cir.ptr<!s32i> +// CIR: %[[CONST_0:.*]] = cir.const #cir.int<0> : !s32i +// CIR: cir.store{{.*}} %[[CONST_0]], %[[X_ELEM_PTR]] : !s32i, !cir.ptr<!s32i> +// CIR: %[[Y_ELEM_PTR:.*]] = cir.get_member %[[A_ADDR]][1] {name = "y"} : !cir.ptr<!rec_Point> -> !cir.ptr<!s32i> +// CIR: %[[CONST_0:.*]] = cir.const #cir.int<0> : !s32i +// CIR: cir.store{{.*}} %[[CONST_0]], %[[Y_ELEM_PTR]] : !s32i, !cir.ptr<!s32i> +// CIR: cir.call @_ZZ10paren_exprvEN5PointC1ERKS_(%[[B_ADDR]], %[[A_ADDR]]) nothrow : (!cir.ptr<!rec_Point>, !cir.ptr<!rec_Point>) -> () + +// LLVM: define{{.*}} void @_Z10paren_exprv() +// LLVM: %[[A_ADDR:.*]] = alloca %struct.Point, i64 1, align 4 +// LLVM: %[[B_ADDR:.*]] = alloca %struct.Point, i64 1, align 4 +// LLVM: %[[X_ELEM_PTR:.*]] = getelementptr %struct.Point, ptr %[[A_ADDR]], i32 0, i32 0 +// LLVM: store i32 0, ptr %[[X_ELEM_PTR]], align 4 +// LLVM: %[[Y_ELEM_PTR:.*]] = getelementptr %struct.Point, ptr %[[A_ADDR]], i32 0, i32 1 +// LLVM: store i32 0, ptr %[[Y_ELEM_PTR]], align 4 +// LLVM: call void @_ZZ10paren_exprvEN5PointC1ERKS_(ptr %[[B_ADDR]], ptr %[[A_ADDR]]) + +// OGCG: define{{.*}} void @_Z10paren_exprv() +// OGCG: %[[A_ADDR:.*]] = alloca %struct.Point, align 4 +// OGCG: %[[B_ADDR:.*]] = alloca %struct.Point, align 4 +// OGCG: call void @llvm.memset.p0.i64(ptr align 4 %[[A_ADDR]], i8 0, i64 8, i1 false) +// OGCG: call void @llvm.memcpy.p0.p0.i64(ptr align 4 %[[B_ADDR]], ptr align 4 %[[A_ADDR]], i64 8, i1 false) diff --git a/clang/test/CIR/CodeGenOpenACC/combined-private-clause.cpp b/clang/test/CIR/CodeGenOpenACC/combined-private-clause.cpp index f3ec9e1..63932027 100644 --- a/clang/test/CIR/CodeGenOpenACC/combined-private-clause.cpp +++ b/clang/test/CIR/CodeGenOpenACC/combined-private-clause.cpp @@ -65,6 +65,7 @@ struct HasDtor { // int[5] with 1 'bound' // CHECK-NEXT: acc.private.recipe @privatization__Bcnt1__ZTSA5_i : !cir.ptr<!cir.array<!s32i x 5>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>> {{.*}}, %[[BOUND1:.*]]: 
!acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.private.init"] // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -72,6 +73,7 @@ struct HasDtor { // float[5] with 1 'bound' // CHECK-NEXT: acc.private.recipe @privatization__Bcnt1__ZTSA5_f : !cir.ptr<!cir.array<!cir.float x 5>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>, ["openacc.private.init"] // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -79,6 +81,7 @@ struct HasDtor { // NoCopyConstruct[5] with 1 'bound' // CHECK-NEXT: acc.private.recipe @privatization__Bcnt1__ZTSA5_15NoCopyConstruct : !cir.ptr<!cir.array<!rec_NoCopyConstruct x 5>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_NoCopyConstruct x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_NoCopyConstruct x 5>, !cir.ptr<!cir.array<!rec_NoCopyConstruct x 5>>, ["openacc.private.init"] // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -86,6 +89,7 @@ struct HasDtor { // CopyConstruct[5] with 1 'bound' // CHECK-NEXT: acc.private.recipe @privatization__Bcnt1__ZTSA5_13CopyConstruct : !cir.ptr<!cir.array<!rec_CopyConstruct x 5>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_CopyConstruct x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_CopyConstruct x 5>, !cir.ptr<!cir.array<!rec_CopyConstruct x 5>>, ["openacc.private.init"] // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -93,6 +97,7 @@ struct HasDtor { // NonDefaultCtor[5] with 1 'bound' // CHECK-NEXT: acc.private.recipe @privatization__Bcnt1__ZTSA5_14NonDefaultCtor : !cir.ptr<!cir.array<!rec_NonDefaultCtor x 5>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_NonDefaultCtor x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_NonDefaultCtor x 5>, !cir.ptr<!cir.array<!rec_NonDefaultCtor x 5>>, ["openacc.private.init"] // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -100,6 +105,7 @@ struct HasDtor { // HasDtor[5] with 1 'bound' // CHECK-NEXT: acc.private.recipe @privatization__Bcnt1__ZTSA5_7HasDtor : !cir.ptr<!cir.array<!rec_HasDtor x 5>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_HasDtor x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_HasDtor x 5>, !cir.ptr<!cir.array<!rec_HasDtor x 5>>, ["openacc.private.init"] // TODO: Add Init here. 
// CHECK-NEXT: acc.yield // CHECK-NEXT: } destroy { diff --git a/clang/test/CIR/CodeGenOpenACC/compute-private-clause.c b/clang/test/CIR/CodeGenOpenACC/compute-private-clause.c index 5235aee..097005e 100644 --- a/clang/test/CIR/CodeGenOpenACC/compute-private-clause.c +++ b/clang/test/CIR/CodeGenOpenACC/compute-private-clause.c @@ -1,4 +1,4 @@ -// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s struct NoCopyConstruct {}; @@ -26,6 +26,7 @@ struct NoCopyConstruct {}; // int[5] with 1 'bound' // CHECK-NEXT: acc.private.recipe @privatization__Bcnt1__ZTSA5_i : !cir.ptr<!cir.array<!s32i x 5>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.private.init"] // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -33,6 +34,7 @@ struct NoCopyConstruct {}; // float[5] with 1 'bound' // CHECK-NEXT: acc.private.recipe @privatization__Bcnt1__ZTSA5_f : !cir.ptr<!cir.array<!cir.float x 5>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>, ["openacc.private.init"] // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -40,6 +42,7 @@ struct NoCopyConstruct {}; // NoCopyConstruct[5] with 1 'bound' // CHECK-NEXT: acc.private.recipe @privatization__Bcnt1__ZTSA5_15NoCopyConstruct : !cir.ptr<!cir.array<!rec_NoCopyConstruct x 5>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_NoCopyConstruct x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_NoCopyConstruct x 5>, !cir.ptr<!cir.array<!rec_NoCopyConstruct x 5>>, ["openacc.private.init"] // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } diff --git a/clang/test/CIR/CodeGenOpenACC/compute-private-clause.cpp b/clang/test/CIR/CodeGenOpenACC/compute-private-clause.cpp index 12e14fa..97399d9 100644 --- a/clang/test/CIR/CodeGenOpenACC/compute-private-clause.cpp +++ b/clang/test/CIR/CodeGenOpenACC/compute-private-clause.cpp @@ -58,36 +58,42 @@ struct HasDtor { // // CHECK-NEXT: acc.private.recipe @privatization__Bcnt1__ZTSA5_i : !cir.ptr<!cir.array<!s32i x 5>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.private.init"] // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } // // CHECK-NEXT: acc.private.recipe @privatization__Bcnt1__ZTSA5_f : !cir.ptr<!cir.array<!cir.float x 5>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>, ["openacc.private.init"] // TODO: Add Init here. 
// CHECK-NEXT: acc.yield // CHECK-NEXT: } // // CHECK-NEXT: acc.private.recipe @privatization__Bcnt1__ZTSA5_15NoCopyConstruct : !cir.ptr<!cir.array<!rec_NoCopyConstruct x 5>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_NoCopyConstruct x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_NoCopyConstruct x 5>, !cir.ptr<!cir.array<!rec_NoCopyConstruct x 5>>, ["openacc.private.init"] // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } // // CHECK-NEXT: acc.private.recipe @privatization__Bcnt1__ZTSA5_13CopyConstruct : !cir.ptr<!cir.array<!rec_CopyConstruct x 5>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_CopyConstruct x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_CopyConstruct x 5>, !cir.ptr<!cir.array<!rec_CopyConstruct x 5>>, ["openacc.private.init"] // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } // // CHECK-NEXT: acc.private.recipe @privatization__Bcnt1__ZTSA5_14NonDefaultCtor : !cir.ptr<!cir.array<!rec_NonDefaultCtor x 5>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_NonDefaultCtor x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_NonDefaultCtor x 5>, !cir.ptr<!cir.array<!rec_NonDefaultCtor x 5>>, ["openacc.private.init"] // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } // // CHECK: acc.private.recipe @privatization__Bcnt1__ZTSA5_7HasDtor : !cir.ptr<!cir.array<!rec_HasDtor x 5>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_HasDtor x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_HasDtor x 5>, !cir.ptr<!cir.array<!rec_HasDtor x 5>>, ["openacc.private.init"] // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } destroy { diff --git a/clang/test/CIR/CodeGenOpenACC/loop-private-clause.cpp b/clang/test/CIR/CodeGenOpenACC/loop-private-clause.cpp index 0a0552e..d4fd4cc 100644 --- a/clang/test/CIR/CodeGenOpenACC/loop-private-clause.cpp +++ b/clang/test/CIR/CodeGenOpenACC/loop-private-clause.cpp @@ -65,6 +65,7 @@ struct HasDtor { // int[5] with 1 'bound' // CHECK-NEXT: acc.private.recipe @privatization__Bcnt1__ZTSA5_i : !cir.ptr<!cir.array<!s32i x 5>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.private.init"] // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -72,6 +73,7 @@ struct HasDtor { // float[5] with 1 'bound' // CHECK-NEXT: acc.private.recipe @privatization__Bcnt1__ZTSA5_f : !cir.ptr<!cir.array<!cir.float x 5>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>, ["openacc.private.init"] // TODO: Add Init here. 
// CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -79,6 +81,7 @@ struct HasDtor { // NoCopyConstruct[5] with 1 'bound' // CHECK-NEXT: acc.private.recipe @privatization__Bcnt1__ZTSA5_15NoCopyConstruct : !cir.ptr<!cir.array<!rec_NoCopyConstruct x 5>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_NoCopyConstruct x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_NoCopyConstruct x 5>, !cir.ptr<!cir.array<!rec_NoCopyConstruct x 5>>, ["openacc.private.init"] // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -86,6 +89,7 @@ struct HasDtor { // CopyConstruct[5] with 1 'bound' // CHECK-NEXT: acc.private.recipe @privatization__Bcnt1__ZTSA5_13CopyConstruct : !cir.ptr<!cir.array<!rec_CopyConstruct x 5>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_CopyConstruct x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_CopyConstruct x 5>, !cir.ptr<!cir.array<!rec_CopyConstruct x 5>>, ["openacc.private.init"] // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -93,6 +97,7 @@ struct HasDtor { // NonDefaultCtor[5] with 1 'bound' // CHECK-NEXT: acc.private.recipe @privatization__Bcnt1__ZTSA5_14NonDefaultCtor : !cir.ptr<!cir.array<!rec_NonDefaultCtor x 5>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_NonDefaultCtor x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_NonDefaultCtor x 5>, !cir.ptr<!cir.array<!rec_NonDefaultCtor x 5>>, ["openacc.private.init"] // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -100,6 +105,7 @@ struct HasDtor { // HasDtor[5] with 1 'bound' // CHECK-NEXT: acc.private.recipe @privatization__Bcnt1__ZTSA5_7HasDtor : !cir.ptr<!cir.array<!rec_HasDtor x 5>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_HasDtor x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_HasDtor x 5>, !cir.ptr<!cir.array<!rec_HasDtor x 5>>, ["openacc.private.init"] // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } destroy { diff --git a/clang/test/CIR/CodeGenOpenACC/private-clause-array-recipes-CtorDtor.cpp b/clang/test/CIR/CodeGenOpenACC/private-clause-array-recipes-CtorDtor.cpp index 561bf70..c62ebe2 100644 --- a/clang/test/CIR/CodeGenOpenACC/private-clause-array-recipes-CtorDtor.cpp +++ b/clang/test/CIR/CodeGenOpenACC/private-clause-array-recipes-CtorDtor.cpp @@ -13,6 +13,7 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(OneArr[A:B]) // CHECK: acc.private.recipe @privatization__Bcnt1__ZTSA5_8CtorDtor : !cir.ptr<!cir.array<!rec_CtorDtor x 5>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_CtorDtor x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.array<!rec_CtorDtor x 5>, !cir.ptr<!cir.array<!rec_CtorDtor x 5>>, ["openacc.private.init"] {alignment = 4 : i64} // TODO: Add Init here. 
// CHECK-NEXT: acc.yield // CHECK-NEXT: } destroy { @@ -98,6 +99,7 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(TwoArr[B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt2__ZTSA5_A5_8CtorDtor : !cir.ptr<!cir.array<!cir.array<!rec_CtorDtor x 5> x 5>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!cir.array<!rec_CtorDtor x 5> x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.array<!cir.array<!rec_CtorDtor x 5> x 5>, !cir.ptr<!cir.array<!cir.array<!rec_CtorDtor x 5> x 5>>, ["openacc.private.init"] {alignment = 4 : i64} // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT:} destroy { @@ -214,6 +216,7 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(ThreeArr[B][B][B]) // CHECK-NEXT:acc.private.recipe @privatization__Bcnt3__ZTSA5_A5_A5_8CtorDtor : !cir.ptr<!cir.array<!cir.array<!cir.array<!rec_CtorDtor x 5> x 5> x 5>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!cir.array<!cir.array<!rec_CtorDtor x 5> x 5> x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND3:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.array<!cir.array<!cir.array<!rec_CtorDtor x 5> x 5> x 5>, !cir.ptr<!cir.array<!cir.array<!cir.array<!rec_CtorDtor x 5> x 5> x 5>>, ["openacc.private.init"] {alignment = 4 : i64} // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT:} destroy { @@ -306,6 +309,7 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(ThreeArr[B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt2__ZTSA5_A5_A5_8CtorDtor : !cir.ptr<!cir.array<!cir.array<!cir.array<!rec_CtorDtor x 5> x 5> x 5>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!cir.array<!cir.array<!rec_CtorDtor x 5> x 5> x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.array<!cir.array<!cir.array<!rec_CtorDtor x 5> x 5> x 5>, !cir.ptr<!cir.array<!cir.array<!cir.array<!rec_CtorDtor x 5> x 5> x 5>>, ["openacc.private.init"] {alignment = 4 : i64} // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT:} destroy { diff --git a/clang/test/CIR/CodeGenOpenACC/private-clause-array-recipes-NoOps.cpp b/clang/test/CIR/CodeGenOpenACC/private-clause-array-recipes-NoOps.cpp index ad33ffd..38df813 100644 --- a/clang/test/CIR/CodeGenOpenACC/private-clause-array-recipes-NoOps.cpp +++ b/clang/test/CIR/CodeGenOpenACC/private-clause-array-recipes-NoOps.cpp @@ -8,6 +8,7 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(OneArr[A:B]) // CHECK: acc.private.recipe @privatization__Bcnt1__ZTSA5_5NoOps : !cir.ptr<!cir.array<!rec_NoOps x 5>> init { // CHECK-NEXT: ^bb0(%arg0: !cir.ptr<!cir.array<!rec_NoOps x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.array<!rec_NoOps x 5>, !cir.ptr<!cir.array<!rec_NoOps x 5>>, ["openacc.private.init"] {alignment = 4 : i64} // TODO: Add Init here. 
// CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -43,6 +44,7 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(TwoArr[B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt2__ZTSA5_A5_5NoOps : !cir.ptr<!cir.array<!cir.array<!rec_NoOps x 5> x 5>> init { // CHECK-NEXT: ^bb0(%arg0: !cir.ptr<!cir.array<!cir.array<!rec_NoOps x 5> x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.array<!cir.array<!rec_NoOps x 5> x 5>, !cir.ptr<!cir.array<!cir.array<!rec_NoOps x 5> x 5>>, ["openacc.private.init"] {alignment = 4 : i64} // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT:} @@ -81,6 +83,7 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(ThreeArr[B][B][B]) // CHECK-NEXT:acc.private.recipe @privatization__Bcnt3__ZTSA5_A5_A5_5NoOps : !cir.ptr<!cir.array<!cir.array<!cir.array<!rec_NoOps x 5> x 5> x 5>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!cir.array<!cir.array<!rec_NoOps x 5> x 5> x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND3:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.array<!cir.array<!cir.array<!rec_NoOps x 5> x 5> x 5>, !cir.ptr<!cir.array<!cir.array<!cir.array<!rec_NoOps x 5> x 5> x 5>>, ["openacc.private.init"] {alignment = 4 : i64} // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT:} @@ -94,6 +97,7 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(ThreeArr[B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt2__ZTSA5_A5_A5_5NoOps : !cir.ptr<!cir.array<!cir.array<!cir.array<!rec_NoOps x 5> x 5> x 5>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!cir.array<!cir.array<!rec_NoOps x 5> x 5> x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.array<!cir.array<!cir.array<!rec_NoOps x 5> x 5> x 5>, !cir.ptr<!cir.array<!cir.array<!cir.array<!rec_NoOps x 5> x 5> x 5>>, ["openacc.private.init"] {alignment = 4 : i64} // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT:} diff --git a/clang/test/CIR/CodeGenOpenACC/private-clause-array-recipes-int.cpp b/clang/test/CIR/CodeGenOpenACC/private-clause-array-recipes-int.cpp index b3eafc0..3d4aaa0 100644 --- a/clang/test/CIR/CodeGenOpenACC/private-clause-array-recipes-int.cpp +++ b/clang/test/CIR/CodeGenOpenACC/private-clause-array-recipes-int.cpp @@ -1,4 +1,4 @@ -// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s template<typename T> void do_things(unsigned A, unsigned B) { @@ -6,6 +6,7 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(OneArr[A:B]) // CHECK: acc.private.recipe @privatization__Bcnt1__ZTSA5_i : !cir.ptr<!cir.array<!s32i x 5>> init { // CHECK-NEXT: ^bb0(%arg0: !cir.ptr<!cir.array<!s32i x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.private.init"] {alignment = 4 : i64} // TODO: Add Init here. 
// CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -24,6 +25,7 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(TwoArr[B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt2__ZTSA5_A5_i : !cir.ptr<!cir.array<!cir.array<!s32i x 5> x 5>> init { // CHECK-NEXT: ^bb0(%arg0: !cir.ptr<!cir.array<!cir.array<!s32i x 5> x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.array<!cir.array<!s32i x 5> x 5>, !cir.ptr<!cir.array<!cir.array<!s32i x 5> x 5>>, ["openacc.private.init"] {alignment = 4 : i64} // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT:} @@ -44,6 +46,7 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(ThreeArr[B][B][B]) // CHECK-NEXT:acc.private.recipe @privatization__Bcnt3__ZTSA5_A5_A5_i : !cir.ptr<!cir.array<!cir.array<!cir.array<!s32i x 5> x 5> x 5>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!cir.array<!cir.array<!s32i x 5> x 5> x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND3:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.array<!cir.array<!cir.array<!s32i x 5> x 5> x 5>, !cir.ptr<!cir.array<!cir.array<!cir.array<!s32i x 5> x 5> x 5>>, ["openacc.private.init"] {alignment = 4 : i64} // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT:} @@ -57,6 +60,7 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(ThreeArr[B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt2__ZTSA5_A5_A5_i : !cir.ptr<!cir.array<!cir.array<!cir.array<!s32i x 5> x 5> x 5>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!cir.array<!cir.array<!s32i x 5> x 5> x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.array<!cir.array<!cir.array<!s32i x 5> x 5> x 5>, !cir.ptr<!cir.array<!cir.array<!cir.array<!s32i x 5> x 5> x 5>>, ["openacc.private.init"] {alignment = 4 : i64} // TODO: Add Init here. 
// CHECK-NEXT: acc.yield // CHECK-NEXT:} diff --git a/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-array-recipes-CtorDtor.cpp b/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-array-recipes-CtorDtor.cpp index be999395..52bcd7c 100644 --- a/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-array-recipes-CtorDtor.cpp +++ b/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-array-recipes-CtorDtor.cpp @@ -13,6 +13,13 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(OnePtr[A:B]) // CHECK: acc.private.recipe @privatization__Bcnt1__ZTSP8CtorDtor : !cir.ptr<!cir.ptr<!rec_CtorDtor>> init { // CHECK-NEXT: ^bb0(%arg0: !cir.ptr<!cir.ptr<!rec_CtorDtor>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.ptr<!rec_CtorDtor>, !cir.ptr<!cir.ptr<!rec_CtorDtor>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<4> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !rec_CtorDtor, !cir.ptr<!rec_CtorDtor>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } destroy { @@ -62,6 +69,20 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(TwoPtr[B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt2__ZTSPP8CtorDtor : !cir.ptr<!cir.ptr<!cir.ptr<!rec_CtorDtor>>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.ptr<!cir.ptr<!rec_CtorDtor>>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.ptr<!cir.ptr<!rec_CtorDtor>>, !cir.ptr<!cir.ptr<!cir.ptr<!rec_CtorDtor>>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.ptr<!rec_CtorDtor>, !cir.ptr<!cir.ptr<!rec_CtorDtor>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[UB2_CAST]]) : !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<4> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA2:.*]] = cir.alloca !rec_CtorDtor, !cir.ptr<!rec_CtorDtor>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} // TODO: Add Init here. 
// CHECK-NEXT: acc.yield // CHECK-NEXT: } destroy { @@ -136,6 +157,27 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(ThreePtr[B][B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt3__ZTSPPP8CtorDtor : !cir.ptr<!cir.ptr<!cir.ptr<!cir.ptr<!rec_CtorDtor>>>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.ptr<!cir.ptr<!cir.ptr<!rec_CtorDtor>>>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND3:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.ptr<!cir.ptr<!cir.ptr<!rec_CtorDtor>>>, !cir.ptr<!cir.ptr<!cir.ptr<!cir.ptr<!rec_CtorDtor>>>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB3:.*]] = acc.get_upperbound %[[BOUND3]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB3_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB3]] : index to !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB3_CAST]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.ptr<!cir.ptr<!rec_CtorDtor>>, !cir.ptr<!cir.ptr<!cir.ptr<!rec_CtorDtor>>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[UB3_CAST]]) : !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA2:.*]] = cir.alloca !cir.ptr<!rec_CtorDtor>, !cir.ptr<!cir.ptr<!rec_CtorDtor>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS2:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[NUM_ELTS]]) : !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<4> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS2]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA3:.*]] = cir.alloca !rec_CtorDtor, !cir.ptr<!rec_CtorDtor>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} // TODO: Add Init here. 
// CHECK-NEXT: acc.yield // CHECK-NEXT: } destroy { @@ -229,6 +271,20 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(ThreePtr[B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt2__ZTSPPP8CtorDtor : !cir.ptr<!cir.ptr<!cir.ptr<!cir.ptr<!rec_CtorDtor>>>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.ptr<!cir.ptr<!cir.ptr<!rec_CtorDtor>>>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.ptr<!cir.ptr<!cir.ptr<!rec_CtorDtor>>>, !cir.ptr<!cir.ptr<!cir.ptr<!cir.ptr<!rec_CtorDtor>>>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.ptr<!cir.ptr<!rec_CtorDtor>>, !cir.ptr<!cir.ptr<!cir.ptr<!rec_CtorDtor>>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[UB2_CAST]]) : !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA2:.*]] = cir.alloca !cir.ptr<!rec_CtorDtor>, !cir.ptr<!cir.ptr<!rec_CtorDtor>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -250,6 +306,17 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(ArrayOfPtr[B][A:B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt2__ZTSA5_P8CtorDtor : !cir.ptr<!cir.array<!cir.ptr<!rec_CtorDtor> x 5>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!cir.ptr<!rec_CtorDtor> x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.array<!cir.ptr<!rec_CtorDtor> x 5>, !cir.ptr<!cir.array<!cir.ptr<!rec_CtorDtor> x 5>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i +// +// CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[UB2_CAST]]) : !u64i +// CHECK-NEXT: %[[ELT_SIZE:.*]] = cir.const #cir.int<4> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[ELT_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !rec_CtorDtor, !cir.ptr<!rec_CtorDtor>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} // TODO: Add Init here. 
// CHECK-NEXT: acc.yield // CHECK-NEXT: } destroy { @@ -325,6 +392,13 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(PtrToArrays[B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt2__ZTSPA5_8CtorDtor : !cir.ptr<!cir.ptr<!cir.array<!rec_CtorDtor x 5>>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.ptr<!cir.array<!rec_CtorDtor x 5>>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.ptr<!cir.array<!rec_CtorDtor x 5>>, !cir.ptr<!cir.ptr<!cir.array<!rec_CtorDtor x 5>>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<20> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.array<!rec_CtorDtor x 5>, !cir.ptr<!cir.array<!rec_CtorDtor x 5>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } destroy { @@ -399,6 +473,24 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(ArrayOfPtrPtr[B][B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt3__ZTSA5_PP8CtorDtor : !cir.ptr<!cir.array<!cir.ptr<!cir.ptr<!rec_CtorDtor>> x 5>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!cir.ptr<!cir.ptr<!rec_CtorDtor>> x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND3:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.array<!cir.ptr<!cir.ptr<!rec_CtorDtor>> x 5>, !cir.ptr<!cir.array<!cir.ptr<!cir.ptr<!rec_CtorDtor>> x 5>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB3:.*]] = acc.get_upperbound %[[BOUND3]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB3_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB3]] : index to !u64i +// +// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[UB3_CAST]]) : !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.ptr<!rec_CtorDtor>, !cir.ptr<!cir.ptr<!rec_CtorDtor>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS2:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[NUM_ELTS]]) : !u64i +// CHECK-NEXT: %[[ELT_SIZE:.*]] = cir.const #cir.int<4> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS2]], %[[ELT_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA2:.*]] = cir.alloca !rec_CtorDtor, !cir.ptr<!rec_CtorDtor>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} // TODO: Add Init here. 
// CHECK-NEXT: acc.yield // CHECK-NEXT: } destroy { @@ -492,6 +584,17 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(ArrayOfPtrPtr[B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt2__ZTSA5_PP8CtorDtor : !cir.ptr<!cir.array<!cir.ptr<!cir.ptr<!rec_CtorDtor>> x 5>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!cir.ptr<!cir.ptr<!rec_CtorDtor>> x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.array<!cir.ptr<!cir.ptr<!rec_CtorDtor>> x 5>, !cir.ptr<!cir.array<!cir.ptr<!cir.ptr<!rec_CtorDtor>> x 5>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i +// +// CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[UB2_CAST]]) : !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.ptr<!rec_CtorDtor>, !cir.ptr<!cir.ptr<!rec_CtorDtor>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -512,6 +615,20 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(PtrPtrToArray[B][B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt3__ZTSPPA5_8CtorDtor : !cir.ptr<!cir.ptr<!cir.ptr<!cir.array<!rec_CtorDtor x 5>>>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.ptr<!cir.ptr<!cir.array<!rec_CtorDtor x 5>>>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND3:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.ptr<!cir.ptr<!cir.array<!rec_CtorDtor x 5>>>, !cir.ptr<!cir.ptr<!cir.ptr<!cir.array<!rec_CtorDtor x 5>>>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB3:.*]] = acc.get_upperbound %[[BOUND3]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB3_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB3]] : index to !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB3_CAST]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.ptr<!cir.array<!rec_CtorDtor x 5>>, !cir.ptr<!cir.ptr<!cir.array<!rec_CtorDtor x 5>>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[UB3_CAST]]) : !u64i +// CHECK-NEXT: %[[ELT_SIZE:.*]] = cir.const #cir.int<20> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[ELT_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA2:.*]] = cir.alloca !cir.array<!rec_CtorDtor x 5>, !cir.ptr<!cir.array<!rec_CtorDtor x 5>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} +// // TODO: Add Init here. 
// CHECK-NEXT: acc.yield // CHECK-NEXT: } destroy { @@ -605,6 +722,19 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(PtrPtrToArray[B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt2__ZTSPPA5_8CtorDtor : !cir.ptr<!cir.ptr<!cir.ptr<!cir.array<!rec_CtorDtor x 5>>>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.ptr<!cir.ptr<!cir.array<!rec_CtorDtor x 5>>>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.ptr<!cir.ptr<!cir.array<!rec_CtorDtor x 5>>>, !cir.ptr<!cir.ptr<!cir.ptr<!cir.array<!rec_CtorDtor x 5>>>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.ptr<!cir.array<!rec_CtorDtor x 5>>, !cir.ptr<!cir.ptr<!cir.array<!rec_CtorDtor x 5>>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[UB2_CAST]]) : !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<20> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA2:.*]] = cir.alloca !cir.array<!rec_CtorDtor x 5>, !cir.ptr<!cir.array<!rec_CtorDtor x 5>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} // TODO: Add Init here. 
// CHECK-NEXT: acc.yield // CHECK-NEXT: } destroy { @@ -700,6 +830,24 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(PtrArrayPtr[B][B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt3__ZTSPA5_P8CtorDtor : !cir.ptr<!cir.ptr<!cir.array<!cir.ptr<!rec_CtorDtor> x 5>>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.ptr<!cir.array<!cir.ptr<!rec_CtorDtor> x 5>>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND3:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.ptr<!cir.array<!cir.ptr<!rec_CtorDtor> x 5>>, !cir.ptr<!cir.ptr<!cir.array<!cir.ptr<!rec_CtorDtor> x 5>>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB3:.*]] = acc.get_upperbound %[[BOUND3]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB3_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB3]] : index to !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<40> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB3_CAST]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.array<!cir.ptr<!rec_CtorDtor> x 5>, !cir.ptr<!cir.array<!cir.ptr<!rec_CtorDtor> x 5>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[UB3_CAST]]) : !u64i +// +// CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS2:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[NUM_ELTS]]) : !u64i +// CHECK-NEXT: %[[ELT_SIZE:.*]] = cir.const #cir.int<4> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS2]], %[[ELT_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA2:.*]] = cir.alloca !rec_CtorDtor, !cir.ptr<!rec_CtorDtor>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } destroy { @@ -793,6 +941,13 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(PtrArrayPtr[B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt2__ZTSPA5_P8CtorDtor : !cir.ptr<!cir.ptr<!cir.array<!cir.ptr<!rec_CtorDtor> x 5>>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.ptr<!cir.array<!cir.ptr<!rec_CtorDtor> x 5>>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.ptr<!cir.array<!cir.ptr<!rec_CtorDtor> x 5>>, !cir.ptr<!cir.ptr<!cir.array<!cir.ptr<!rec_CtorDtor> x 5>>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<40> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.array<!cir.ptr<!rec_CtorDtor> x 5>, !cir.ptr<!cir.array<!cir.ptr<!rec_CtorDtor> x 5>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} // TODO: Add Init here. 
// CHECK-NEXT: acc.yield // CHECK-NEXT: } diff --git a/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-array-recipes-NoOps.cpp b/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-array-recipes-NoOps.cpp index fa00e6a..4398216 100644 --- a/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-array-recipes-NoOps.cpp +++ b/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-array-recipes-NoOps.cpp @@ -8,6 +8,13 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(OnePtr[A:B]) // CHECK: acc.private.recipe @privatization__Bcnt1__ZTSP5NoOps : !cir.ptr<!cir.ptr<!rec_NoOps>> init { // CHECK-NEXT: ^bb0(%arg0: !cir.ptr<!cir.ptr<!rec_NoOps>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.ptr<!rec_NoOps>, !cir.ptr<!cir.ptr<!rec_NoOps>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<4> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !rec_NoOps, !cir.ptr<!rec_NoOps>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -26,6 +33,20 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(TwoPtr[B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt2__ZTSPP5NoOps : !cir.ptr<!cir.ptr<!cir.ptr<!rec_NoOps>>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.ptr<!cir.ptr<!rec_NoOps>>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.ptr<!cir.ptr<!rec_NoOps>>, !cir.ptr<!cir.ptr<!cir.ptr<!rec_NoOps>>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.ptr<!rec_NoOps>, !cir.ptr<!cir.ptr<!rec_NoOps>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[UB2_CAST]]) : !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<4> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA2:.*]] = cir.alloca !rec_NoOps, !cir.ptr<!rec_NoOps>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} // TODO: Add Init here. 
// CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -46,6 +67,27 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(ThreePtr[B][B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt3__ZTSPPP5NoOps : !cir.ptr<!cir.ptr<!cir.ptr<!cir.ptr<!rec_NoOps>>>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.ptr<!cir.ptr<!cir.ptr<!rec_NoOps>>>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND3:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.ptr<!cir.ptr<!cir.ptr<!rec_NoOps>>>, !cir.ptr<!cir.ptr<!cir.ptr<!cir.ptr<!rec_NoOps>>>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB3:.*]] = acc.get_upperbound %[[BOUND3]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB3_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB3]] : index to !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB3_CAST]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.ptr<!cir.ptr<!rec_NoOps>>, !cir.ptr<!cir.ptr<!cir.ptr<!rec_NoOps>>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[UB3_CAST]]) : !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA2:.*]] = cir.alloca !cir.ptr<!rec_NoOps>, !cir.ptr<!cir.ptr<!rec_NoOps>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS2:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[NUM_ELTS]]) : !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<4> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS2]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA3:.*]] = cir.alloca !rec_NoOps, !cir.ptr<!rec_NoOps>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} // TODO: Add Init here. 
// CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -59,6 +101,20 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(ThreePtr[B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt2__ZTSPPP5NoOps : !cir.ptr<!cir.ptr<!cir.ptr<!cir.ptr<!rec_NoOps>>>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.ptr<!cir.ptr<!cir.ptr<!rec_NoOps>>>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.ptr<!cir.ptr<!cir.ptr<!rec_NoOps>>>, !cir.ptr<!cir.ptr<!cir.ptr<!cir.ptr<!rec_NoOps>>>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.ptr<!cir.ptr<!rec_NoOps>>, !cir.ptr<!cir.ptr<!cir.ptr<!rec_NoOps>>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[UB2_CAST]]) : !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA2:.*]] = cir.alloca !cir.ptr<!rec_NoOps>, !cir.ptr<!cir.ptr<!rec_NoOps>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -80,6 +136,17 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(ArrayOfPtr[B][A:B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt2__ZTSA5_P5NoOps : !cir.ptr<!cir.array<!cir.ptr<!rec_NoOps> x 5>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!cir.ptr<!rec_NoOps> x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.array<!cir.ptr<!rec_NoOps> x 5>, !cir.ptr<!cir.array<!cir.ptr<!rec_NoOps> x 5>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i +// +// CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[UB2_CAST]]) : !u64i +// CHECK-NEXT: %[[ELT_SIZE:.*]] = cir.const #cir.int<4> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[ELT_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !rec_NoOps, !cir.ptr<!rec_NoOps>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} // TODO: Add Init here. 
// CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -102,6 +169,13 @@ void do_things(unsigned A, unsigned B) { ; // CHECK-NEXT: acc.private.recipe @privatization__Bcnt2__ZTSPA5_5NoOps : !cir.ptr<!cir.ptr<!cir.array<!rec_NoOps x 5>>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.ptr<!cir.array<!rec_NoOps x 5>>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.ptr<!cir.array<!rec_NoOps x 5>>, !cir.ptr<!cir.ptr<!cir.array<!rec_NoOps x 5>>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<20> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.array<!rec_NoOps x 5>, !cir.ptr<!cir.array<!rec_NoOps x 5>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -121,6 +195,24 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(ArrayOfPtrPtr[B][B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt3__ZTSA5_PP5NoOps : !cir.ptr<!cir.array<!cir.ptr<!cir.ptr<!rec_NoOps>> x 5>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!cir.ptr<!cir.ptr<!rec_NoOps>> x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND3:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.array<!cir.ptr<!cir.ptr<!rec_NoOps>> x 5>, !cir.ptr<!cir.array<!cir.ptr<!cir.ptr<!rec_NoOps>> x 5>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB3:.*]] = acc.get_upperbound %[[BOUND3]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB3_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB3]] : index to !u64i +// +// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[UB3_CAST]]) : !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.ptr<!rec_NoOps>, !cir.ptr<!cir.ptr<!rec_NoOps>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS2:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[NUM_ELTS]]) : !u64i +// CHECK-NEXT: %[[ELT_SIZE:.*]] = cir.const #cir.int<4> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS2]], %[[ELT_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA2:.*]] = cir.alloca !rec_NoOps, !cir.ptr<!rec_NoOps>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} // TODO: Add Init here. 
// CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -134,6 +226,17 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(ArrayOfPtrPtr[B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt2__ZTSA5_PP5NoOps : !cir.ptr<!cir.array<!cir.ptr<!cir.ptr<!rec_NoOps>> x 5>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!cir.ptr<!cir.ptr<!rec_NoOps>> x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.array<!cir.ptr<!cir.ptr<!rec_NoOps>> x 5>, !cir.ptr<!cir.array<!cir.ptr<!cir.ptr<!rec_NoOps>> x 5>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i +// +// CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[UB2_CAST]]) : !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.ptr<!rec_NoOps>, !cir.ptr<!cir.ptr<!rec_NoOps>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -154,6 +257,20 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(PtrPtrToArray[B][B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt3__ZTSPPA5_5NoOps : !cir.ptr<!cir.ptr<!cir.ptr<!cir.array<!rec_NoOps x 5>>>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.ptr<!cir.ptr<!cir.array<!rec_NoOps x 5>>>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND3:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.ptr<!cir.ptr<!cir.array<!rec_NoOps x 5>>>, !cir.ptr<!cir.ptr<!cir.ptr<!cir.array<!rec_NoOps x 5>>>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB3:.*]] = acc.get_upperbound %[[BOUND3]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB3_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB3]] : index to !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB3_CAST]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.ptr<!cir.array<!rec_NoOps x 5>>, !cir.ptr<!cir.ptr<!cir.array<!rec_NoOps x 5>>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[UB3_CAST]]) : !u64i +// CHECK-NEXT: %[[ELT_SIZE:.*]] = cir.const #cir.int<20> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[ELT_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA2:.*]] = cir.alloca !cir.array<!rec_NoOps x 5>, !cir.ptr<!cir.array<!rec_NoOps x 5>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} // TODO: Add Init here. 
// CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -167,6 +284,20 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(PtrPtrToArray[B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt2__ZTSPPA5_5NoOps : !cir.ptr<!cir.ptr<!cir.ptr<!cir.array<!rec_NoOps x 5>>>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.ptr<!cir.ptr<!cir.array<!rec_NoOps x 5>>>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.ptr<!cir.ptr<!cir.array<!rec_NoOps x 5>>>, !cir.ptr<!cir.ptr<!cir.ptr<!cir.array<!rec_NoOps x 5>>>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.ptr<!cir.array<!rec_NoOps x 5>>, !cir.ptr<!cir.ptr<!cir.array<!rec_NoOps x 5>>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[UB2_CAST]]) : !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<20> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA2:.*]] = cir.alloca !cir.array<!rec_NoOps x 5>, !cir.ptr<!cir.array<!rec_NoOps x 5>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} // TODO: Add Init here. 
// CHECK-NEXT: acc.yield
// CHECK-NEXT: }
@@ -189,6 +320,24 @@ void do_things(unsigned A, unsigned B) {
#pragma acc parallel private(PtrArrayPtr[B][B][B])
// CHECK-NEXT: acc.private.recipe @privatization__Bcnt3__ZTSPA5_P5NoOps : !cir.ptr<!cir.ptr<!cir.array<!cir.ptr<!rec_NoOps> x 5>>> init {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.ptr<!cir.array<!cir.ptr<!rec_NoOps> x 5>>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND3:.*]]: !acc.data_bounds_ty {{.*}}):
+// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.ptr<!cir.array<!cir.ptr<!rec_NoOps> x 5>>, !cir.ptr<!cir.ptr<!cir.array<!cir.ptr<!rec_NoOps> x 5>>>, ["openacc.private.init"] {alignment = 8 : i64}
+//
+// CHECK-NEXT: %[[UB3:.*]] = acc.get_upperbound %[[BOUND3]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB3_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB3]] : index to !u64i
+// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<40> : !u64i
+// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB3_CAST]], %[[ARR_SIZE]]) : !u64i
+// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.array<!cir.ptr<!rec_NoOps> x 5>, !cir.ptr<!cir.array<!cir.ptr<!rec_NoOps> x 5>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64}
+//
+// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i
+// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[UB3_CAST]]) : !u64i
+//
+// CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i
+// CHECK-NEXT: %[[NUM_ELTS2:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[NUM_ELTS]]) : !u64i
+// CHECK-NEXT: %[[ELT_SIZE:.*]] = cir.const #cir.int<4> : !u64i
+// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS2]], %[[ELT_SIZE]]) : !u64i
+// CHECK-NEXT: %[[ARR_ALLOCA2:.*]] = cir.alloca !rec_NoOps, !cir.ptr<!rec_NoOps>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64}
// TODO: Add Init here.
// CHECK-NEXT: acc.yield
// CHECK-NEXT: }
@@ -202,6 +351,13 @@ void do_things(unsigned A, unsigned B) {
#pragma acc parallel private(PtrArrayPtr[B][B])
// CHECK-NEXT: acc.private.recipe @privatization__Bcnt2__ZTSPA5_P5NoOps : !cir.ptr<!cir.ptr<!cir.array<!cir.ptr<!rec_NoOps> x 5>>> init {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.ptr<!cir.array<!cir.ptr<!rec_NoOps> x 5>>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}):
+// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.ptr<!cir.array<!cir.ptr<!rec_NoOps> x 5>>, !cir.ptr<!cir.ptr<!cir.array<!cir.ptr<!rec_NoOps> x 5>>>, ["openacc.private.init"] {alignment = 8 : i64}
+//
+// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i
+// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<40> : !u64i
+// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[ARR_SIZE]]) : !u64i
+// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.array<!cir.ptr<!rec_NoOps> x 5>, !cir.ptr<!cir.array<!cir.ptr<!rec_NoOps> x 5>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64}
// TODO: Add Init here.
// CHECK-NEXT: acc.yield // CHECK-NEXT: } diff --git a/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-array-recipes-int.cpp b/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-array-recipes-int.cpp index 867aaa6..79692d3 100644 --- a/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-array-recipes-int.cpp +++ b/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-array-recipes-int.cpp @@ -6,6 +6,13 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(OnePtr[A:B]) // CHECK: acc.private.recipe @privatization__Bcnt1__ZTSPi : !cir.ptr<!cir.ptr<!s32i>> init { // CHECK-NEXT: ^bb0(%arg0: !cir.ptr<!cir.ptr<!s32i>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<4> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -24,6 +31,20 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(TwoPtr[B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt2__ZTSPPi : !cir.ptr<!cir.ptr<!cir.ptr<!s32i>>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.ptr<!cir.ptr<!s32i>>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!cir.ptr<!cir.ptr<!s32i>>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[UB2_CAST]]) : !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<4> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA2:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} // TODO: Add Init here. 
// CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -44,6 +65,27 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(ThreePtr[B][B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt3__ZTSPPPi : !cir.ptr<!cir.ptr<!cir.ptr<!cir.ptr<!s32i>>>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.ptr<!cir.ptr<!cir.ptr<!s32i>>>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND3:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.ptr<!cir.ptr<!cir.ptr<!s32i>>>, !cir.ptr<!cir.ptr<!cir.ptr<!cir.ptr<!s32i>>>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB3:.*]] = acc.get_upperbound %[[BOUND3]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB3_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB3]] : index to !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB3_CAST]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!cir.ptr<!cir.ptr<!s32i>>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[UB3_CAST]]) : !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA2:.*]] = cir.alloca !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS2:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[NUM_ELTS]]) : !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<4> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS2]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA3:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} // TODO: Add Init here. 
// CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -57,6 +99,20 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(ThreePtr[B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt2__ZTSPPPi : !cir.ptr<!cir.ptr<!cir.ptr<!cir.ptr<!s32i>>>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.ptr<!cir.ptr<!cir.ptr<!s32i>>>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.ptr<!cir.ptr<!cir.ptr<!s32i>>>, !cir.ptr<!cir.ptr<!cir.ptr<!cir.ptr<!s32i>>>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!cir.ptr<!cir.ptr<!s32i>>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[UB2_CAST]]) : !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA2:.*]] = cir.alloca !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -78,6 +134,16 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(ArrayOfPtr[B][A:B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt2__ZTSA5_Pi : !cir.ptr<!cir.array<!cir.ptr<!s32i> x 5>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!cir.ptr<!s32i> x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.array<!cir.ptr<!s32i> x 5>, !cir.ptr<!cir.array<!cir.ptr<!s32i> x 5>>, ["openacc.private.init"] {alignment = 8 : i64} +// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i +// +// CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[UB2_CAST]]) : !u64i +// CHECK-NEXT: %[[ELT_SIZE:.*]] = cir.const #cir.int<4> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[ELT_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} // TODO: Add Init here. 
// CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -99,6 +165,13 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(PtrToArrays[B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt2__ZTSPA5_i : !cir.ptr<!cir.ptr<!cir.array<!s32i x 5>>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.ptr<!cir.array<!s32i x 5>>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.ptr<!cir.array<!s32i x 5>>, !cir.ptr<!cir.ptr<!cir.array<!s32i x 5>>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<20> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -119,6 +192,24 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(ArrayOfPtrPtr[B][B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt3__ZTSA5_PPi : !cir.ptr<!cir.array<!cir.ptr<!cir.ptr<!s32i>> x 5>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!cir.ptr<!cir.ptr<!s32i>> x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND3:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.array<!cir.ptr<!cir.ptr<!s32i>> x 5>, !cir.ptr<!cir.array<!cir.ptr<!cir.ptr<!s32i>> x 5>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB3:.*]] = acc.get_upperbound %[[BOUND3]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB3_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB3]] : index to !u64i +// +// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[UB3_CAST]]) : !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS2:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[NUM_ELTS]]) : !u64i +// CHECK-NEXT: %[[ELT_SIZE:.*]] = cir.const #cir.int<4> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS2]], %[[ELT_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA2:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} // TODO: Add Init here. 
// CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -132,6 +223,17 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(ArrayOfPtrPtr[B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt2__ZTSA5_PPi : !cir.ptr<!cir.array<!cir.ptr<!cir.ptr<!s32i>> x 5>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!cir.ptr<!cir.ptr<!s32i>> x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.array<!cir.ptr<!cir.ptr<!s32i>> x 5>, !cir.ptr<!cir.array<!cir.ptr<!cir.ptr<!s32i>> x 5>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i +// +// CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[UB2_CAST]]) : !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -152,6 +254,20 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(PtrPtrToArray[B][B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt3__ZTSPPA5_i : !cir.ptr<!cir.ptr<!cir.ptr<!cir.array<!s32i x 5>>>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.ptr<!cir.ptr<!cir.array<!s32i x 5>>>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND3:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.ptr<!cir.ptr<!cir.array<!s32i x 5>>>, !cir.ptr<!cir.ptr<!cir.ptr<!cir.array<!s32i x 5>>>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB3:.*]] = acc.get_upperbound %[[BOUND3]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB3_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB3]] : index to !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB3_CAST]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.ptr<!cir.array<!s32i x 5>>, !cir.ptr<!cir.ptr<!cir.array<!s32i x 5>>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[UB3_CAST]]) : !u64i +// CHECK-NEXT: %[[ELT_SIZE:.*]] = cir.const #cir.int<20> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[ELT_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA2:.*]] = cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} // TODO: Add Init here. 
// CHECK-NEXT: acc.yield // CHECK-NEXT:} @@ -165,6 +281,20 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(PtrPtrToArray[B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt2__ZTSPPA5_i : !cir.ptr<!cir.ptr<!cir.ptr<!cir.array<!s32i x 5>>>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.ptr<!cir.ptr<!cir.array<!s32i x 5>>>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.ptr<!cir.ptr<!cir.array<!s32i x 5>>>, !cir.ptr<!cir.ptr<!cir.ptr<!cir.array<!s32i x 5>>>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.ptr<!cir.array<!s32i x 5>>, !cir.ptr<!cir.ptr<!cir.array<!s32i x 5>>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[UB2_CAST]]) : !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<20> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA2:.*]] = cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} // TODO: Add Init here. 
// CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -187,6 +317,24 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(PtrArrayPtr[B][B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt3__ZTSPA5_Pi : !cir.ptr<!cir.ptr<!cir.array<!cir.ptr<!s32i> x 5>>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.ptr<!cir.array<!cir.ptr<!s32i> x 5>>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND3:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.ptr<!cir.array<!cir.ptr<!s32i> x 5>>, !cir.ptr<!cir.ptr<!cir.array<!cir.ptr<!s32i> x 5>>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB3:.*]] = acc.get_upperbound %[[BOUND3]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB3_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB3]] : index to !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<40> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB3_CAST]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.array<!cir.ptr<!s32i> x 5>, !cir.ptr<!cir.array<!cir.ptr<!s32i> x 5>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[UB3_CAST]]) : !u64i +// +// CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS2:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[NUM_ELTS]]) : !u64i +// CHECK-NEXT: %[[ELT_SIZE:.*]] = cir.const #cir.int<4> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS2]], %[[ELT_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA2:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -201,6 +349,13 @@ void do_things(unsigned A, unsigned B) { // #pragma acc parallel private(PtrArrayPtr[B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt2__ZTSPA5_Pi : !cir.ptr<!cir.ptr<!cir.array<!cir.ptr<!s32i> x 5>>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.ptr<!cir.array<!cir.ptr<!s32i> x 5>>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.ptr<!cir.array<!cir.ptr<!s32i> x 5>>, !cir.ptr<!cir.ptr<!cir.array<!cir.ptr<!s32i> x 5>>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<40> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.array<!cir.ptr<!s32i> x 5>, !cir.ptr<!cir.array<!cir.ptr<!s32i> x 5>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} // TODO: Add Init here.
// CHECK-NEXT: acc.yield // CHECK-NEXT: } diff --git a/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-recipes-CtorDtor.cpp b/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-recipes-CtorDtor.cpp index cd8e476..77ff357 100644 --- a/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-recipes-CtorDtor.cpp +++ b/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-recipes-CtorDtor.cpp @@ -22,6 +22,13 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: acc.private.recipe @privatization__Bcnt1__ZTSPPP8CtorDtor : !cir.ptr<!cir.ptr<!cir.ptr<!cir.ptr<!rec_CtorDtor>>>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.ptr<!cir.ptr<!cir.ptr<!rec_CtorDtor>>>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}): // 'init' section: +// CHECK-NEXT: %[[TOP_LEVEL_ALLOCA:.*]] = cir.alloca !cir.ptr<!cir.ptr<!cir.ptr<!rec_CtorDtor>>>, !cir.ptr<!cir.ptr<!cir.ptr<!cir.ptr<!rec_CtorDtor>>>>, ["openacc.private.init"] +// +// CHECK-NEXT: %[[INT_PTR_UPPER_BOUND:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UPPER_BOUND_CAST:.*]] = builtin.unrealized_conversion_cast %[[INT_PTR_UPPER_BOUND]] : index to !u64i +// CHECK-NEXT: %[[SIZEOF_INT_PTR:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST]], %[[SIZEOF_INT_PTR]]) : !u64i +// CHECK-NEXT: %[[INT_PTR_VLA_ALLOCA:.*]] = cir.alloca !cir.ptr<!cir.ptr<!rec_CtorDtor>>, !cir.ptr<!cir.ptr<!cir.ptr<!rec_CtorDtor>>>, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -29,6 +36,20 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(ThreePtr[B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt2__ZTSPPP8CtorDtor : !cir.ptr<!cir.ptr<!cir.ptr<!cir.ptr<!rec_CtorDtor>>>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.ptr<!cir.ptr<!cir.ptr<!rec_CtorDtor>>>> {{.*}}, %[[BOUNDS1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUNDS2:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TOP_LEVEL_ALLOCA:.*]] = cir.alloca !cir.ptr<!cir.ptr<!cir.ptr<!rec_CtorDtor>>>, !cir.ptr<!cir.ptr<!cir.ptr<!cir.ptr<!rec_CtorDtor>>>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[INT_PTR_PTR_UPPER_BOUND:.*]] = acc.get_upperbound %[[BOUNDS2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UPPER_BOUND_CAST:.*]] = builtin.unrealized_conversion_cast %[[INT_PTR_PTR_UPPER_BOUND]] : index to !u64i +// CHECK-NEXT: %[[SIZEOF_PTR:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST]], %[[SIZEOF_PTR]]) : !u64i +// CHECK-NEXT: %[[INT_PTR_VLA_ALLOCA:.*]] = cir.alloca !cir.ptr<!cir.ptr<!rec_CtorDtor>>, !cir.ptr<!cir.ptr<!cir.ptr<!rec_CtorDtor>>>, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[INT_PTR_UPPER_BOUND:.*]] = acc.get_upperbound %[[BOUNDS1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UPPER_BOUND_CAST_2:.*]] = builtin.unrealized_conversion_cast %[[INT_PTR_UPPER_BOUND]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST_2]], %[[UPPER_BOUND_CAST]]) : !u64i +// CHECK-NEXT: %[[SIZEOF_PTR:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[SIZEOF_PTR]]) : !u64i +// CHECK-NEXT: %[[INT_VLA_ALLOCA:.*]] = cir.alloca !cir.ptr<!rec_CtorDtor>, !cir.ptr<!cir.ptr<!rec_CtorDtor>>, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] 
{alignment = 8 : i64} // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -42,6 +63,27 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(ThreePtr[B][B][A:B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt3__ZTSPPP8CtorDtor : !cir.ptr<!cir.ptr<!cir.ptr<!cir.ptr<!rec_CtorDtor>>>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.ptr<!cir.ptr<!cir.ptr<!rec_CtorDtor>>>> {{.*}}, %[[BOUNDS1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUNDS2:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUNDS3:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TOP_LEVEL_ALLOCA:.*]] = cir.alloca !cir.ptr<!cir.ptr<!cir.ptr<!rec_CtorDtor>>>, !cir.ptr<!cir.ptr<!cir.ptr<!cir.ptr<!rec_CtorDtor>>>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[INT_PTR_PTR_PTR_UPPER_BOUND:.*]] = acc.get_upperbound %[[BOUNDS3]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UPPER_BOUND_CAST:.*]] = builtin.unrealized_conversion_cast %[[INT_PTR_PTR_PTR_UPPER_BOUND]] : index to !u64i +// CHECK-NEXT: %[[SIZEOF_PTR:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST]], %[[SIZEOF_PTR]]) : !u64i +// CHECK-NEXT: %[[INT_PTR_PTR_VLA_ALLOCA:.*]] = cir.alloca !cir.ptr<!cir.ptr<!rec_CtorDtor>>, !cir.ptr<!cir.ptr<!cir.ptr<!rec_CtorDtor>>>, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[INT_PTR_PTR_UPPER_BOUND:.*]] = acc.get_upperbound %[[BOUNDS2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UPPER_BOUND_CAST_2:.*]] = builtin.unrealized_conversion_cast %[[INT_PTR_PTR_UPPER_BOUND]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST_2]], %[[UPPER_BOUND_CAST]]) : !u64i +// CHECK-NEXT: %[[SIZEOF_PTR_PTR:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[SIZEOF_PTR_PTR]]) : !u64i +// CHECK-NEXT: %[[INT_PTR_PTR_ALLOCA:.*]] = cir.alloca !cir.ptr<!rec_CtorDtor>, !cir.ptr<!cir.ptr<!rec_CtorDtor>>, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[INT_PTR_UPPER_BOUND:.*]] = acc.get_upperbound %[[BOUNDS1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UPPER_BOUND_CAST_3:.*]] = builtin.unrealized_conversion_cast %[[INT_PTR_UPPER_BOUND]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS_2:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST_3]], %[[NUM_ELTS]]) : !u64i +// CHECK-NEXT: %[[SIZEOF_INT:.*]] = cir.const #cir.int<4> : !u64i +// CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS_2]], %[[SIZEOF_INT]]) : !u64i +// CHECK-NEXT: %[[INT_PTR_ALLOCA:.*]] = cir.alloca !rec_CtorDtor, !cir.ptr<!rec_CtorDtor>, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} // TODO: Add Init here.
// CHECK-NEXT: acc.yield // CHECK-NEXT: } destroy { @@ -144,6 +186,13 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: acc.private.recipe @privatization__Bcnt1__ZTSPP8CtorDtor : !cir.ptr<!cir.ptr<!cir.ptr<!rec_CtorDtor>>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.ptr<!cir.ptr<!rec_CtorDtor>>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}): // 'init' section: +// CHECK-NEXT: %[[TOP_LEVEL_ALLOCA:.*]] = cir.alloca !cir.ptr<!cir.ptr<!rec_CtorDtor>>, !cir.ptr<!cir.ptr<!cir.ptr<!rec_CtorDtor>>>, ["openacc.private.init"] +// +// CHECK-NEXT: %[[INT_PTR_UPPER_BOUND:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UPPER_BOUND_CAST:.*]] = builtin.unrealized_conversion_cast %[[INT_PTR_UPPER_BOUND]] : index to !u64i +// CHECK-NEXT: %[[SIZEOF_INT_PTR:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST]], %[[SIZEOF_INT_PTR]]) : !u64i +// CHECK-NEXT: %[[INT_PTR_VLA_ALLOCA:.*]] = cir.alloca !cir.ptr<!rec_CtorDtor>, !cir.ptr<!cir.ptr<!rec_CtorDtor>>, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -151,6 +200,20 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(TwoPtr[B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt2__ZTSPP8CtorDtor : !cir.ptr<!cir.ptr<!cir.ptr<!rec_CtorDtor>>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.ptr<!cir.ptr<!rec_CtorDtor>>> {{.*}}, %[[BOUNDS1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUNDS2:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TOP_LEVEL_ALLOCA:.*]] = cir.alloca !cir.ptr<!cir.ptr<!rec_CtorDtor>>, !cir.ptr<!cir.ptr<!cir.ptr<!rec_CtorDtor>>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[INT_PTR_PTR_UPPER_BOUND:.*]] = acc.get_upperbound %[[BOUNDS2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UPPER_BOUND_CAST:.*]] = builtin.unrealized_conversion_cast %[[INT_PTR_PTR_UPPER_BOUND]] : index to !u64i +// CHECK-NEXT: %[[SIZEOF_PTR:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST]], %[[SIZEOF_PTR]]) : !u64i +// CHECK-NEXT: %[[INT_PTR_VLA_ALLOCA:.*]] = cir.alloca !cir.ptr<!rec_CtorDtor>, !cir.ptr<!cir.ptr<!rec_CtorDtor>>, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[INT_PTR_UPPER_BOUND:.*]] = acc.get_upperbound %[[BOUNDS1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UPPER_BOUND_CAST_2:.*]] = builtin.unrealized_conversion_cast %[[INT_PTR_UPPER_BOUND]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST_2]], %[[UPPER_BOUND_CAST]]) : !u64i +// CHECK-NEXT: %[[SIZEOF_INT:.*]] = cir.const #cir.int<4> : !u64i +// CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[SIZEOF_INT]]) : !u64i +// CHECK-NEXT: %[[INT_VLA_ALLOCA:.*]] = cir.alloca !rec_CtorDtor, !cir.ptr<!rec_CtorDtor>, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} // TODO: Add Init here. 
// CHECK-NEXT: acc.yield // CHECK-NEXT: } destroy { @@ -229,6 +292,13 @@ void do_things(unsigned A, unsigned B) { // CHECK: acc.private.recipe @privatization__Bcnt1__ZTSP8CtorDtor : !cir.ptr<!cir.ptr<!rec_CtorDtor>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.ptr<!rec_CtorDtor>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}): // 'init' section: +// CHECK-NEXT: %[[TOP_LEVEL_ALLOCA:.*]] = cir.alloca !cir.ptr<!rec_CtorDtor>, !cir.ptr<!cir.ptr<!rec_CtorDtor>>, ["openacc.private.init"] +// +// CHECK-NEXT: %[[INT_PTR_UPPER_BOUND:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UPPER_BOUND_CAST:.*]] = builtin.unrealized_conversion_cast %[[INT_PTR_UPPER_BOUND]] : index to !u64i +// CHECK-NEXT: %[[SIZEOF_CTORDTOR:.*]] = cir.const #cir.int<4> : !u64i +// CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST]], %[[SIZEOF_CTORDTOR]]) : !u64i +// CHECK-NEXT: %[[INT_VLA_ALLOCA:.*]] = cir.alloca !rec_CtorDtor, !cir.ptr<!rec_CtorDtor>, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } destroy { diff --git a/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-recipes-NoOps.cpp b/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-recipes-NoOps.cpp index 4d91d86..4822dd7 100644 --- a/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-recipes-NoOps.cpp +++ b/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-recipes-NoOps.cpp @@ -16,6 +16,13 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(ThreePtr[A]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt1__ZTSPPP5NoOps : !cir.ptr<!cir.ptr<!cir.ptr<!cir.ptr<!rec_NoOps>>>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.ptr<!cir.ptr<!cir.ptr<!rec_NoOps>>>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TOP_LEVEL_ALLOCA:.*]] = cir.alloca !cir.ptr<!cir.ptr<!cir.ptr<!rec_NoOps>>>, !cir.ptr<!cir.ptr<!cir.ptr<!cir.ptr<!rec_NoOps>>>>, ["openacc.private.init"] +// +// CHECK-NEXT: %[[INT_PTR_UPPER_BOUND:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UPPER_BOUND_CAST:.*]] = builtin.unrealized_conversion_cast %[[INT_PTR_UPPER_BOUND]] : index to !u64i +// CHECK-NEXT: %[[SIZEOF_INT_PTR:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST]], %[[SIZEOF_INT_PTR]]) : !u64i +// CHECK-NEXT: %[[INT_PTR_VLA_ALLOCA:.*]] = cir.alloca !cir.ptr<!cir.ptr<!rec_NoOps>>, !cir.ptr<!cir.ptr<!cir.ptr<!rec_NoOps>>>, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} // TODO: Add Init here. 
// CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -23,6 +30,20 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(ThreePtr[B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt2__ZTSPPP5NoOps : !cir.ptr<!cir.ptr<!cir.ptr<!cir.ptr<!rec_NoOps>>>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.ptr<!cir.ptr<!cir.ptr<!rec_NoOps>>>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TOP_LEVEL_ALLOCA:.*]] = cir.alloca !cir.ptr<!cir.ptr<!cir.ptr<!rec_NoOps>>>, !cir.ptr<!cir.ptr<!cir.ptr<!cir.ptr<!rec_NoOps>>>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[INT_PTR_PTR_UPPER_BOUND:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UPPER_BOUND_CAST:.*]] = builtin.unrealized_conversion_cast %[[INT_PTR_PTR_UPPER_BOUND]] : index to !u64i +// CHECK-NEXT: %[[SIZEOF_PTR:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST]], %[[SIZEOF_PTR]]) : !u64i +// CHECK-NEXT: %[[INT_PTR_VLA_ALLOCA:.*]] = cir.alloca !cir.ptr<!cir.ptr<!rec_NoOps>>, !cir.ptr<!cir.ptr<!cir.ptr<!rec_NoOps>>>, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[INT_PTR_UPPER_BOUND:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UPPER_BOUND_CAST_2:.*]] = builtin.unrealized_conversion_cast %[[INT_PTR_UPPER_BOUND]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST_2]], %[[UPPER_BOUND_CAST]]) : !u64i +// CHECK-NEXT: %[[SIZEOF_PTR:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[SIZEOF_PTR]]) : !u64i +// CHECK-NEXT: %[[INT_VLA_ALLOCA:.*]] = cir.alloca !cir.ptr<!rec_NoOps>, !cir.ptr<!cir.ptr<!rec_NoOps>>, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} // TODO: Add Init here. 
// CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -34,6 +55,27 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(ThreePtr[B][B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt3__ZTSPPP5NoOps : !cir.ptr<!cir.ptr<!cir.ptr<!cir.ptr<!rec_NoOps>>>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.ptr<!cir.ptr<!cir.ptr<!rec_NoOps>>>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND3:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TOP_LEVEL_ALLOCA:.*]] = cir.alloca !cir.ptr<!cir.ptr<!cir.ptr<!rec_NoOps>>>, !cir.ptr<!cir.ptr<!cir.ptr<!cir.ptr<!rec_NoOps>>>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[INT_PTR_PTR_PTR_UPPER_BOUND:.*]] = acc.get_upperbound %[[BOUND3]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UPPER_BOUND_CAST:.*]] = builtin.unrealized_conversion_cast %[[INT_PTR_PTR_PTR_UPPER_BOUND]] : index to !u64i +// CHECK-NEXT: %[[SIZEOF_PTR:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST]], %[[SIZEOF_PTR]]) : !u64i +// CHECK-NEXT: %[[INT_PTR_PTR_VLA_ALLOCA:.*]] = cir.alloca !cir.ptr<!cir.ptr<!rec_NoOps>>, !cir.ptr<!cir.ptr<!cir.ptr<!rec_NoOps>>>, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[INT_PTR_PTR_UPPER_BOUND:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UPPER_BOUND_CAST_2:.*]] = builtin.unrealized_conversion_cast %[[INT_PTR_PTR_UPPER_BOUND]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST_2]], %[[UPPER_BOUND_CAST]]) : !u64i +// CHECK-NEXT: %[[SIZEOF_PTR_PTR:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[SIZEOF_PTR_PTR]]) : !u64i +// CHECK-NEXT: %[[INT_PTR_PTR_ALLOCA:.*]] = cir.alloca !cir.ptr<!rec_NoOps>, !cir.ptr<!cir.ptr<!rec_NoOps>>, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[INT_PTR_UPPER_BOUND:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UPPER_BOUND_CAST_3:.*]] = builtin.unrealized_conversion_cast %[[INT_PTR_UPPER_BOUND]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS_2:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST_3]], %[[NUM_ELTS]]) : !u64i +// CHECK-NEXT: %[[SIZEOF_INT:.*]] = cir.const #cir.int<4> : !u64i +// CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS_2]], %[[SIZEOF_INT]]) : !u64i +// CHECK-NEXT: %[[INT_PTR_ALLOCA:.*]] = cir.alloca !rec_NoOps, !cir.ptr<!rec_NoOps>, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} // TODO: Add Init here. 
// CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -57,6 +99,13 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: acc.private.recipe @privatization__Bcnt1__ZTSPP5NoOps : !cir.ptr<!cir.ptr<!cir.ptr<!rec_NoOps>>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.ptr<!cir.ptr<!rec_NoOps>>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}): // 'init' section: +// CHECK-NEXT: %[[TOP_LEVEL_ALLOCA:.*]] = cir.alloca !cir.ptr<!cir.ptr<!rec_NoOps>>, !cir.ptr<!cir.ptr<!cir.ptr<!rec_NoOps>>>, ["openacc.private.init"] +// +// CHECK-NEXT: %[[INT_PTR_UPPER_BOUND:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UPPER_BOUND_CAST:.*]] = builtin.unrealized_conversion_cast %[[INT_PTR_UPPER_BOUND]] : index to !u64i +// CHECK-NEXT: %[[SIZEOF_INT_PTR:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST]], %[[SIZEOF_INT_PTR]]) : !u64i +// CHECK-NEXT: %[[INT_PTR_VLA_ALLOCA:.*]] = cir.alloca !cir.ptr<!rec_NoOps>, !cir.ptr<!cir.ptr<!rec_NoOps>>, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -64,6 +113,20 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(TwoPtr[B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt2__ZTSPP5NoOps : !cir.ptr<!cir.ptr<!cir.ptr<!rec_NoOps>>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.ptr<!cir.ptr<!rec_NoOps>>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TOP_LEVEL_ALLOCA:.*]] = cir.alloca !cir.ptr<!cir.ptr<!rec_NoOps>>, !cir.ptr<!cir.ptr<!cir.ptr<!rec_NoOps>>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[INT_PTR_PTR_UPPER_BOUND:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UPPER_BOUND_CAST:.*]] = builtin.unrealized_conversion_cast %[[INT_PTR_PTR_UPPER_BOUND]] : index to !u64i +// CHECK-NEXT: %[[SIZEOF_PTR:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST]], %[[SIZEOF_PTR]]) : !u64i +// CHECK-NEXT: %[[INT_PTR_VLA_ALLOCA:.*]] = cir.alloca !cir.ptr<!rec_NoOps>, !cir.ptr<!cir.ptr<!rec_NoOps>>, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[INT_PTR_UPPER_BOUND:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UPPER_BOUND_CAST_2:.*]] = builtin.unrealized_conversion_cast %[[INT_PTR_UPPER_BOUND]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST_2]], %[[UPPER_BOUND_CAST]]) : !u64i +// CHECK-NEXT: %[[SIZEOF_INT:.*]] = cir.const #cir.int<4> : !u64i +// CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[SIZEOF_INT]]) : !u64i +// CHECK-NEXT: %[[INT_VLA_ALLOCA:.*]] = cir.alloca !rec_NoOps, !cir.ptr<!rec_NoOps>, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} // TODO: Add Init here. 
// CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -85,6 +148,13 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: acc.private.recipe @privatization__Bcnt1__ZTSP5NoOps : !cir.ptr<!cir.ptr<!rec_NoOps>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.ptr<!rec_NoOps>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}): // 'init' section: +// CHECK-NEXT: %[[TOP_LEVEL_ALLOCA:.*]] = cir.alloca !cir.ptr<!rec_NoOps>, !cir.ptr<!cir.ptr<!rec_NoOps>>, ["openacc.private.init"] +// +// CHECK-NEXT: %[[INT_PTR_UPPER_BOUND:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UPPER_BOUND_CAST:.*]] = builtin.unrealized_conversion_cast %[[INT_PTR_UPPER_BOUND]] : index to !u64i +// CHECK-NEXT: %[[SIZEOF_NOOPS:.*]] = cir.const #cir.int<4> : !u64i +// CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST]], %[[SIZEOF_NOOPS]]) : !u64i +// CHECK-NEXT: %[[INT_VLA_ALLOCA:.*]] = cir.alloca !rec_NoOps, !cir.ptr<!rec_NoOps>, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } diff --git a/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-recipes-int.cpp b/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-recipes-int.cpp index 5c9c17b..ddf25de 100644 --- a/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-recipes-int.cpp +++ b/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-recipes-int.cpp @@ -14,6 +14,13 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(ThreePtr[A]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt1__ZTSPPPi : !cir.ptr<!cir.ptr<!cir.ptr<!cir.ptr<!s32i>>>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.ptr<!cir.ptr<!cir.ptr<!s32i>>>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TOP_LEVEL_ALLOCA:.*]] = cir.alloca !cir.ptr<!cir.ptr<!cir.ptr<!s32i>>>, !cir.ptr<!cir.ptr<!cir.ptr<!cir.ptr<!s32i>>>>, ["openacc.private.init"] +// +// CHECK-NEXT: %[[INT_PTR_UPPER_BOUND:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UPPER_BOUND_CAST:.*]] = builtin.unrealized_conversion_cast %[[INT_PTR_UPPER_BOUND]] : index to !u64i +// CHECK-NEXT: %[[SIZEOF_INT_PTR:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST]], %[[SIZEOF_INT_PTR]]) : !u64i +// CHECK-NEXT: %[[INT_PTR_VLA_ALLOCA:.*]] = cir.alloca !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!cir.ptr<!cir.ptr<!s32i>>>, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} // TODO: Add Init here. 
// CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -21,6 +28,20 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(ThreePtr[B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt2__ZTSPPPi : !cir.ptr<!cir.ptr<!cir.ptr<!cir.ptr<!s32i>>>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.ptr<!cir.ptr<!cir.ptr<!s32i>>>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TOP_LEVEL_ALLOCA:.*]] = cir.alloca !cir.ptr<!cir.ptr<!cir.ptr<!s32i>>>, !cir.ptr<!cir.ptr<!cir.ptr<!cir.ptr<!s32i>>>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[INT_PTR_PTR_UPPER_BOUND:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UPPER_BOUND_CAST:.*]] = builtin.unrealized_conversion_cast %[[INT_PTR_PTR_UPPER_BOUND]] : index to !u64i +// CHECK-NEXT: %[[SIZEOF_PTR:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST]], %[[SIZEOF_PTR]]) : !u64i +// CHECK-NEXT: %[[INT_PTR_VLA_ALLOCA:.*]] = cir.alloca !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!cir.ptr<!cir.ptr<!s32i>>>, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} + +// CHECK-NEXT: %[[INT_PTR_UPPER_BOUND:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UPPER_BOUND_CAST_2:.*]] = builtin.unrealized_conversion_cast %[[INT_PTR_UPPER_BOUND]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST_2]], %[[UPPER_BOUND_CAST]]) : !u64i +// CHECK-NEXT: %[[SIZEOF_PTR:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[SIZEOF_PTR]]) : !u64i +// CHECK-NEXT: %[[INT_VLA_ALLOCA:.*]] = cir.alloca !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} // TODO: Add Init here. 
// CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -32,6 +53,27 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(ThreePtr[B][B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt3__ZTSPPPi : !cir.ptr<!cir.ptr<!cir.ptr<!cir.ptr<!s32i>>>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.ptr<!cir.ptr<!cir.ptr<!s32i>>>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND3:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TOP_LEVEL_ALLOCA:.*]] = cir.alloca !cir.ptr<!cir.ptr<!cir.ptr<!s32i>>>, !cir.ptr<!cir.ptr<!cir.ptr<!cir.ptr<!s32i>>>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[INT_PTR_PTR_PTR_UPPER_BOUND:.*]] = acc.get_upperbound %[[BOUND3]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UPPER_BOUND_CAST:.*]] = builtin.unrealized_conversion_cast %[[INT_PTR_PTR_PTR_UPPER_BOUND]] : index to !u64i +// CHECK-NEXT: %[[SIZEOF_PTR:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST]], %[[SIZEOF_PTR]]) : !u64i +// CHECK-NEXT: %[[INT_PTR_PTR_VLA_ALLOCA:.*]] = cir.alloca !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!cir.ptr<!cir.ptr<!s32i>>>, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[INT_PTR_PTR_UPPER_BOUND:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UPPER_BOUND_CAST_2:.*]] = builtin.unrealized_conversion_cast %[[INT_PTR_PTR_UPPER_BOUND]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST_2]], %[[UPPER_BOUND_CAST]]) : !u64i +// CHECK-NEXT: %[[SIZEOF_PTR_PTR:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[SIZEOF_PTR_PTR]]) : !u64i +// CHECK-NEXT: %[[INT_PTR_PTR_ALLOCA:.*]] = cir.alloca !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[INT_PTR_UPPER_BOUND:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UPPER_BOUND_CAST_3:.*]] = builtin.unrealized_conversion_cast %[[INT_PTR_UPPER_BOUND]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS_2:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST_3]], %[[NUM_ELTS]]) : !u64i +// CHECK-NEXT: %[[SIZEOF_INT:.*]] = cir.const #cir.int<4> : !u64i +// CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS_2]], %[[SIZEOF_INT]]) : !u64i +// CHECK-NEXT: %[[INT_PTR_ALLOCA:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} // TODO: Add Init here. 
// CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -55,6 +97,13 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: acc.private.recipe @privatization__Bcnt1__ZTSPPi : !cir.ptr<!cir.ptr<!cir.ptr<!s32i>>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.ptr<!cir.ptr<!s32i>>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}): // 'init' section: +// CHECK-NEXT: %[[TOP_LEVEL_ALLOCA:.*]] = cir.alloca !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!cir.ptr<!cir.ptr<!s32i>>>, ["openacc.private.init"] +// +// CHECK-NEXT: %[[INT_PTR_UPPER_BOUND:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UPPER_BOUND_CAST:.*]] = builtin.unrealized_conversion_cast %[[INT_PTR_UPPER_BOUND]] : index to !u64i +// CHECK-NEXT: %[[SIZEOF_INT_PTR:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST]], %[[SIZEOF_INT_PTR]]) : !u64i +// CHECK-NEXT: %[[INT_PTR_VLA_ALLOCA:.*]] = cir.alloca !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -62,6 +111,21 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(TwoPtr[B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt2__ZTSPPi : !cir.ptr<!cir.ptr<!cir.ptr<!s32i>>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.ptr<!cir.ptr<!s32i>>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TOP_LEVEL_ALLOCA:.*]] = cir.alloca !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!cir.ptr<!cir.ptr<!s32i>>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[INT_PTR_PTR_UPPER_BOUND:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UPPER_BOUND_CAST:.*]] = builtin.unrealized_conversion_cast %[[INT_PTR_PTR_UPPER_BOUND]] : index to !u64i +// CHECK-NEXT: %[[SIZEOF_PTR:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST]], %[[SIZEOF_PTR]]) : !u64i +// CHECK-NEXT: %[[INT_PTR_VLA_ALLOCA:.*]] = cir.alloca !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[INT_PTR_UPPER_BOUND:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UPPER_BOUND_CAST_2:.*]] = builtin.unrealized_conversion_cast %[[INT_PTR_UPPER_BOUND]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST_2]], %[[UPPER_BOUND_CAST]]) : !u64i +// CHECK-NEXT: %[[SIZEOF_INT:.*]] = cir.const #cir.int<4> : !u64i +// CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[SIZEOF_INT]]) : !u64i +// CHECK-NEXT: %[[INT_VLA_ALLOCA:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} +// // TODO: Add Init here. 
// CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -83,6 +147,13 @@ void do_things(unsigned A, unsigned B) { // CHECK: acc.private.recipe @privatization__Bcnt1__ZTSPi : !cir.ptr<!cir.ptr<!s32i>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.ptr<!s32i>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}): // 'init' section: +// CHECK-NEXT: %[[TOP_LEVEL_ALLOCA:.*]] = cir.alloca !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>, ["openacc.private.init"] +// +// CHECK-NEXT: %[[INT_PTR_UPPER_BOUND:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UPPER_BOUND_CAST:.*]] = builtin.unrealized_conversion_cast %[[INT_PTR_UPPER_BOUND]] : index to !u64i +// CHECK-NEXT: %[[SIZEOF_INT:.*]] = cir.const #cir.int<4> : !u64i +// CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST]], %[[SIZEOF_INT]]) : !u64i +// CHECK-NEXT: %[[INT_VLA_ALLOCA:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } diff --git a/clang/test/CodeGen/PowerPC/builtins-ppc-dmf.c b/clang/test/CodeGen/PowerPC/builtins-ppc-dmf.c index f626567..d8306a7 100644 --- a/clang/test/CodeGen/PowerPC/builtins-ppc-dmf.c +++ b/clang/test/CodeGen/PowerPC/builtins-ppc-dmf.c @@ -208,6 +208,75 @@ void test_dmf_basic2(char *p1, char *res1, char *res2, __builtin_mma_build_dmr((__dmr1024*)res2, vv, vv, vv, vv, vv, vv, vv, vv); __builtin_mma_disassemble_dmr(res1, (__dmr1024*)p1); } + +// CHECK-LABEL: define dso_local void @test_dmsha2hash( +// CHECK-SAME: ptr noundef readonly captures(none) [[VDMRP1:%.*]], ptr noundef readonly captures(none) [[VDMRP2:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP1]], align 128, !tbaa [[__DMR1024_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = load <1024 x i1>, ptr [[VDMRP2]], align 128, !tbaa [[__DMR1024_TBAA6]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmsha2hash(<1024 x i1> [[TMP0]], <1024 x i1> [[TMP1]], i32 1) +// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA6]] +// CHECK-NEXT: ret void +// +// AIX-LABEL: define void @test_dmsha2hash( +// AIX-SAME: ptr noundef readonly captures(none) [[VDMRP1:%.*]], ptr noundef readonly captures(none) [[VDMRP2:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { +// AIX-NEXT: [[ENTRY:.*:]] +// AIX-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP1]], align 128, !tbaa [[__DMR1024_TBAA6]] +// AIX-NEXT: [[TMP1:%.*]] = load <1024 x i1>, ptr [[VDMRP2]], align 128, !tbaa [[__DMR1024_TBAA6]] +// AIX-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmsha2hash(<1024 x i1> [[TMP0]], <1024 x i1> [[TMP1]], i32 1) +// AIX-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA6]] +// AIX-NEXT: ret void +// +void test_dmsha2hash(unsigned char *vdmrp1, unsigned char *vdmrp2, unsigned char *resp) { + __dmr1024 vdmr1 = *((__dmr1024 *)vdmrp1); + __dmr1024 vdmr2 = *((__dmr1024 *)vdmrp2); + __builtin_mma_dmsha2hash(&vdmr1, &vdmr2, 1); + *((__dmr1024 *)resp) = vdmr1; +} + +// CHECK-LABEL: define dso_local void @test_dmsha3hash( +// CHECK-SAME: ptr noundef readonly captures(none) [[VDMRPP:%.*]], ptr noundef writeonly captures(none) initializes((0, 256)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: 
[[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <2048 x i1>, ptr [[VDMRPP]], align 256, !tbaa [[__DMR2048_TBAA9:![0-9]+]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2048 x i1> @llvm.ppc.mma.dmsha3hash(<2048 x i1> [[TMP0]], i32 4) +// CHECK-NEXT: store <2048 x i1> [[TMP1]], ptr [[RESP]], align 256, !tbaa [[__DMR2048_TBAA9]] +// CHECK-NEXT: ret void +// +// AIX-LABEL: define void @test_dmsha3hash( +// AIX-SAME: ptr noundef readonly captures(none) [[VDMRPP:%.*]], ptr noundef writeonly captures(none) initializes((0, 256)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { +// AIX-NEXT: [[ENTRY:.*:]] +// AIX-NEXT: [[TMP0:%.*]] = load <2048 x i1>, ptr [[VDMRPP]], align 256, !tbaa [[__DMR2048_TBAA9:![0-9]+]] +// AIX-NEXT: [[TMP1:%.*]] = tail call <2048 x i1> @llvm.ppc.mma.dmsha3hash(<2048 x i1> [[TMP0]], i32 4) +// AIX-NEXT: store <2048 x i1> [[TMP1]], ptr [[RESP]], align 256, !tbaa [[__DMR2048_TBAA9]] +// AIX-NEXT: ret void +// +void test_dmsha3hash(unsigned char *vdmrpp, unsigned char *resp) { + __dmr2048 vdmrp = *((__dmr2048 *)vdmrpp); + __builtin_mma_dmsha3hash(&vdmrp, 4); + *((__dmr2048 *)resp) = vdmrp; +} + +// CHECK-LABEL: define dso_local void @test_dmxxshapad( +// CHECK-SAME: ptr noundef readonly captures(none) [[VDMRP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxxshapad(<1024 x i1> [[TMP0]], <16 x i8> [[VC]], i32 2, i32 1, i32 5) +// CHECK-NEXT: store <1024 x i1> [[TMP1]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA6]] +// CHECK-NEXT: ret void +// +// AIX-LABEL: define void @test_dmxxshapad( +// AIX-SAME: ptr noundef readonly captures(none) [[VDMRP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { +// AIX-NEXT: [[ENTRY:.*:]] +// AIX-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA6]] +// AIX-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxxshapad(<1024 x i1> [[TMP0]], <16 x i8> [[VC]], i32 2, i32 1, i32 5) +// AIX-NEXT: store <1024 x i1> [[TMP1]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA6]] +// AIX-NEXT: ret void +// +void test_dmxxshapad(unsigned char *vdmrp, vector unsigned char vc, unsigned char *resp) { + __dmr1024 vdmr = *((__dmr1024 *)vdmrp); + __builtin_mma_dmxxshapad(&vdmr, vc, 2, 1, 5); + *((__dmr1024 *)resp) = vdmr; +} //. // CHECK: [[__VECTOR_PAIR_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} // CHECK: [[META3]] = !{!"__vector_pair", [[META4:![0-9]+]], i64 0} @@ -216,6 +285,8 @@ void test_dmf_basic2(char *p1, char *res1, char *res2, // CHECK: [[__DMR1024_TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0} // CHECK: [[META7]] = !{!"__dmr1024", [[META4]], i64 0} // CHECK: [[CHAR_TBAA8]] = !{[[META4]], [[META4]], i64 0} +// CHECK: [[__DMR2048_TBAA9]] = !{[[META10:![0-9]+]], [[META10]], i64 0} +// CHECK: [[META10]] = !{!"__dmr2048", [[META4]], i64 0} //. 
// AIX: [[__VECTOR_PAIR_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} // AIX: [[META3]] = !{!"__vector_pair", [[META4:![0-9]+]], i64 0} @@ -224,4 +295,6 @@ void test_dmf_basic2(char *p1, char *res1, char *res2, // AIX: [[__DMR1024_TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0} // AIX: [[META7]] = !{!"__dmr1024", [[META4]], i64 0} // AIX: [[CHAR_TBAA8]] = !{[[META4]], [[META4]], i64 0} +// AIX: [[__DMR2048_TBAA9]] = !{[[META10:![0-9]+]], [[META10]], i64 0} +// AIX: [[META10]] = !{!"__dmr2048", [[META4]], i64 0} //. diff --git a/clang/test/CodeGen/PowerPC/ppc-dmf-mma-builtin-err.c b/clang/test/CodeGen/PowerPC/ppc-dmf-mma-builtin-err.c index 0649755..66b9d79 100644 --- a/clang/test/CodeGen/PowerPC/ppc-dmf-mma-builtin-err.c +++ b/clang/test/CodeGen/PowerPC/ppc-dmf-mma-builtin-err.c @@ -9,7 +9,9 @@ // RUN: FileCheck --check-prefix=ISA_FUTURE %s //__attribute__((target("no-mma"))) -void test_mma(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc) { +__attribute__((target("no-mma"))) +void test_mma(unsigned char *vdmrpp, unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc) { + __dmr2048 vdmrpair = *((__dmr2048 *)vdmrpp); __dmr1024 vdmr = *((__dmr1024 *)vdmrp); __vector_pair vp = *((__vector_pair *)vpp); __builtin_mma_dmxvi8gerx4(&vdmr, vp, vc); @@ -23,6 +25,9 @@ void test_mma(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc) __builtin_mma_dmxor(&vdmr, (__dmr1024*)vpp); __builtin_mma_build_dmr(&vdmr, vc, vc, vc, vc, vc, vc, vc, vc); __builtin_mma_disassemble_dmr(vdmrp, &vdmr); + __builtin_mma_dmsha2hash(&vdmr, &vdmr, 0); + __builtin_mma_dmsha3hash(&vdmrpair, 0); + __builtin_mma_dmxxshapad(&vdmr, vc, 0, 0, 0); // CHECK: error: '__builtin_mma_dmxvi8gerx4' needs target feature mma,paired-vector-memops // CHECK: error: '__builtin_mma_pmdmxvi8gerx4' needs target feature mma,paired-vector-memops @@ -35,6 +40,9 @@ void test_mma(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc) // ISA_FUTURE: error: '__builtin_mma_dmxor' needs target feature mma,isa-future-instructions // ISA_FUTURE: error: '__builtin_mma_build_dmr' needs target feature mma,isa-future-instructions // ISA_FUTURE: error: '__builtin_mma_disassemble_dmr' needs target feature mma,isa-future-instructions +// CHECK: error: '__builtin_mma_dmsha2hash' needs target feature mma,isa-future-instructions +// CHECK: error: '__builtin_mma_dmsha3hash' needs target feature mma,isa-future-instructions +// CHECK: error: '__builtin_mma_dmxxshapad' needs target feature mma,isa-future-instructions // DMF VSX Vector bfloat16 GER 2x builtins. 
diff --git a/clang/test/CodeGen/PowerPC/ppc-dmf-types.c b/clang/test/CodeGen/PowerPC/ppc-dmf-types.c index 9dff289..fbbe621 100644 --- a/clang/test/CodeGen/PowerPC/ppc-dmf-types.c +++ b/clang/test/CodeGen/PowerPC/ppc-dmf-types.c @@ -2,6 +2,162 @@ // RUN: %clang_cc1 -triple powerpc64le-linux-unknown -target-cpu future \ // RUN: -emit-llvm -o - %s | FileCheck %s +// CHECK-LABEL: @test_dmrp_copy( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[PTR1_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[PTR2_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: store ptr [[PTR1:%.*]], ptr [[PTR1_ADDR]], align 8 +// CHECK-NEXT: store ptr [[PTR2:%.*]], ptr [[PTR2_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR1_ADDR]], align 8 +// CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds <2048 x i1>, ptr [[TMP0]], i64 2 +// CHECK-NEXT: [[TMP1:%.*]] = load <2048 x i1>, ptr [[ADD_PTR]], align 256 +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[PTR2_ADDR]], align 8 +// CHECK-NEXT: [[ADD_PTR1:%.*]] = getelementptr inbounds <2048 x i1>, ptr [[TMP2]], i64 1 +// CHECK-NEXT: store <2048 x i1> [[TMP1]], ptr [[ADD_PTR1]], align 256 +// CHECK-NEXT: ret void +// +void test_dmrp_copy(__dmr2048 *ptr1, __dmr2048 *ptr2) { + *(ptr2 + 1) = *(ptr1 + 2); +} + +// CHECK-LABEL: @test_dmrp_typedef( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[INP_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[OUTP_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[VDMRPIN:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[VDMRPOUT:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: store ptr [[INP:%.*]], ptr [[INP_ADDR]], align 8 +// CHECK-NEXT: store ptr [[OUTP:%.*]], ptr [[OUTP_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[INP_ADDR]], align 8 +// CHECK-NEXT: store ptr [[TMP0]], ptr [[VDMRPIN]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[OUTP_ADDR]], align 8 +// CHECK-NEXT: store ptr [[TMP1]], ptr [[VDMRPOUT]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VDMRPIN]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = load <2048 x i1>, ptr [[TMP2]], align 256 +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[VDMRPOUT]], align 8 +// CHECK-NEXT: store <2048 x i1> [[TMP3]], ptr [[TMP4]], align 256 +// CHECK-NEXT: ret void +// +void test_dmrp_typedef(int *inp, int *outp) { + __dmr2048 *vdmrpin = (__dmr2048 *)inp; + __dmr2048 *vdmrpout = (__dmr2048 *)outp; + *vdmrpout = *vdmrpin; +} + +// CHECK-LABEL: @test_dmrp_arg( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VDMRP_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[VDMRPP:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: store ptr [[VDMRP:%.*]], ptr [[VDMRP_ADDR]], align 8 +// CHECK-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 +// CHECK-NEXT: store ptr [[TMP0]], ptr [[VDMRPP]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[VDMRP_ADDR]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load <2048 x i1>, ptr [[TMP1]], align 256 +// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[VDMRPP]], align 8 +// CHECK-NEXT: store <2048 x i1> [[TMP2]], ptr [[TMP3]], align 256 +// CHECK-NEXT: ret void +// +void test_dmrp_arg(__dmr2048 *vdmrp, int *ptr) { + __dmr2048 *vdmrpp = (__dmr2048 *)ptr; + *vdmrpp = *vdmrp; +} + +// CHECK-LABEL: @test_dmrp_const_arg( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VDMRP_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[VDMRPP:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: store ptr [[VDMRP:%.*]], 
ptr [[VDMRP_ADDR]], align 8 +// CHECK-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 +// CHECK-NEXT: store ptr [[TMP0]], ptr [[VDMRPP]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[VDMRP_ADDR]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load <2048 x i1>, ptr [[TMP1]], align 256 +// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[VDMRPP]], align 8 +// CHECK-NEXT: store <2048 x i1> [[TMP2]], ptr [[TMP3]], align 256 +// CHECK-NEXT: ret void +// +void test_dmrp_const_arg(const __dmr2048 *const vdmrp, int *ptr) { + __dmr2048 *vdmrpp = (__dmr2048 *)ptr; + *vdmrpp = *vdmrp; +} + +// CHECK-LABEL: @test_dmrp_array_arg( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VDMRPA_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[VDMRPP:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: store ptr [[VDMRPA:%.*]], ptr [[VDMRPA_ADDR]], align 8 +// CHECK-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 +// CHECK-NEXT: store ptr [[TMP0]], ptr [[VDMRPP]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[VDMRPA_ADDR]], align 8 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds <2048 x i1>, ptr [[TMP1]], i64 0 +// CHECK-NEXT: [[TMP2:%.*]] = load <2048 x i1>, ptr [[ARRAYIDX]], align 256 +// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[VDMRPP]], align 8 +// CHECK-NEXT: store <2048 x i1> [[TMP2]], ptr [[TMP3]], align 256 +// CHECK-NEXT: ret void +// +void test_dmrp_array_arg(__dmr2048 vdmrpa[], int *ptr) { + __dmr2048 *vdmrpp = (__dmr2048 *)ptr; + *vdmrpp = vdmrpa[0]; +} + +// CHECK-LABEL: @test_dmrp_ret_const( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[VDMRPP:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 +// CHECK-NEXT: store ptr [[TMP0]], ptr [[VDMRPP]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[VDMRPP]], align 8 +// CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds <2048 x i1>, ptr [[TMP1]], i64 2 +// CHECK-NEXT: ret ptr [[ADD_PTR]] +// +const __dmr2048 *test_dmrp_ret_const(int *ptr) { + __dmr2048 *vdmrpp = (__dmr2048 *)ptr; + return vdmrpp + 2; +} + +// CHECK-LABEL: @test_dmrp_sizeof_alignof( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[VDMRPP:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[VDMRP:%.*]] = alloca <2048 x i1>, align 256 +// CHECK-NEXT: [[SIZET:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[ALIGNT:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[SIZEV:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[ALIGNV:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 +// CHECK-NEXT: store ptr [[TMP0]], ptr [[VDMRPP]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[VDMRPP]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load <2048 x i1>, ptr [[TMP1]], align 256 +// CHECK-NEXT: store <2048 x i1> [[TMP2]], ptr [[VDMRP]], align 256 +// CHECK-NEXT: store i32 256, ptr [[SIZET]], align 4 +// CHECK-NEXT: store i32 256, ptr [[ALIGNT]], align 4 +// CHECK-NEXT: store i32 256, ptr [[SIZEV]], align 4 +// CHECK-NEXT: store i32 256, ptr [[ALIGNV]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[SIZET]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ALIGNT]], align 4 +// 
CHECK-NEXT: [[ADD:%.*]] = add i32 [[TMP3]], [[TMP4]] +// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[SIZEV]], align 4 +// CHECK-NEXT: [[ADD1:%.*]] = add i32 [[ADD]], [[TMP5]] +// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[ALIGNV]], align 4 +// CHECK-NEXT: [[ADD2:%.*]] = add i32 [[ADD1]], [[TMP6]] +// CHECK-NEXT: ret i32 [[ADD2]] +// +int test_dmrp_sizeof_alignof(int *ptr) { + __dmr2048 *vdmrpp = (__dmr2048 *)ptr; + __dmr2048 vdmrp = *vdmrpp; + unsigned sizet = sizeof(__dmr2048); + unsigned alignt = __alignof__(__dmr2048); + unsigned sizev = sizeof(vdmrp); + unsigned alignv = __alignof__(vdmrp); + return sizet + alignt + sizev + alignv; +} // CHECK-LABEL: @test_dmr_copy( // CHECK-NEXT: entry: diff --git a/clang/test/CodeGen/X86/avxvnniint8-builtins.c b/clang/test/CodeGen/X86/avxvnniint8-builtins.c index dd4a448..021e658 100644 --- a/clang/test/CodeGen/X86/avxvnniint8-builtins.c +++ b/clang/test/CodeGen/X86/avxvnniint8-builtins.c @@ -10,73 +10,73 @@ #include <immintrin.h> // CHECK-LABEL: test_mm_dpbssd_epi32 -// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) +// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}) __m128i test_mm_dpbssd_epi32(__m128i __W, __m128i __A, __m128i __B) { return _mm_dpbssd_epi32(__W, __A, __B); } // CHECK-LABEL: test_mm_dpbssds_epi32 -// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) +// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}) __m128i test_mm_dpbssds_epi32(__m128i __W, __m128i __A, __m128i __B) { return _mm_dpbssds_epi32(__W, __A, __B); } // CHECK-LABEL: test_mm_dpbsud_epi32 -// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) +// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}) __m128i test_mm_dpbsud_epi32(__m128i __W, __m128i __A, __m128i __B) { return _mm_dpbsud_epi32(__W, __A, __B); } // CHECK-LABEL: test_mm_dpbsuds_epi32 -// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) +// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}) __m128i test_mm_dpbsuds_epi32(__m128i __W, __m128i __A, __m128i __B) { return _mm_dpbsuds_epi32(__W, __A, __B); } // CHECK-LABEL: test_mm_dpbuud_epi32 -// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) +// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}) __m128i test_mm_dpbuud_epi32(__m128i __W, __m128i __A, __m128i __B) { return _mm_dpbuud_epi32(__W, __A, __B); } // CHECK-LABEL: test_mm_dpbuuds_epi32 -// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) +// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}) __m128i test_mm_dpbuuds_epi32(__m128i __W, __m128i __A, __m128i __B) { return _mm_dpbuuds_epi32(__W, __A, __B); } // CHECK-LABEL: test_mm256_dpbssd_epi32 -// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) +// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}) __m256i test_mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i 
__B) { return _mm256_dpbssd_epi32(__W, __A, __B); } // CHECK-LABEL: test_mm256_dpbssds_epi32 -// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) +// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}) __m256i test_mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B) { return _mm256_dpbssds_epi32(__W, __A, __B); } // CHECK-LABEL: test_mm256_dpbsud_epi32 -// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) +// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}) __m256i test_mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B) { return _mm256_dpbsud_epi32(__W, __A, __B); } // CHECK-LABEL: test_mm256_dpbsuds_epi32 -// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) +// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}) __m256i test_mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B) { return _mm256_dpbsuds_epi32(__W, __A, __B); } // CHECK-LABEL: test_mm256_dpbuud_epi32 -// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) +// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}) __m256i test_mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B) { return _mm256_dpbuud_epi32(__W, __A, __B); } // CHECK-LABEL: test_mm256_dpbuuds_epi32 -// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) +// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}) __m256i test_mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B) { return _mm256_dpbuuds_epi32(__W, __A, __B); } diff --git a/clang/test/CodeGenCXX/ppc-mangle-mma-types.cpp b/clang/test/CodeGenCXX/ppc-mangle-mma-types.cpp index 1e213e7..6b792dc 100644 --- a/clang/test/CodeGenCXX/ppc-mangle-mma-types.cpp +++ b/clang/test/CodeGenCXX/ppc-mangle-mma-types.cpp @@ -7,6 +7,9 @@ // RUN: %clang_cc1 -triple powerpc64le-linux-unknown -target-cpu pwr8 %s \ // RUN: -emit-llvm -o - | FileCheck %s +// CHECK: _Z1fPu9__dmr2048 +void f(__dmr2048 *vdmrp) {} + // CHECK: _Z2f0Pu9__dmr1024 void f0(__dmr1024 *vdmr) {} diff --git a/clang/test/Interpreter/execute-pch.cpp b/clang/test/Interpreter/execute-pch.cpp new file mode 100644 index 0000000..8041ee6 --- /dev/null +++ b/clang/test/Interpreter/execute-pch.cpp @@ -0,0 +1,23 @@ +// REQUIRES: host-supports-jit +// UNSUPPORTED: system-aix +// +// RUN: rm -rf %t +// RUN: mkdir -p %t +// RUN: split-file %s %t +// +// RUN: %clang -fmax-type-align=16 -Xclang -fdeprecated-macro -fno-stack-protector -Xclang -fwrapv -Xclang -fblocks -Xclang -fskip-odr-check-in-gmf -fexceptions -fcxx-exceptions -fgnuc-version=0 -target %host-jit-triple -Xclang -fblocks -Xclang -fmax-type-align=8 -Xclang -fincremental-extensions -Xclang -emit-pch -x c++-header -o %t/include.pch %t/include.hpp +// +// RUN: cat %t/main.cpp \ +// RUN: | clang-repl -Xcc -fgnuc-version=0 -Xcc -fno-stack-protector -Xcc -fwrapv -Xcc -fblocks -Xcc -fskip-odr-check-in-gmf -Xcc -fmax-type-align=8 -Xcc -include-pch -Xcc %t/include.pch \ +// RUN: | FileCheck %s + +//--- include.hpp + +int f_pch() { return 5; } + +//--- main.cpp + +extern "C" int printf(const char *, ...); +printf("f_pch = %d\n", 
f_pch()); + +// CHECK: f_pch = 5 diff --git a/clang/test/Modules/cxx20-module-file-info-macros.cpp b/clang/test/Modules/cxx20-module-file-info-macros.cpp index 3b67e9b..431c967 100644 --- a/clang/test/Modules/cxx20-module-file-info-macros.cpp +++ b/clang/test/Modules/cxx20-module-file-info-macros.cpp @@ -36,28 +36,28 @@ #define REDEFINE // CHECK: Macro Definitions: -// CHECK-DAG: REDEFINE -// CHECK-DAG: FUNC_Macro -// CHECK-DAG: CONSTANT -// CHECK-DAG: FOO +// CHECK: CONSTANT +// CHECK: FOO +// CHECK: FUNC_Macro +// CHECK: REDEFINE // CHECK-NEXT: === //--- include_foo.h #include "foo.h" #undef REDEFINE // CHECK: Macro Definitions: -// CHECK-DAG: CONSTANT -// CHECK-DAG: FUNC_Macro -// CHECK-DAG: FOO +// CHECK: CONSTANT +// CHECK: FOO +// CHECK: FUNC_Macro // CHECK-NEXT: === //--- import_foo.h import "foo.h"; #undef REDEFINE // CHECK: Macro Definitions: -// CHECK-DAG: CONSTANT -// CHECK-DAG: FUNC_Macro -// CHECK-DAG: FOO +// CHECK: CONSTANT +// CHECK: FOO +// CHECK: FUNC_Macro // CHECK-NEXT: === //--- named_module.cppm diff --git a/clang/test/Preprocessor/riscv-target-features.c b/clang/test/Preprocessor/riscv-target-features.c index 4090f3d..71d8453 100644 --- a/clang/test/Preprocessor/riscv-target-features.c +++ b/clang/test/Preprocessor/riscv-target-features.c @@ -21,6 +21,8 @@ // CHECK-NOT: __riscv_mul {{.*$}} // CHECK-NOT: __riscv_muldiv {{.*$}} // CHECK-NOT: __riscv_q {{.*$}} +// CHECK-NOT: __riscv_sdext{{.*$}} +// CHECK-NOT: __riscv_sdtrig{{.*$}} // CHECK-NOT: __riscv_sha {{.*$}} // CHECK-NOT: __riscv_shcounterenw {{.*$}} // CHECK-NOT: __riscv_shgatpa {{.*$}} @@ -33,8 +35,11 @@ // CHECK-NOT: __riscv_smcdeleg {{.*$}} // CHECK-NOT: __riscv_smcntrpmf {{.*$}} // CHECK-NOT: __riscv_smcsrind {{.*$}} +// CHECK-NOT: __riscv_smctr{{.*$}} // CHECK-NOT: __riscv_smdbltrp {{.*$}} // CHECK-NOT: __riscv_smepmp {{.*$}} +// CHECK-NOT: __riscv_smmpm{{.*$}} +// CHECK-NOT: __riscv_smnpm{{.*$}} // CHECK-NOT: __riscv_smrnmi {{.*$}} // CHECK-NOT: __riscv_smstateen {{.*$}} // CHECK-NOT: __riscv_ssaia {{.*$}} @@ -43,7 +48,10 @@ // CHECK-NOT: __riscv_sscofpmf {{.*$}} // CHECK-NOT: __riscv_sscounterenw {{.*$}} // CHECK-NOT: __riscv_sscsrind {{.*$}} +// CHECK-NOT: __riscv_ssctr{{.*$}} // CHECK-NOT: __riscv_ssdbltrp {{.*$}} +// CHECK-NOT: __riscv_ssnpm{{.*$}} +// CHECK-NOT: __riscv_sspm{{.*$}} // CHECK-NOT: __riscv_ssqosid{{.*$}} // CHECK-NOT: __riscv_ssstateen {{.*$}} // CHECK-NOT: __riscv_ssstrict {{.*$}} @@ -51,6 +59,7 @@ // CHECK-NOT: __riscv_sstvala {{.*$}} // CHECK-NOT: __riscv_sstvecd {{.*$}} // CHECK-NOT: __riscv_ssu64xl {{.*$}} +// CHECK-NOT: __riscv_supm{{.*$}} // CHECK-NOT: __riscv_svade {{.*$}} // CHECK-NOT: __riscv_svadu {{.*$}} // CHECK-NOT: __riscv_svbare {{.*$}} @@ -91,6 +100,7 @@ // CHECK-NOT: __riscv_zcmt {{.*$}} // CHECK-NOT: __riscv_zdinx {{.*$}} // CHECK-NOT: __riscv_zfa {{.*$}} +// CHECK-NOT: __riscv_zfbfmin {{.*$}} // CHECK-NOT: __riscv_zfh {{.*$}} // CHECK-NOT: __riscv_zfhmin {{.*$}} // CHECK-NOT: __riscv_zfinx {{.*$}} @@ -126,6 +136,7 @@ // CHECK-NOT: __riscv_zksh {{.*$}} // CHECK-NOT: __riscv_zkt {{.*$}} // CHECK-NOT: __riscv_zmmul {{.*$}} +// CHECK-NOT: __riscv_ztso {{.*$}} // CHECK-NOT: __riscv_zvbb {{.*$}} // CHECK-NOT: __riscv_zvbc {{.*$}} // CHECK-NOT: __riscv_zve32f {{.*$}} @@ -133,6 +144,8 @@ // CHECK-NOT: __riscv_zve64d {{.*$}} // CHECK-NOT: __riscv_zve64f {{.*$}} // CHECK-NOT: __riscv_zve64x {{.*$}} +// CHECK-NOT: __riscv_zvfbfmin {{.*$}} +// CHECK-NOT: __riscv_zvfbfwma {{.*$}} // CHECK-NOT: __riscv_zvfh {{.*$}} // CHECK-NOT: __riscv_zvkb {{.*$}} // CHECK-NOT: __riscv_zvkg 
{{.*$}} @@ -163,25 +176,12 @@ // Experimental extensions -// CHECK-NOT: __riscv_sdext{{.*$}} -// CHECK-NOT: __riscv_sdtrig{{.*$}} -// CHECK-NOT: __riscv_smctr{{.*$}} -// CHECK-NOT: __riscv_smmpm{{.*$}} -// CHECK-NOT: __riscv_smnpm{{.*$}} -// CHECK-NOT: __riscv_ssctr{{.*$}} -// CHECK-NOT: __riscv_ssnpm{{.*$}} -// CHECK-NOT: __riscv_sspm{{.*$}} -// CHECK-NOT: __riscv_supm{{.*$}} // CHECK-NOT: __riscv_zalasr {{.*$}} -// CHECK-NOT: __riscv_zfbfmin {{.*$}} // CHECK-NOT: __riscv_zicfilp {{.*$}} // CHECK-NOT: __riscv_zicfiss {{.*$}} -// CHECK-NOT: __riscv_ztso {{.*$}} // CHECK-NOT: __riscv_zvbc32e {{.*$}} // CHECK-NOT: __riscv_zvfbfa {{.*$}} // CHECK-NOT: __riscv_zvfofp8min {{.*$}} -// CHECK-NOT: __riscv_zvfbfmin {{.*$}} -// CHECK-NOT: __riscv_zvfbfwma {{.*$}} // CHECK-NOT: __riscv_zvkgs {{.*$}} // CHECK-NOT: __riscv_zvqdotq {{.*$}} diff --git a/clang/test/Sema/ppc-dmf-types.c b/clang/test/Sema/ppc-dmf-types.c index b3da72d..88926ac 100644 --- a/clang/test/Sema/ppc-dmf-types.c +++ b/clang/test/Sema/ppc-dmf-types.c @@ -12,47 +12,86 @@ // typedef typedef __dmr1024 dmr_t; +typedef __dmr2048 dmrp_t; // function argument -void testDmrArg1(__dmr1024 vdmr, int *ptr) { // expected-error {{invalid use of PPC MMA type}} - __dmr1024 *vdmrp = (__dmr1024 *)ptr; +void testDmrArg1(dmr_t vdmr, int *ptr) { // expected-error {{invalid use of PPC MMA type}} + dmr_t *vdmrp = (dmr_t *)ptr; *vdmrp = vdmr; } -void testDmrArg2(const __dmr1024 vdmr, int *ptr) { // expected-error {{invalid use of PPC MMA type}} - __dmr1024 *vdmrp = (__dmr1024 *)ptr; +void testDmrArg2(const dmr_t vdmr, int *ptr) { // expected-error {{invalid use of PPC MMA type}} + dmr_t *vdmrp = (dmr_t *)ptr; *vdmrp = vdmr; } void testDmrArg3(const dmr_t vdmr, int *ptr) { // expected-error {{invalid use of PPC MMA type}} - __dmr1024 *vdmrp = (__dmr1024 *)ptr; + dmr_t *vdmrp = (dmr_t *)ptr; *vdmrp = vdmr; } +void testDmrPArg1(const dmrp_t vdmrp, int *ptr) { // expected-error {{invalid use of PPC MMA type}} + dmrp_t *vdmrpp = (dmrp_t *)ptr; + *vdmrpp = vdmrp; +} + +void testDmrPArg2(const dmrp_t vdmrp, int *ptr) { // expected-error {{invalid use of PPC MMA type}} + dmrp_t *vdmrpp = (dmrp_t *)ptr; + *vdmrpp = vdmrp; +} + +void testDmrPArg3(const dmrp_t vdmrp, int *ptr) { // expected-error {{invalid use of PPC MMA type}} + dmrp_t *vdmrpp = (dmrp_t *)ptr; + *vdmrpp = vdmrp; +} + // function return -__dmr1024 testDmrRet1(int *ptr) { // expected-error {{invalid use of PPC MMA type}} - __dmr1024 *vdmrp = (__dmr1024 *)ptr; +dmr_t testDmrRet1(int *ptr) { // expected-error {{invalid use of PPC MMA type}} + dmr_t *vdmrp = (dmr_t *)ptr; return *vdmrp; // expected-error {{invalid use of PPC MMA type}} } const dmr_t testDmrRet4(int *ptr) { // expected-error {{invalid use of PPC MMA type}} - __dmr1024 *vdmrp = (__dmr1024 *)ptr; + dmr_t *vdmrp = (dmr_t *)ptr; return *vdmrp; // expected-error {{invalid use of PPC MMA type}} } +dmrp_t testDmrPRet1(int *ptr) { // expected-error {{invalid use of PPC MMA type}} + dmrp_t *vdmrpp = (dmrp_t *)ptr; + return *vdmrpp; // expected-error {{invalid use of PPC MMA type}} +} + +const dmrp_t testDmrPRet4(int *ptr) { // expected-error {{invalid use of PPC MMA type}} + dmrp_t *vdmrpp = (dmrp_t *)ptr; + return *vdmrpp; // expected-error {{invalid use of PPC MMA type}} +} + // global -__dmr1024 globalvdmr; // expected-error {{invalid use of PPC MMA type}} -const __dmr1024 globalvdmr2; // expected-error {{invalid use of PPC MMA type}} -__dmr1024 *globalvdmrp; -const __dmr1024 *const globalvdmrp2; +dmr_t globalvdmr; // expected-error {{invalid 
use of PPC MMA type}} +const dmr_t globalvdmr2; // expected-error {{invalid use of PPC MMA type}} +dmr_t *globalvdmrp; +const dmr_t *const globalvdmrp2; dmr_t globalvdmr_t; // expected-error {{invalid use of PPC MMA type}} +dmrp_t globalvdmrp; // expected-error {{invalid use of PPC MMA type}} +const dmrp_t globalvdmrp2; // expected-error {{invalid use of PPC MMA type}} +dmrp_t *globalvdmrpp; +const dmrp_t *const globalvdmrpp2; +dmrp_t globalvdmrp_t; // expected-error {{invalid use of PPC MMA type}} + // struct field struct TestDmrStruct { int a; float b; - __dmr1024 c; // expected-error {{invalid use of PPC MMA type}} - __dmr1024 *vq; + dmr_t c; // expected-error {{invalid use of PPC MMA type}} + dmr_t *vq; +}; + +struct TestDmrPStruct { + int a; + float b; + dmrp_t c; // expected-error {{invalid use of PPC MMA type}} + dmrp_t *vq; }; // operators @@ -101,3 +140,50 @@ void testDmrOperators4(int v, void *ptr) { __dmr1024 vdmr1 = (__dmr1024)v; // expected-error {{used type '__dmr1024' where arithmetic or pointer type is required}} __dmr1024 vdmr2 = (__dmr1024)vdmrp; // expected-error {{used type '__dmr1024' where arithmetic or pointer type is required}} } + +int testDmrPOperators1(int *ptr) { + __dmr2048 *vdmrpp = (__dmr2048 *)ptr; + __dmr2048 vdmrp1 = *(vdmrpp + 0); + __dmr2048 vdmrp2 = *(vdmrpp + 1); + __dmr2048 vdmrp3 = *(vdmrpp + 2); + if (vdmrp1) // expected-error {{statement requires expression of scalar type ('__dmr2048' invalid)}} + *(vdmrpp + 10) = vdmrp1; + if (!vdmrp2) // expected-error {{invalid argument type '__dmr2048' to unary expression}} + *(vdmrpp + 11) = vdmrp3; + int c1 = vdmrp1 && vdmrp2; // expected-error {{invalid operands to binary expression ('__dmr2048' and '__dmr2048')}} + int c2 = vdmrp2 == vdmrp3; // expected-error {{invalid operands to binary expression ('__dmr2048' and '__dmr2048')}} + int c3 = vdmrp2 < vdmrp1; // expected-error {{invalid operands to binary expression ('__dmr2048' and '__dmr2048')}} + return c1 || c2 || c3; +} + +void testDmrPOperators2(int *ptr) { + __dmr2048 *vdmrpp = (__dmr2048 *)ptr; + __dmr2048 vdmrp1 = *(vdmrpp + 0); + __dmr2048 vdmrp2 = *(vdmrpp + 1); + __dmr2048 vdmrp3 = *(vdmrpp + 2); + vdmrp1 = -vdmrp1; // expected-error {{invalid argument type '__dmr2048' to unary expression}} + vdmrp2 = vdmrp1 + vdmrp3; // expected-error {{invalid operands to binary expression ('__dmr2048' and '__dmr2048')}} + vdmrp2 = vdmrp2 * vdmrp3; // expected-error {{invalid operands to binary expression ('__dmr2048' and '__dmr2048')}} + vdmrp3 = vdmrp3 | vdmrp3; // expected-error {{invalid operands to binary expression ('__dmr2048' and '__dmr2048')}} + vdmrp3 = vdmrp3 << 2; // expected-error {{invalid operands to binary expression ('__dmr2048' and 'int')}} + *(vdmrpp + 10) = vdmrp1; + *(vdmrpp + 11) = vdmrp2; + *(vdmrpp + 12) = vdmrp3; +} + + +vector unsigned char testDmrPOperators3(int *ptr) { + __dmr2048 *vdmrpp = (__dmr2048 *)ptr; + __dmr2048 vdmrp1 = *(vdmrpp + 0); + __dmr2048 vdmrp2 = *(vdmrpp + 1); + __dmr2048 vdmrp3 = *(vdmrpp + 2); + vdmrp1 ? 
*(vdmrpp + 10) = vdmrp2 : *(vdmrpp + 11) = vdmrp3; // expected-error {{used type '__dmr2048' where arithmetic or pointer type is required}} + vdmrp2 = vdmrp3; + return vdmrp2[1]; // expected-error {{subscripted value is not an array, pointer, or vector}} +} + +void testDmrPOperators4(int v, void *ptr) { + __dmr2048 *vdmrpp = (__dmr2048 *)ptr; + __dmr2048 vdmrp1 = (__dmr2048)v; // expected-error {{used type '__dmr2048' where arithmetic or pointer type is required}} + __dmr2048 vdmrp2 = (__dmr2048)vdmrpp; // expected-error {{used type '__dmr2048' where arithmetic or pointer type is required}} +} diff --git a/clang/test/SemaCUDA/vararg.cu b/clang/test/SemaCUDA/vararg.cu index 34ef367..0238f42 100644 --- a/clang/test/SemaCUDA/vararg.cu +++ b/clang/test/SemaCUDA/vararg.cu @@ -10,7 +10,7 @@ #include <stdarg.h> #include "Inputs/cuda.h" -__device__ void foo() { +__global__ void foo() { va_list list; va_arg(list, int); #ifdef EXPECT_VA_ARG_ERR diff --git a/clang/test/SemaCXX/bitfield-layout.cpp b/clang/test/SemaCXX/bitfield-layout.cpp index 7efd1d3..f30218b 100644 --- a/clang/test/SemaCXX/bitfield-layout.cpp +++ b/clang/test/SemaCXX/bitfield-layout.cpp @@ -35,7 +35,7 @@ CHECK_SIZE(Test4, 8); CHECK_ALIGN(Test4, 8); struct Test5 { - char c : 0x100000001; // expected-warning {{width of bit-field 'c' (4294967297 bits) exceeds the width of its type; value will be truncated to 8 bits}} + char c : 0x100000001; // expected-warning {{width of bit-field 'c' (4'294'967'297 bits) exceeds the width of its type; value will be truncated to 8 bits}} }; // Size and align don't really matter here, just make sure we don't crash. CHECK_SIZE(Test5, 1); diff --git a/clang/test/SemaCXX/type-trait-synthesises-from-spaceship.cpp b/clang/test/SemaCXX/type-trait-synthesizes-from-spaceship.cpp index ba58147..be312f4 100644 --- a/clang/test/SemaCXX/type-trait-synthesises-from-spaceship.cpp +++ b/clang/test/SemaCXX/type-trait-synthesizes-from-spaceship.cpp @@ -1,24 +1,24 @@ // RUN: %clang_cc1 -fsyntax-only -verify -std=c++20 %s -static_assert(!__builtin_lt_synthesises_from_spaceship()); // expected-error {{expected a type}} -static_assert(!__builtin_lt_synthesises_from_spaceship(int)); // expected-error {{type trait requires 2 arguments; have 1 argument}} -static_assert(!__builtin_lt_synthesises_from_spaceship(int, int, int)); // expected-error {{type trait requires 2 arguments; have 3 argument}} -static_assert(!__builtin_lt_synthesises_from_spaceship(int, 0)); // expected-error {{expected a type}} - -static_assert(!__builtin_le_synthesises_from_spaceship()); // expected-error {{expected a type}} -static_assert(!__builtin_le_synthesises_from_spaceship(int)); // expected-error {{type trait requires 2 arguments; have 1 argument}} -static_assert(!__builtin_le_synthesises_from_spaceship(int, int, int)); // expected-error {{type trait requires 2 arguments; have 3 argument}} -static_assert(!__builtin_le_synthesises_from_spaceship(int, 0)); // expected-error {{expected a type}} - -static_assert(!__builtin_gt_synthesises_from_spaceship()); // expected-error {{expected a type}} -static_assert(!__builtin_gt_synthesises_from_spaceship(int)); // expected-error {{type trait requires 2 arguments; have 1 argument}} -static_assert(!__builtin_gt_synthesises_from_spaceship(int, int, int)); // expected-error {{type trait requires 2 arguments; have 3 argument}} -static_assert(!__builtin_gt_synthesises_from_spaceship(int, 0)); // expected-error {{expected a type}} - -static_assert(!__builtin_ge_synthesises_from_spaceship()); // expected-error 
{{expected a type}} -static_assert(!__builtin_ge_synthesises_from_spaceship(int)); // expected-error {{type trait requires 2 arguments; have 1 argument}} -static_assert(!__builtin_ge_synthesises_from_spaceship(int, int, int)); // expected-error {{type trait requires 2 arguments; have 3 argument}} -static_assert(!__builtin_ge_synthesises_from_spaceship(int, 0)); // expected-error {{expected a type}} +static_assert(!__builtin_lt_synthesizes_from_spaceship()); // expected-error {{expected a type}} +static_assert(!__builtin_lt_synthesizes_from_spaceship(int)); // expected-error {{type trait requires 2 arguments; have 1 argument}} +static_assert(!__builtin_lt_synthesizes_from_spaceship(int, int, int)); // expected-error {{type trait requires 2 arguments; have 3 argument}} +static_assert(!__builtin_lt_synthesizes_from_spaceship(int, 0)); // expected-error {{expected a type}} + +static_assert(!__builtin_le_synthesizes_from_spaceship()); // expected-error {{expected a type}} +static_assert(!__builtin_le_synthesizes_from_spaceship(int)); // expected-error {{type trait requires 2 arguments; have 1 argument}} +static_assert(!__builtin_le_synthesizes_from_spaceship(int, int, int)); // expected-error {{type trait requires 2 arguments; have 3 argument}} +static_assert(!__builtin_le_synthesizes_from_spaceship(int, 0)); // expected-error {{expected a type}} + +static_assert(!__builtin_gt_synthesizes_from_spaceship()); // expected-error {{expected a type}} +static_assert(!__builtin_gt_synthesizes_from_spaceship(int)); // expected-error {{type trait requires 2 arguments; have 1 argument}} +static_assert(!__builtin_gt_synthesizes_from_spaceship(int, int, int)); // expected-error {{type trait requires 2 arguments; have 3 argument}} +static_assert(!__builtin_gt_synthesizes_from_spaceship(int, 0)); // expected-error {{expected a type}} + +static_assert(!__builtin_ge_synthesizes_from_spaceship()); // expected-error {{expected a type}} +static_assert(!__builtin_ge_synthesizes_from_spaceship(int)); // expected-error {{type trait requires 2 arguments; have 1 argument}} +static_assert(!__builtin_ge_synthesizes_from_spaceship(int, int, int)); // expected-error {{type trait requires 2 arguments; have 3 argument}} +static_assert(!__builtin_ge_synthesizes_from_spaceship(int, 0)); // expected-error {{expected a type}} namespace std { struct strong_ordering { @@ -35,10 +35,10 @@ struct DefaultSpaceship { friend auto operator<=>(DefaultSpaceship, DefaultSpaceship) = default; }; -static_assert(__builtin_lt_synthesises_from_spaceship(const DefaultSpaceship&, const DefaultSpaceship&)); -static_assert(__builtin_le_synthesises_from_spaceship(const DefaultSpaceship&, const DefaultSpaceship&)); -static_assert(__builtin_gt_synthesises_from_spaceship(const DefaultSpaceship&, const DefaultSpaceship&)); -static_assert(__builtin_ge_synthesises_from_spaceship(const DefaultSpaceship&, const DefaultSpaceship&)); +static_assert(__builtin_lt_synthesizes_from_spaceship(const DefaultSpaceship&, const DefaultSpaceship&)); +static_assert(__builtin_le_synthesizes_from_spaceship(const DefaultSpaceship&, const DefaultSpaceship&)); +static_assert(__builtin_gt_synthesizes_from_spaceship(const DefaultSpaceship&, const DefaultSpaceship&)); +static_assert(__builtin_ge_synthesizes_from_spaceship(const DefaultSpaceship&, const DefaultSpaceship&)); struct CustomSpaceship { int i; @@ -48,10 +48,10 @@ struct CustomSpaceship { } }; -static_assert(__builtin_lt_synthesises_from_spaceship(const CustomSpaceship&, const CustomSpaceship&)); 
-static_assert(__builtin_le_synthesises_from_spaceship(const CustomSpaceship&, const CustomSpaceship&)); -static_assert(__builtin_gt_synthesises_from_spaceship(const CustomSpaceship&, const CustomSpaceship&)); -static_assert(__builtin_ge_synthesises_from_spaceship(const CustomSpaceship&, const CustomSpaceship&)); +static_assert(__builtin_lt_synthesizes_from_spaceship(const CustomSpaceship&, const CustomSpaceship&)); +static_assert(__builtin_le_synthesizes_from_spaceship(const CustomSpaceship&, const CustomSpaceship&)); +static_assert(__builtin_gt_synthesizes_from_spaceship(const CustomSpaceship&, const CustomSpaceship&)); +static_assert(__builtin_ge_synthesizes_from_spaceship(const CustomSpaceship&, const CustomSpaceship&)); struct CustomLT { int i; @@ -61,10 +61,10 @@ struct CustomLT { } }; -static_assert(!__builtin_lt_synthesises_from_spaceship(const CustomLT&, const CustomLT&)); -static_assert(!__builtin_le_synthesises_from_spaceship(const CustomLT&, const CustomLT&)); -static_assert(!__builtin_gt_synthesises_from_spaceship(const CustomLT&, const CustomLT&)); -static_assert(!__builtin_ge_synthesises_from_spaceship(const CustomLT&, const CustomLT&)); +static_assert(!__builtin_lt_synthesizes_from_spaceship(const CustomLT&, const CustomLT&)); +static_assert(!__builtin_le_synthesizes_from_spaceship(const CustomLT&, const CustomLT&)); +static_assert(!__builtin_gt_synthesizes_from_spaceship(const CustomLT&, const CustomLT&)); +static_assert(!__builtin_ge_synthesizes_from_spaceship(const CustomLT&, const CustomLT&)); struct CustomLE { int i; @@ -74,10 +74,10 @@ struct CustomLE { } }; -static_assert(!__builtin_lt_synthesises_from_spaceship(const CustomLE&, const CustomLE&)); -static_assert(!__builtin_le_synthesises_from_spaceship(const CustomLE&, const CustomLE&)); -static_assert(!__builtin_gt_synthesises_from_spaceship(const CustomLE&, const CustomLE&)); -static_assert(!__builtin_ge_synthesises_from_spaceship(const CustomLE&, const CustomLE&)); +static_assert(!__builtin_lt_synthesizes_from_spaceship(const CustomLE&, const CustomLE&)); +static_assert(!__builtin_le_synthesizes_from_spaceship(const CustomLE&, const CustomLE&)); +static_assert(!__builtin_gt_synthesizes_from_spaceship(const CustomLE&, const CustomLE&)); +static_assert(!__builtin_ge_synthesizes_from_spaceship(const CustomLE&, const CustomLE&)); struct CustomGT { int i; @@ -87,10 +87,10 @@ struct CustomGT { } }; -static_assert(!__builtin_lt_synthesises_from_spaceship(const CustomGT&, const CustomGT&)); -static_assert(!__builtin_le_synthesises_from_spaceship(const CustomGT&, const CustomGT&)); -static_assert(!__builtin_gt_synthesises_from_spaceship(const CustomGT&, const CustomGT&)); -static_assert(!__builtin_ge_synthesises_from_spaceship(const CustomGT&, const CustomGT&)); +static_assert(!__builtin_lt_synthesizes_from_spaceship(const CustomGT&, const CustomGT&)); +static_assert(!__builtin_le_synthesizes_from_spaceship(const CustomGT&, const CustomGT&)); +static_assert(!__builtin_gt_synthesizes_from_spaceship(const CustomGT&, const CustomGT&)); +static_assert(!__builtin_ge_synthesizes_from_spaceship(const CustomGT&, const CustomGT&)); struct CustomGE { int i; @@ -100,10 +100,10 @@ struct CustomGE { } }; -static_assert(!__builtin_lt_synthesises_from_spaceship(const CustomGE&, const CustomGE&)); -static_assert(!__builtin_le_synthesises_from_spaceship(const CustomGE&, const CustomGE&)); -static_assert(!__builtin_gt_synthesises_from_spaceship(const CustomGE&, const CustomGE&)); -static_assert(!__builtin_ge_synthesises_from_spaceship(const 
CustomGE&, const CustomGE&)); +static_assert(!__builtin_lt_synthesizes_from_spaceship(const CustomGE&, const CustomGE&)); +static_assert(!__builtin_le_synthesizes_from_spaceship(const CustomGE&, const CustomGE&)); +static_assert(!__builtin_gt_synthesizes_from_spaceship(const CustomGE&, const CustomGE&)); +static_assert(!__builtin_ge_synthesizes_from_spaceship(const CustomGE&, const CustomGE&)); struct CustomLTAndSpaceship { int i; @@ -117,10 +117,10 @@ struct CustomLTAndSpaceship { } }; -static_assert(!__builtin_lt_synthesises_from_spaceship(const CustomLTAndSpaceship&, const CustomLTAndSpaceship&)); -static_assert(__builtin_le_synthesises_from_spaceship(const CustomLTAndSpaceship&, const CustomLTAndSpaceship&)); -static_assert(__builtin_gt_synthesises_from_spaceship(const CustomLTAndSpaceship&, const CustomLTAndSpaceship&)); -static_assert(__builtin_ge_synthesises_from_spaceship(const CustomLTAndSpaceship&, const CustomLTAndSpaceship&)); +static_assert(!__builtin_lt_synthesizes_from_spaceship(const CustomLTAndSpaceship&, const CustomLTAndSpaceship&)); +static_assert(__builtin_le_synthesizes_from_spaceship(const CustomLTAndSpaceship&, const CustomLTAndSpaceship&)); +static_assert(__builtin_gt_synthesizes_from_spaceship(const CustomLTAndSpaceship&, const CustomLTAndSpaceship&)); +static_assert(__builtin_ge_synthesizes_from_spaceship(const CustomLTAndSpaceship&, const CustomLTAndSpaceship&)); struct CustomLEAndSpaceship { int i; @@ -134,10 +134,10 @@ struct CustomLEAndSpaceship { } }; -static_assert(__builtin_lt_synthesises_from_spaceship(const CustomLEAndSpaceship&, const CustomLEAndSpaceship&)); -static_assert(!__builtin_le_synthesises_from_spaceship(const CustomLEAndSpaceship&, const CustomLEAndSpaceship&)); -static_assert(__builtin_gt_synthesises_from_spaceship(const CustomLEAndSpaceship&, const CustomLEAndSpaceship&)); -static_assert(__builtin_ge_synthesises_from_spaceship(const CustomLEAndSpaceship&, const CustomLEAndSpaceship&)); +static_assert(__builtin_lt_synthesizes_from_spaceship(const CustomLEAndSpaceship&, const CustomLEAndSpaceship&)); +static_assert(!__builtin_le_synthesizes_from_spaceship(const CustomLEAndSpaceship&, const CustomLEAndSpaceship&)); +static_assert(__builtin_gt_synthesizes_from_spaceship(const CustomLEAndSpaceship&, const CustomLEAndSpaceship&)); +static_assert(__builtin_ge_synthesizes_from_spaceship(const CustomLEAndSpaceship&, const CustomLEAndSpaceship&)); struct CustomGTAndSpaceship { int i; @@ -151,10 +151,10 @@ struct CustomGTAndSpaceship { } }; -static_assert(__builtin_lt_synthesises_from_spaceship(const CustomGTAndSpaceship&, const CustomGTAndSpaceship&)); -static_assert(__builtin_le_synthesises_from_spaceship(const CustomGTAndSpaceship&, const CustomGTAndSpaceship&)); -static_assert(!__builtin_gt_synthesises_from_spaceship(const CustomGTAndSpaceship&, const CustomGTAndSpaceship&)); -static_assert(__builtin_ge_synthesises_from_spaceship(const CustomGTAndSpaceship&, const CustomGTAndSpaceship&)); +static_assert(__builtin_lt_synthesizes_from_spaceship(const CustomGTAndSpaceship&, const CustomGTAndSpaceship&)); +static_assert(__builtin_le_synthesizes_from_spaceship(const CustomGTAndSpaceship&, const CustomGTAndSpaceship&)); +static_assert(!__builtin_gt_synthesizes_from_spaceship(const CustomGTAndSpaceship&, const CustomGTAndSpaceship&)); +static_assert(__builtin_ge_synthesizes_from_spaceship(const CustomGTAndSpaceship&, const CustomGTAndSpaceship&)); struct CustomGEAndSpaceship { int i; @@ -168,10 +168,10 @@ struct CustomGEAndSpaceship { } }; 
-static_assert(__builtin_lt_synthesises_from_spaceship(const CustomGEAndSpaceship&, const CustomGEAndSpaceship&)); -static_assert(__builtin_le_synthesises_from_spaceship(const CustomGEAndSpaceship&, const CustomGEAndSpaceship&)); -static_assert(__builtin_gt_synthesises_from_spaceship(const CustomGEAndSpaceship&, const CustomGEAndSpaceship&)); -static_assert(!__builtin_ge_synthesises_from_spaceship(const CustomGEAndSpaceship&, const CustomGEAndSpaceship&)); +static_assert(__builtin_lt_synthesizes_from_spaceship(const CustomGEAndSpaceship&, const CustomGEAndSpaceship&)); +static_assert(__builtin_le_synthesizes_from_spaceship(const CustomGEAndSpaceship&, const CustomGEAndSpaceship&)); +static_assert(__builtin_gt_synthesizes_from_spaceship(const CustomGEAndSpaceship&, const CustomGEAndSpaceship&)); +static_assert(!__builtin_ge_synthesizes_from_spaceship(const CustomGEAndSpaceship&, const CustomGEAndSpaceship&)); struct DefaultedCmpAndSpaceship { int i; @@ -187,10 +187,10 @@ struct DefaultedCmpAndSpaceship { }; // TODO: This should probably return true -static_assert(!__builtin_lt_synthesises_from_spaceship(const DefaultedCmpAndSpaceship&, const DefaultedCmpAndSpaceship&)); -static_assert(!__builtin_le_synthesises_from_spaceship(const DefaultedCmpAndSpaceship&, const DefaultedCmpAndSpaceship&)); -static_assert(!__builtin_gt_synthesises_from_spaceship(const DefaultedCmpAndSpaceship&, const DefaultedCmpAndSpaceship&)); -static_assert(!__builtin_ge_synthesises_from_spaceship(const DefaultedCmpAndSpaceship&, const DefaultedCmpAndSpaceship&)); +static_assert(!__builtin_lt_synthesizes_from_spaceship(const DefaultedCmpAndSpaceship&, const DefaultedCmpAndSpaceship&)); +static_assert(!__builtin_le_synthesizes_from_spaceship(const DefaultedCmpAndSpaceship&, const DefaultedCmpAndSpaceship&)); +static_assert(!__builtin_gt_synthesizes_from_spaceship(const DefaultedCmpAndSpaceship&, const DefaultedCmpAndSpaceship&)); +static_assert(!__builtin_ge_synthesizes_from_spaceship(const DefaultedCmpAndSpaceship&, const DefaultedCmpAndSpaceship&)); struct DifferentTypes { int i; @@ -200,13 +200,13 @@ struct DifferentTypes { } }; -static_assert(__builtin_lt_synthesises_from_spaceship(const DifferentTypes&, const int&)); -static_assert(__builtin_le_synthesises_from_spaceship(const DifferentTypes&, const int&)); -static_assert(__builtin_gt_synthesises_from_spaceship(const DifferentTypes&, const int&)); -static_assert(__builtin_ge_synthesises_from_spaceship(const DifferentTypes&, const int&)); +static_assert(__builtin_lt_synthesizes_from_spaceship(const DifferentTypes&, const int&)); +static_assert(__builtin_le_synthesizes_from_spaceship(const DifferentTypes&, const int&)); +static_assert(__builtin_gt_synthesizes_from_spaceship(const DifferentTypes&, const int&)); +static_assert(__builtin_ge_synthesizes_from_spaceship(const DifferentTypes&, const int&)); // TODO: Should this return true? 
It's technically not synthesized from spaceship, but it behaves exactly as-if it was -static_assert(!__builtin_lt_synthesises_from_spaceship(int, int)); -static_assert(!__builtin_le_synthesises_from_spaceship(int, int)); -static_assert(!__builtin_gt_synthesises_from_spaceship(int, int)); -static_assert(!__builtin_ge_synthesises_from_spaceship(int, int)); +static_assert(!__builtin_lt_synthesizes_from_spaceship(int, int)); +static_assert(!__builtin_le_synthesizes_from_spaceship(int, int)); +static_assert(!__builtin_gt_synthesizes_from_spaceship(int, int)); +static_assert(!__builtin_ge_synthesizes_from_spaceship(int, int)); diff --git a/clang/test/lit.cfg.py b/clang/test/lit.cfg.py index 4a5d9e5..64e2bba 100644 --- a/clang/test/lit.cfg.py +++ b/clang/test/lit.cfg.py @@ -140,7 +140,8 @@ def have_host_out_of_process_jit_feature_support(): return False -def have_host_jit_feature_support(feature_name): + +def run_clang_repl(args): clang_repl_exe = lit.util.which("clang-repl", config.clang_tools_dir) if not clang_repl_exe: @@ -148,7 +149,7 @@ def have_host_jit_feature_support(feature_name): try: clang_repl_cmd = subprocess.Popen( - [clang_repl_exe, "--host-supports-" + feature_name], stdout=subprocess.PIPE + [clang_repl_exe, args], stdout=subprocess.PIPE ) except OSError: print("could not exec clang-repl") @@ -157,7 +158,11 @@ def have_host_jit_feature_support(feature_name): clang_repl_out = clang_repl_cmd.stdout.read().decode("ascii") clang_repl_cmd.wait() - return "true" in clang_repl_out + return clang_repl_out + + +def have_host_jit_feature_support(feature_name): + return "true" in run_clang_repl("--host-supports-" + feature_name) def have_host_clang_repl_cuda(): clang_repl_exe = lit.util.which('clang-repl', config.clang_tools_dir) @@ -191,6 +196,8 @@ if have_host_jit_feature_support('jit'): if have_host_clang_repl_cuda(): config.available_features.add('host-supports-cuda') + hosttriple = run_clang_repl("--host-jit-triple") + config.substitutions.append(("%host-jit-triple", hosttriple.strip())) if have_host_out_of_process_jit_feature_support(): config.available_features.add("host-supports-out-of-process-jit") diff --git a/clang/tools/clang-repl/ClangRepl.cpp b/clang/tools/clang-repl/ClangRepl.cpp index 1d50881..c787942 100644 --- a/clang/tools/clang-repl/ClangRepl.cpp +++ b/clang/tools/clang-repl/ClangRepl.cpp @@ -85,6 +85,8 @@ static llvm::cl::list<std::string> llvm::cl::CommaSeparated); static llvm::cl::opt<bool> OptHostSupportsJit("host-supports-jit", llvm::cl::Hidden); +static llvm::cl::opt<bool> OptHostJitTriple("host-jit-triple", + llvm::cl::Hidden); static llvm::cl::list<std::string> OptInputs(llvm::cl::Positional, llvm::cl::desc("[code to run]")); @@ -279,6 +281,11 @@ int main(int argc, const char **argv) { llvm::outs() << "false\n"; } return 0; + } else if (OptHostJitTriple) { + auto J = ExitOnErr(llvm::orc::LLJITBuilder().create()); + auto T = J->getTargetTriple(); + llvm::outs() << T.normalize() << '\n'; + return 0; } clang::IncrementalCompilerBuilder CB; diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index 6a3385a..fef7036 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -1364,6 +1364,27 @@ TEST_F(FormatTest, FormatIfWithoutCompoundStatementButElseWith) { AllowsMergedIf); } +TEST_F(FormatTest, WrapMultipleStatementIfAndElseBraces) { + auto Style = getLLVMStyle(); + Style.AllowShortBlocksOnASingleLine = FormatStyle::SBS_Always; + Style.AllowShortIfStatementsOnASingleLine = 
FormatStyle::SIS_AllIfsAndElse; + Style.BreakBeforeBraces = FormatStyle::BS_Custom; + Style.BraceWrapping.AfterControlStatement = FormatStyle::BWACS_Always; + Style.BraceWrapping.BeforeElse = true; + + verifyFormat("if (x)\n" + "{\n" + " ++x;\n" + " --y;\n" + "}\n" + "else\n" + "{\n" + " --x;\n" + " ++y;\n" + "}", + Style); +} + TEST_F(FormatTest, FormatLoopsWithoutCompoundStatement) { verifyFormat("while (true)\n" " ;"); diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc index a96d325..b10ce7f 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc +++ b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc @@ -1326,7 +1326,7 @@ PRCTL_INTERCEPTOR(int, prctl, int option, unsigned long arg2, static const int PR_SET_SECCOMP = 22; static const int SECCOMP_MODE_FILTER = 2; # endif - if (option == PR_SET_VMA && arg2 == 0UL) { + if (option == PR_SET_VMA && arg2 == 0UL && arg5 != 0UL) { char *name = (char *)arg5; COMMON_INTERCEPTOR_READ_RANGE(ctx, name, internal_strlen(name) + 1); } diff --git a/compiler-rt/test/sanitizer_common/TestCases/Linux/prctl.cpp b/compiler-rt/test/sanitizer_common/TestCases/Linux/prctl.cpp index dab1d1b..afce9dc 100644 --- a/compiler-rt/test/sanitizer_common/TestCases/Linux/prctl.cpp +++ b/compiler-rt/test/sanitizer_common/TestCases/Linux/prctl.cpp @@ -88,5 +88,8 @@ int main() { res = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &pr); assert(res == -1); + unsigned long name = reinterpret_cast<unsigned long>(nullptr); + prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, 0, nullptr, name); + return 0; } diff --git a/flang/include/flang/Optimizer/Passes/Pipelines.h b/flang/include/flang/Optimizer/Passes/Pipelines.h index f9c41b3..682dd82 100644 --- a/flang/include/flang/Optimizer/Passes/Pipelines.h +++ b/flang/include/flang/Optimizer/Passes/Pipelines.h @@ -158,7 +158,8 @@ void createOpenMPFIRPassPipeline(mlir::PassManager &pm, void createDebugPasses(mlir::PassManager &pm, llvm::codegenoptions::DebugInfoKind debugLevel, llvm::OptimizationLevel OptLevel, - llvm::StringRef inputFilename, int32_t dwarfVersion); + llvm::StringRef inputFilename, int32_t dwarfVersion, + llvm::StringRef splitDwarfFile); void createDefaultFIRCodeGenPassPipeline(mlir::PassManager &pm, MLIRToLLVMPassPipelineConfig config, diff --git a/flang/include/flang/Optimizer/Transforms/Passes.td b/flang/include/flang/Optimizer/Transforms/Passes.td index 88573fa..e2ba9c3 100644 --- a/flang/include/flang/Optimizer/Transforms/Passes.td +++ b/flang/include/flang/Optimizer/Transforms/Passes.td @@ -246,6 +246,10 @@ def AddDebugInfo : Pass<"add-debug-info", "mlir::ModuleOp"> { "int32_t", /*default=*/"0", "dwarf version">, + Option<"splitDwarfFile", "split-dwarf-file", + "std::string", /*default=*/"std::string{}", + "Name of the split dwarf file"> + ]; } diff --git a/flang/include/flang/Parser/openmp-utils.h b/flang/include/flang/Parser/openmp-utils.h index b8f3559..f761332 100644 --- a/flang/include/flang/Parser/openmp-utils.h +++ b/flang/include/flang/Parser/openmp-utils.h @@ -39,7 +39,6 @@ struct ConstructId { } MAKE_CONSTR_ID(OpenMPDeclarativeAllocate, D::OMPD_allocate); -MAKE_CONSTR_ID(OpenMPDeclareReductionConstruct, D::OMPD_declare_reduction); MAKE_CONSTR_ID(OpenMPExecutableAllocate, D::OMPD_allocate); #undef MAKE_CONSTR_ID @@ -92,7 +91,6 @@ struct DirectiveNameScope { if constexpr (std::is_base_of_v<OmpBlockConstruct, T>) { return std::get<OmpBeginDirective>(x.t).DirName(); } else if 
constexpr (std::is_same_v<T, OpenMPDeclarativeAllocate> || - std::is_same_v<T, OpenMPDeclareReductionConstruct> || std::is_same_v<T, OpenMPExecutableAllocate>) { return MakeName(std::get<Verbatim>(x.t).source, ConstructId<T>::id); } else { diff --git a/flang/include/flang/Tools/CrossToolHelpers.h b/flang/include/flang/Tools/CrossToolHelpers.h index 01c34ee..850bd1f 100644 --- a/flang/include/flang/Tools/CrossToolHelpers.h +++ b/flang/include/flang/Tools/CrossToolHelpers.h @@ -109,6 +109,7 @@ struct MLIRToLLVMPassPipelineConfig : public FlangEPCallBacks { InstrumentFunctionExit = "__cyg_profile_func_exit"; } DwarfVersion = opts.DwarfVersion; + SplitDwarfFile = opts.SplitDwarfFile; } llvm::OptimizationLevel OptLevel; ///< optimisation level @@ -146,6 +147,7 @@ struct MLIRToLLVMPassPipelineConfig : public FlangEPCallBacks { Fortran::frontend::CodeGenOptions::ComplexRangeKind:: CX_Full; ///< Method for calculating complex number division int32_t DwarfVersion = 0; ///< Version of DWARF debug info to generate + std::string SplitDwarfFile = ""; ///< File name for the split debug info }; struct OffloadModuleOpts { diff --git a/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp b/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp index 9969ee4..d8e36ea 100644 --- a/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp +++ b/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp @@ -2284,213 +2284,6 @@ public: } }; -static std::pair<mlir::Value, hlfir::AssociateOp> -getVariable(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value val) { - // If it is an expression - create a variable from it, or forward - // the value otherwise. - hlfir::AssociateOp associate; - if (!mlir::isa<hlfir::ExprType>(val.getType())) - return {val, associate}; - hlfir::Entity entity{val}; - mlir::NamedAttribute byRefAttr = fir::getAdaptToByRefAttr(builder); - associate = hlfir::genAssociateExpr(loc, builder, entity, entity.getType(), - "", byRefAttr); - return {associate.getBase(), associate}; -} - -class IndexOpConversion : public mlir::OpRewritePattern<hlfir::IndexOp> { -public: - using mlir::OpRewritePattern<hlfir::IndexOp>::OpRewritePattern; - - llvm::LogicalResult - matchAndRewrite(hlfir::IndexOp op, - mlir::PatternRewriter &rewriter) const override { - // We simplify only limited cases: - // 1) a substring length shall be known at compile time - // 2) if a substring length is 0 then replace with 1 for forward search, - // or otherwise with the string length + 1 (builder shall const-fold if - // lookup direction is known at compile time). - // 3) for known string length at compile time, if it is - // shorter than substring => replace with zero. - // 4) if a substring length is one => inline as simple search loop - // 5) for forward search with input strings of kind=1 runtime is faster. - // Do not simplify in all the other cases relying on a runtime call. 
- - fir::FirOpBuilder builder{rewriter, op.getOperation()}; - const mlir::Location &loc = op->getLoc(); - - auto resultTy = op.getType(); - mlir::Value back = op.getBack(); - mlir::Value substrLen = - hlfir::genCharLength(loc, builder, hlfir::Entity{op.getSubstr()}); - - auto substrLenCst = fir::getIntIfConstant(substrLen); - if (!substrLenCst) { - return rewriter.notifyMatchFailure( - op, "substring length unknown at compile time"); - } - mlir::Value strLen = - hlfir::genCharLength(loc, builder, hlfir::Entity{op.getStr()}); - auto i1Ty = builder.getI1Type(); - auto idxTy = builder.getIndexType(); - if (*substrLenCst == 0) { - mlir::Value oneIdx = builder.createIntegerConstant(loc, idxTy, 1); - // zero length substring. For back search replace with - // strLen+1, or otherwise with 1. - mlir::Value strEnd = mlir::arith::AddIOp::create( - builder, loc, builder.createConvert(loc, idxTy, strLen), oneIdx); - if (back) - back = builder.createConvert(loc, i1Ty, back); - else - back = builder.createIntegerConstant(loc, i1Ty, 0); - mlir::Value result = - mlir::arith::SelectOp::create(builder, loc, back, strEnd, oneIdx); - - rewriter.replaceOp(op, builder.createConvert(loc, resultTy, result)); - return mlir::success(); - } - - if (auto strLenCst = fir::getIntIfConstant(strLen)) { - if (*strLenCst < *substrLenCst) { - rewriter.replaceOp(op, builder.createIntegerConstant(loc, resultTy, 0)); - return mlir::success(); - } - if (*strLenCst == 0) { - // both strings have zero length - rewriter.replaceOp(op, builder.createIntegerConstant(loc, resultTy, 1)); - return mlir::success(); - } - } - if (*substrLenCst != 1) { - return rewriter.notifyMatchFailure( - op, "rely on runtime implementation if substring length > 1"); - } - // For forward search and character kind=1 the runtime uses memchr - // which well optimized. But it looks like memchr idiom is not recognized - // in LLVM yet. On a micro-kernel test with strings of length 40 runtime - // had ~2x less execution time vs inlined code. For unknown search direction - // at compile time pessimistically assume "forward". - std::optional<bool> isBack; - if (back) { - if (auto backCst = fir::getIntIfConstant(back)) - isBack = *backCst != 0; - } else { - isBack = false; - } - auto charTy = mlir::cast<fir::CharacterType>( - hlfir::getFortranElementType(op.getSubstr().getType())); - unsigned kind = charTy.getFKind(); - if (kind == 1 && (!isBack || !*isBack)) { - return rewriter.notifyMatchFailure( - op, "rely on runtime implementation for character kind 1"); - } - - // All checks are passed here. Generate single character search loop. 
- auto [strV, strAssociate] = getVariable(builder, loc, op.getStr()); - auto [substrV, substrAssociate] = getVariable(builder, loc, op.getSubstr()); - hlfir::Entity str{strV}; - hlfir::Entity substr{substrV}; - mlir::Value oneIdx = builder.createIntegerConstant(loc, idxTy, 1); - - auto genExtractAndConvertToInt = [&charTy, &idxTy, &oneIdx, - kind](mlir::Location loc, - fir::FirOpBuilder &builder, - hlfir::Entity &charStr, - mlir::Value index) { - auto bits = builder.getKindMap().getCharacterBitsize(kind); - auto intTy = builder.getIntegerType(bits); - auto charLen1Ty = - fir::CharacterType::getSingleton(builder.getContext(), kind); - mlir::Type designatorTy = - fir::ReferenceType::get(charLen1Ty, fir::isa_volatile_type(charTy)); - auto idxAttr = builder.getIntegerAttr(idxTy, 0); - - auto singleChr = hlfir::DesignateOp::create( - builder, loc, designatorTy, charStr, /*component=*/{}, - /*compShape=*/mlir::Value{}, hlfir::DesignateOp::Subscripts{}, - /*substring=*/mlir::ValueRange{index, index}, - /*complexPart=*/std::nullopt, - /*shape=*/mlir::Value{}, /*typeParams=*/mlir::ValueRange{oneIdx}, - fir::FortranVariableFlagsAttr{}); - auto chrVal = fir::LoadOp::create(builder, loc, singleChr); - mlir::Value intVal = fir::ExtractValueOp::create( - builder, loc, intTy, chrVal, builder.getArrayAttr(idxAttr)); - return intVal; - }; - - auto wantChar = genExtractAndConvertToInt(loc, builder, substr, oneIdx); - - // Generate search loop body with the following C equivalent: - // idx_t result = 0; - // idx_t end = strlen + 1; - // char want = substr[0]; - // for (idx_t idx = 1; idx < end; ++idx) { - // if (result == 0) { - // idx_t at = back ? end - idx: idx; - // result = str[at-1] == want ? at : result; - // } - // } - if (!back) - back = builder.createIntegerConstant(loc, i1Ty, 0); - else - back = builder.createConvert(loc, i1Ty, back); - mlir::Value strEnd = mlir::arith::AddIOp::create( - builder, loc, builder.createConvert(loc, idxTy, strLen), oneIdx); - mlir::Value zeroIdx = builder.createIntegerConstant(loc, idxTy, 0); - auto genSearchBody = [&](mlir::Location loc, fir::FirOpBuilder &builder, - mlir::ValueRange index, - mlir::ValueRange reductionArgs) - -> llvm::SmallVector<mlir::Value, 1> { - assert(index.size() == 1 && "expected single loop"); - assert(reductionArgs.size() == 1 && "expected single reduction value"); - mlir::Value inRes = reductionArgs[0]; - auto resEQzero = mlir::arith::CmpIOp::create( - builder, loc, mlir::arith::CmpIPredicate::eq, inRes, zeroIdx); - - mlir::Value res = - builder - .genIfOp(loc, {idxTy}, resEQzero, - /*withElseRegion=*/true) - .genThen([&]() { - mlir::Value idx = builder.createConvert(loc, idxTy, index[0]); - // offset = back ? 
end - idx : idx; - mlir::Value offset = mlir::arith::SelectOp::create( - builder, loc, back, - mlir::arith::SubIOp::create(builder, loc, strEnd, idx), - idx); - - auto haveChar = - genExtractAndConvertToInt(loc, builder, str, offset); - auto charsEQ = mlir::arith::CmpIOp::create( - builder, loc, mlir::arith::CmpIPredicate::eq, haveChar, - wantChar); - mlir::Value newVal = mlir::arith::SelectOp::create( - builder, loc, charsEQ, offset, inRes); - - fir::ResultOp::create(builder, loc, newVal); - }) - .genElse([&]() { fir::ResultOp::create(builder, loc, inRes); }) - .getResults()[0]; - return {res}; - }; - - llvm::SmallVector<mlir::Value, 1> loopOut = - hlfir::genLoopNestWithReductions(loc, builder, {strLen}, - /*reductionInits=*/{zeroIdx}, - genSearchBody, - /*isUnordered=*/false); - mlir::Value result = builder.createConvert(loc, resultTy, loopOut[0]); - - if (strAssociate) - hlfir::EndAssociateOp::create(builder, loc, strAssociate); - if (substrAssociate) - hlfir::EndAssociateOp::create(builder, loc, substrAssociate); - - rewriter.replaceOp(op, result); - return mlir::success(); - } -}; - template <typename Op> class MatmulConversion : public mlir::OpRewritePattern<Op> { public: @@ -3162,7 +2955,6 @@ public: patterns.insert<ArrayShiftConversion<hlfir::CShiftOp>>(context); patterns.insert<ArrayShiftConversion<hlfir::EOShiftOp>>(context); patterns.insert<CmpCharOpConversion>(context); - patterns.insert<IndexOpConversion>(context); patterns.insert<MatmulConversion<hlfir::MatmulTransposeOp>>(context); patterns.insert<ReductionConversion<hlfir::CountOp>>(context); patterns.insert<ReductionConversion<hlfir::AnyOp>>(context); diff --git a/flang/lib/Optimizer/Passes/Pipelines.cpp b/flang/lib/Optimizer/Passes/Pipelines.cpp index c089941..a83b066 100644 --- a/flang/lib/Optimizer/Passes/Pipelines.cpp +++ b/flang/lib/Optimizer/Passes/Pipelines.cpp @@ -95,12 +95,14 @@ getEmissionKind(llvm::codegenoptions::DebugInfoKind kind) { void addDebugInfoPass(mlir::PassManager &pm, llvm::codegenoptions::DebugInfoKind debugLevel, llvm::OptimizationLevel optLevel, - llvm::StringRef inputFilename, int32_t dwarfVersion) { + llvm::StringRef inputFilename, int32_t dwarfVersion, + llvm::StringRef splitDwarfFile) { fir::AddDebugInfoOptions options; options.debugLevel = getEmissionKind(debugLevel); options.isOptimized = optLevel != llvm::OptimizationLevel::O0; options.inputFilename = inputFilename; options.dwarfVersion = dwarfVersion; + options.splitDwarfFile = splitDwarfFile; addPassConditionally(pm, disableDebugInfo, [&]() { return fir::createAddDebugInfoPass(options); }); } @@ -340,9 +342,11 @@ void createOpenMPFIRPassPipeline(mlir::PassManager &pm, void createDebugPasses(mlir::PassManager &pm, llvm::codegenoptions::DebugInfoKind debugLevel, llvm::OptimizationLevel OptLevel, - llvm::StringRef inputFilename, int32_t dwarfVersion) { + llvm::StringRef inputFilename, int32_t dwarfVersion, + llvm::StringRef splitDwarfFile) { if (debugLevel != llvm::codegenoptions::NoDebugInfo) - addDebugInfoPass(pm, debugLevel, OptLevel, inputFilename, dwarfVersion); + addDebugInfoPass(pm, debugLevel, OptLevel, inputFilename, dwarfVersion, + splitDwarfFile); } void createDefaultFIRCodeGenPassPipeline(mlir::PassManager &pm, @@ -360,7 +364,7 @@ void createDefaultFIRCodeGenPassPipeline(mlir::PassManager &pm, pm, (config.DebugInfo != llvm::codegenoptions::NoDebugInfo)); fir::addExternalNameConversionPass(pm, config.Underscoring); fir::createDebugPasses(pm, config.DebugInfo, config.OptLevel, inputFilename, - config.DwarfVersion); + 
config.DwarfVersion, config.SplitDwarfFile); fir::addTargetRewritePass(pm); fir::addCompilerGeneratedNamesConversionPass(pm); diff --git a/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp b/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp index bc6592d..bdf7e4a 100644 --- a/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp +++ b/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp @@ -696,7 +696,8 @@ void AddDebugInfoPass::runOnOperation() { llvm::dwarf::getLanguage("DW_LANG_Fortran95"), fileAttr, producer, isOptimized, debugLevel, /*nameTableKind=*/mlir::LLVM::DINameTableKind::Default, - /*splitDebugFilename=*/mlir::StringAttr()); + splitDwarfFile.empty() ? mlir::StringAttr() + : mlir::StringAttr::get(context, splitDwarfFile)); module.walk([&](mlir::func::FuncOp funcOp) { handleFuncOp(funcOp, fileAttr, cuAttr, typeGen, &symbolTable); diff --git a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp index d038c46..a7e4723 100644 --- a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp +++ b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp @@ -679,26 +679,37 @@ mlir::LLVM::DITypeAttr DebugTypeGenerator::convertPointerLikeType( /*optional<address space>=*/std::nullopt, /*extra data=*/nullptr); } +static mlir::StringAttr getBasicTypeName(mlir::MLIRContext *context, + llvm::StringRef baseName, + unsigned bitSize) { + std::string name(baseName.str()); + if (bitSize != 32) + name += "*" + std::to_string(bitSize / 8); + return mlir::StringAttr::get(context, name); +} + mlir::LLVM::DITypeAttr DebugTypeGenerator::convertType(mlir::Type Ty, mlir::LLVM::DIFileAttr fileAttr, mlir::LLVM::DIScopeAttr scope, fir::cg::XDeclareOp declOp) { mlir::MLIRContext *context = module.getContext(); if (Ty.isInteger()) { - return genBasicType(context, mlir::StringAttr::get(context, "integer"), - Ty.getIntOrFloatBitWidth(), llvm::dwarf::DW_ATE_signed); + unsigned bitWidth = Ty.getIntOrFloatBitWidth(); + return genBasicType(context, getBasicTypeName(context, "integer", bitWidth), + bitWidth, llvm::dwarf::DW_ATE_signed); } else if (mlir::isa<mlir::FloatType>(Ty)) { - return genBasicType(context, mlir::StringAttr::get(context, "real"), - Ty.getIntOrFloatBitWidth(), llvm::dwarf::DW_ATE_float); + unsigned bitWidth = Ty.getIntOrFloatBitWidth(); + return genBasicType(context, getBasicTypeName(context, "real", bitWidth), + bitWidth, llvm::dwarf::DW_ATE_float); } else if (auto logTy = mlir::dyn_cast_if_present<fir::LogicalType>(Ty)) { - return genBasicType(context, - mlir::StringAttr::get(context, logTy.getMnemonic()), - kindMapping.getLogicalBitsize(logTy.getFKind()), - llvm::dwarf::DW_ATE_boolean); + unsigned bitWidth = kindMapping.getLogicalBitsize(logTy.getFKind()); + return genBasicType( + context, getBasicTypeName(context, logTy.getMnemonic(), bitWidth), + bitWidth, llvm::dwarf::DW_ATE_boolean); } else if (auto cplxTy = mlir::dyn_cast_if_present<mlir::ComplexType>(Ty)) { auto floatTy = mlir::cast<mlir::FloatType>(cplxTy.getElementType()); unsigned bitWidth = floatTy.getWidth(); - return genBasicType(context, mlir::StringAttr::get(context, "complex"), + return genBasicType(context, getBasicTypeName(context, "complex", bitWidth), bitWidth * 2, llvm::dwarf::DW_ATE_complex_float); } else if (auto seqTy = mlir::dyn_cast_if_present<fir::SequenceType>(Ty)) { return convertSequenceType(seqTy, fileAttr, scope, declOp); diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp index db030bb..e224e06 100644 --- 
a/flang/lib/Semantics/check-omp-structure.cpp +++ b/flang/lib/Semantics/check-omp-structure.cpp @@ -351,6 +351,17 @@ bool OmpStructureChecker::IsCloselyNestedRegion(const OmpDirectiveSet &set) { return false; } +bool OmpStructureChecker::IsNestedInDirective(llvm::omp::Directive directive) { + if (dirContext_.size() >= 1) { + for (size_t i = dirContext_.size() - 1; i > 0; --i) { + if (dirContext_[i - 1].directive == directive) { + return true; + } + } + } + return false; +} + void OmpStructureChecker::CheckVariableListItem( const SymbolSourceMap &symbols) { for (auto &[symbol, source] : symbols) { @@ -1880,13 +1891,90 @@ void OmpStructureChecker::Enter(const parser::OmpClause::At &x) { } } +// Goes through the names in an OmpObjectList and checks if each name appears +// in the given allocate statement +void OmpStructureChecker::CheckAllNamesInAllocateStmt( + const parser::CharBlock &source, const parser::OmpObjectList &ompObjectList, + const parser::AllocateStmt &allocate) { + for (const auto &obj : ompObjectList.v) { + if (const auto *d{std::get_if<parser::Designator>(&obj.u)}) { + if (const auto *ref{std::get_if<parser::DataRef>(&d->u)}) { + if (const auto *n{std::get_if<parser::Name>(&ref->u)}) { + CheckNameInAllocateStmt(source, *n, allocate); + } + } + } + } +} + +void OmpStructureChecker::CheckNameInAllocateStmt( + const parser::CharBlock &source, const parser::Name &name, + const parser::AllocateStmt &allocate) { + for (const auto &allocation : + std::get<std::list<parser::Allocation>>(allocate.t)) { + const auto &allocObj = std::get<parser::AllocateObject>(allocation.t); + if (const auto *n{std::get_if<parser::Name>(&allocObj.u)}) { + if (n->source == name.source) { + return; + } + } + } + unsigned version{context_.langOptions().OpenMPVersion}; + context_.Say(source, + "Object '%s' in %s directive not " + "found in corresponding ALLOCATE statement"_err_en_US, + name.ToString(), + parser::ToUpperCaseLetters( + llvm::omp::getOpenMPDirectiveName(GetContext().directive, version) + .str())); +} + void OmpStructureChecker::Enter(const parser::OpenMPExecutableAllocate &x) { - isPredefinedAllocator = true; const auto &dir{std::get<parser::Verbatim>(x.t)}; - const auto &objectList{std::get<std::optional<parser::OmpObjectList>>(x.t)}; PushContextAndClauseSets(dir.source, llvm::omp::Directive::OMPD_allocate); + + unsigned version{context_.langOptions().OpenMPVersion}; + if (version >= 52) { + context_.Warn(common::UsageWarning::OpenMPUsage, x.source, + "The executable form of the OpenMP ALLOCATE directive has been deprecated, please use ALLOCATORS instead"_warn_en_US); + } + + bool hasAllocator = false; + // TODO: Investigate whether searching the clause list can be done with + // parser::Unwrap instead of the following loop const auto &clauseList{std::get<parser::OmpClauseList>(x.t)}; for (const auto &clause : clauseList.v) { + if (std::get_if<parser::OmpClause::Allocator>(&clause.u)) { + hasAllocator = true; + } + } + + if (IsNestedInDirective(llvm::omp::Directive::OMPD_target) && !hasAllocator) { + // TODO: expand this check to exclude the case when a requires + // directive with the dynamic_allocators clause is present + // in the same compilation unit (OMP5.0 2.11.3). 
+ context_.Say(x.source, + "ALLOCATE directives that appear in a TARGET region must specify an allocator clause"_err_en_US); + } + + const auto &allocateStmt = + std::get<parser::Statement<parser::AllocateStmt>>(x.t).statement; + if (const auto &list{std::get<std::optional<parser::OmpObjectList>>(x.t)}) { + CheckAllNamesInAllocateStmt( + std::get<parser::Verbatim>(x.t).source, *list, allocateStmt); + } + if (const auto &subDirs{ + std::get<std::optional<std::list<parser::OpenMPDeclarativeAllocate>>>( + x.t)}) { + for (const auto &dalloc : *subDirs) { + CheckAllNamesInAllocateStmt(std::get<parser::Verbatim>(dalloc.t).source, + std::get<parser::OmpObjectList>(dalloc.t), allocateStmt); + } + } + + isPredefinedAllocator = true; + const auto &objectList{std::get<std::optional<parser::OmpObjectList>>(x.t)}; + for (const auto &clause : clauseList.v) { CheckAlignValue(clause); } if (objectList) { @@ -1920,7 +2008,31 @@ void OmpStructureChecker::Enter(const parser::OpenMPAllocatorsConstruct &x) { const auto *allocate{ action ? parser::Unwrap<parser::AllocateStmt>(action.stmt) : nullptr}; - if (!allocate) { + if (allocate) { + for (const auto &clause : dirSpec.Clauses().v) { + if (auto *alloc{std::get_if<parser::OmpClause::Allocate>(&clause.u)}) { + CheckAllNamesInAllocateStmt( + x.source, std::get<parser::OmpObjectList>(alloc->v.t), *allocate); + + using OmpAllocatorSimpleModifier = parser::OmpAllocatorSimpleModifier; + using OmpAllocatorComplexModifier = parser::OmpAllocatorComplexModifier; + + auto &modifiers{OmpGetModifiers(alloc->v)}; + bool hasAllocator{ + OmpGetUniqueModifier<OmpAllocatorSimpleModifier>(modifiers) || + OmpGetUniqueModifier<OmpAllocatorComplexModifier>(modifiers)}; + + // TODO: As with allocate directive, exclude the case when a requires + // directive with the dynamic_allocators clause is present in + // the same compilation unit (OMP5.0 2.11.3). + if (IsNestedInDirective(llvm::omp::Directive::OMPD_target) && + !hasAllocator) { + context_.Say(x.source, + "ALLOCATORS directives that appear in a TARGET region must specify an allocator"_err_en_US); + } + } + } + } else { const parser::CharBlock &source = action ? 
action.source : x.source; context_.Say(source, "The body of the ALLOCATORS construct should be an ALLOCATE statement"_err_en_US); diff --git a/flang/lib/Semantics/check-omp-structure.h b/flang/lib/Semantics/check-omp-structure.h index 1937845..f507278 100644 --- a/flang/lib/Semantics/check-omp-structure.h +++ b/flang/lib/Semantics/check-omp-structure.h @@ -177,6 +177,7 @@ private: bool HasInvalidWorksharingNesting( const parser::CharBlock &, const OmpDirectiveSet &); bool IsCloselyNestedRegion(const OmpDirectiveSet &set); + bool IsNestedInDirective(llvm::omp::Directive directive); void HasInvalidTeamsNesting( const llvm::omp::Directive &dir, const parser::CharBlock &source); void HasInvalidDistributeNesting(const parser::OpenMPLoopConstruct &x); @@ -309,6 +310,11 @@ private: const std::optional<parser::OmpClauseList> &maybeClauses); void CheckCancellationNest( const parser::CharBlock &source, llvm::omp::Directive type); + void CheckAllNamesInAllocateStmt(const parser::CharBlock &source, + const parser::OmpObjectList &ompObjectList, + const parser::AllocateStmt &allocate); + void CheckNameInAllocateStmt(const parser::CharBlock &source, + const parser::Name &ompObject, const parser::AllocateStmt &allocate); std::int64_t GetOrdCollapseLevel(const parser::OpenMPLoopConstruct &x); void CheckReductionObjects( const parser::OmpObjectList &objects, llvm::omp::Clause clauseId); diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp index 270642a..bd7b8ac 100644 --- a/flang/lib/Semantics/resolve-directives.cpp +++ b/flang/lib/Semantics/resolve-directives.cpp @@ -149,7 +149,24 @@ protected: dataSharingAttributeObjects_.clear(); } bool HasDataSharingAttributeObject(const Symbol &); + + /// Extract the iv and bounds of a DO loop: + /// 1. The loop index/induction variable + /// 2. The lower bound + /// 3. The upper bound + /// 4. The step/increment (or nullptr if not present) + /// + /// Each returned tuple value can be nullptr if not present. Diagnoses an + /// error if the DO loop is a DO WHILE or DO CONCURRENT loop. + std::tuple<const parser::Name *, const parser::ScalarExpr *, + const parser::ScalarExpr *, const parser::ScalarExpr *> + GetLoopBounds(const parser::DoConstruct &); + + /// Extract the loop index/induction variable from a DO loop. Diagnoses an + /// error if the DO loop is a DO WHILE or DO CONCURRENT loop and returns + /// nullptr. const parser::Name *GetLoopIndex(const parser::DoConstruct &); + const parser::DoConstruct *GetDoConstructIf( const parser::ExecutionPartConstruct &); Symbol *DeclareNewAccessEntity(const Symbol &, Symbol::Flag, Scope &); @@ -953,6 +970,13 @@ private: privateDataSharingAttributeObjects_.clear(); } + /// Check that loops in the loop nest are perfectly nested, as well as that lower + /// bound, upper bound, and step expressions do not use the iv + /// of a surrounding loop of the associated loops nest.
+ /// We do not support non-perfectly nested loops nor non-rectangular loops yet + /// (both introduced in OpenMP 5.0) + void CheckPerfectNestAndRectangularLoop(const parser::OpenMPLoopConstruct &x); + // Predetermined DSA rules void PrivatizeAssociatedLoopIndexAndCheckLoopLevel( const parser::OpenMPLoopConstruct &); @@ -987,11 +1011,6 @@ private: sourceLabels_.clear(); targetLabels_.clear(); }; - void CheckAllNamesInAllocateStmt(const parser::CharBlock &source, - const parser::OmpObjectList &ompObjectList, - const parser::AllocateStmt &allocate); - void CheckNameInAllocateStmt(const parser::CharBlock &source, - const parser::Name &ompObject, const parser::AllocateStmt &allocate); std::int64_t ordCollapseLevel{0}; @@ -1028,14 +1047,15 @@ bool DirectiveAttributeVisitor<T>::HasDataSharingAttributeObject( } template <typename T> -const parser::Name *DirectiveAttributeVisitor<T>::GetLoopIndex( - const parser::DoConstruct &x) { +std::tuple<const parser::Name *, const parser::ScalarExpr *, + const parser::ScalarExpr *, const parser::ScalarExpr *> +DirectiveAttributeVisitor<T>::GetLoopBounds(const parser::DoConstruct &x) { using Bounds = parser::LoopControl::Bounds; if (x.GetLoopControl()) { if (const Bounds * b{std::get_if<Bounds>(&x.GetLoopControl()->u)}) { - return &b->name.thing; - } else { - return nullptr; + auto &step = b->step; + return {&b->name.thing, &b->lower, &b->upper, + step.has_value() ? &step.value() : nullptr}; } } else { context_ @@ -1043,8 +1063,14 @@ const parser::Name *DirectiveAttributeVisitor<T>::GetLoopIndex( "Loop control is not present in the DO LOOP"_err_en_US) .Attach(GetContext().directiveSource, "associated with the enclosing LOOP construct"_en_US); - return nullptr; } + return {nullptr, nullptr, nullptr, nullptr}; +} + +template <typename T> +const parser::Name *DirectiveAttributeVisitor<T>::GetLoopIndex( + const parser::DoConstruct &x) { + return std::get<const parser::Name *>(GetLoopBounds(x)); } template <typename T> @@ -1990,6 +2016,10 @@ bool OmpAttributeVisitor::Pre(const parser::OpenMPLoopConstruct &x) { } } } + + // Must be done before iv privatization + CheckPerfectNestAndRectangularLoop(x); + PrivatizeAssociatedLoopIndexAndCheckLoopLevel(x); ordCollapseLevel = GetNumAffectedLoopsFromLoopConstruct(x) + 1; return true; @@ -2185,6 +2215,116 @@ void OmpAttributeVisitor::CollectNumAffectedLoopsFromClauses( } } +void OmpAttributeVisitor::CheckPerfectNestAndRectangularLoop( + const parser::OpenMPLoopConstruct &x) { + auto &dirContext{GetContext()}; + std::int64_t dirDepth{dirContext.associatedLoopLevel}; + if (dirDepth <= 0) + return; + + auto checkExprHasSymbols = [&](llvm::SmallVector<Symbol *> &ivs, + const parser::ScalarExpr *bound) { + if (ivs.empty()) + return; + auto boundExpr{semantics::AnalyzeExpr(context_, *bound)}; + if (!boundExpr) + return; + semantics::UnorderedSymbolSet boundSyms{ + evaluate::CollectSymbols(*boundExpr)}; + if (boundSyms.empty()) + return; + for (Symbol *iv : ivs) { + if (boundSyms.count(*iv) != 0) { + // TODO: Point to occurrence of iv in boundExpr, directiveSource as a + // note + context_.Say(dirContext.directiveSource, + "Trip count must be computable and invariant"_err_en_US); + } + } + }; + + // Find the associated region by skipping nested loop-associated constructs + // such as loop transformations + const parser::NestedConstruct *innermostAssocRegion{nullptr}; + const parser::OpenMPLoopConstruct *innermostConstruct{&x}; + while (const auto &innerAssocStmt{ + std::get<std::optional<parser::NestedConstruct>>(
innermostConstruct->t)}) { + innermostAssocRegion = &(innerAssocStmt.value()); + if (const auto *innerConstruct{ + std::get_if<common::Indirection<parser::OpenMPLoopConstruct>>( + innermostAssocRegion)}) { + innermostConstruct = &innerConstruct->value(); + } else { + break; + } + } + + if (!innermostAssocRegion) + return; + const auto &outer{std::get_if<parser::DoConstruct>(innermostAssocRegion)}; + if (!outer) + return; + + llvm::SmallVector<Symbol *> ivs; + int curLevel{0}; + const parser::DoConstruct *loop{outer}; + while (true) { + auto [iv, lb, ub, step] = GetLoopBounds(*loop); + + if (lb) + checkExprHasSymbols(ivs, lb); + if (ub) + checkExprHasSymbols(ivs, ub); + if (step) + checkExprHasSymbols(ivs, step); + if (iv) { + if (auto *symbol{currScope().FindSymbol(iv->source)}) + ivs.push_back(symbol); + } + + // Stop after processing all affected loops + if (curLevel + 1 >= dirDepth) + break; + + // Recurse into nested loop + const auto &block{std::get<parser::Block>(loop->t)}; + if (block.empty()) { + // Insufficient number of nested loops already reported by + // CheckAssocLoopLevel() + break; + } + + loop = GetDoConstructIf(block.front()); + if (!loop) { + // Insufficient number of nested loops already reported by + // CheckAssocLoopLevel() + break; + } + + auto checkPerfectNest = [&, this]() { + auto blockSize = block.size(); + if (blockSize <= 1) + return; + + if (parser::Unwrap<parser::ContinueStmt>(x)) + blockSize -= 1; + + if (blockSize <= 1) + return; + + // Non-perfectly nested loop + // TODO: Point to non-DO statement, directiveSource as a note + context_.Say(dirContext.directiveSource, + "Canonical loop nest must be perfectly nested."_err_en_US); + }; + + checkPerfectNest(); + + ++curLevel; + } +} + // 2.15.1.1 Data-sharing Attribute Rules - Predetermined // - The loop iteration variable(s) in the associated do-loop(s) of a do, // parallel do, taskloop, or distribute construct is (are) private. @@ -2405,8 +2545,6 @@ bool OmpAttributeVisitor::Pre(const parser::OpenMPDispatchConstruct &x) { } bool OmpAttributeVisitor::Pre(const parser::OpenMPExecutableAllocate &x) { - IssueNonConformanceWarning(llvm::omp::Directive::OMPD_allocate, x.source, 52); - PushContext(x.source, llvm::omp::Directive::OMPD_allocate); const auto &list{std::get<std::optional<parser::OmpObjectList>>(x.t)}; if (list) { @@ -2487,83 +2625,10 @@ bool OmpAttributeVisitor::IsNestedInDirective(llvm::omp::Directive directive) { } void OmpAttributeVisitor::Post(const parser::OpenMPExecutableAllocate &x) { - bool hasAllocator = false; - // TODO: Investigate whether searching the clause list can be done with - // parser::Unwrap instead of the following loop - const auto &clauseList{std::get<parser::OmpClauseList>(x.t)}; - for (const auto &clause : clauseList.v) { - if (std::get_if<parser::OmpClause::Allocator>(&clause.u)) { - hasAllocator = true; - } - } - - if (IsNestedInDirective(llvm::omp::Directive::OMPD_target) && !hasAllocator) { - // TODO: expand this check to exclude the case when a requires - // directive with the dynamic_allocators clause is present - // in the same compilation unit (OMP5.0 2.11.3). 
- context_.Say(x.source, - "ALLOCATE directives that appear in a TARGET region " - "must specify an allocator clause"_err_en_US); - } - - const auto &allocateStmt = - std::get<parser::Statement<parser::AllocateStmt>>(x.t).statement; - if (const auto &list{std::get<std::optional<parser::OmpObjectList>>(x.t)}) { - CheckAllNamesInAllocateStmt( - std::get<parser::Verbatim>(x.t).source, *list, allocateStmt); - } - if (const auto &subDirs{ - std::get<std::optional<std::list<parser::OpenMPDeclarativeAllocate>>>( - x.t)}) { - for (const auto &dalloc : *subDirs) { - CheckAllNamesInAllocateStmt(std::get<parser::Verbatim>(dalloc.t).source, - std::get<parser::OmpObjectList>(dalloc.t), allocateStmt); - } - } PopContext(); } void OmpAttributeVisitor::Post(const parser::OpenMPAllocatorsConstruct &x) { - const parser::OmpDirectiveSpecification &dirSpec{x.BeginDir()}; - auto &block{std::get<parser::Block>(x.t)}; - - omp::SourcedActionStmt action{omp::GetActionStmt(block)}; - const parser::AllocateStmt *allocate{[&]() { - if (action) { - if (auto *alloc{std::get_if<common::Indirection<parser::AllocateStmt>>( - &action.stmt->u)}) { - return &alloc->value(); - } - } - return static_cast<const parser::AllocateStmt *>(nullptr); - }()}; - - if (allocate) { - for (const auto &clause : dirSpec.Clauses().v) { - if (auto *alloc{std::get_if<parser::OmpClause::Allocate>(&clause.u)}) { - CheckAllNamesInAllocateStmt( - x.source, std::get<parser::OmpObjectList>(alloc->v.t), *allocate); - - using OmpAllocatorSimpleModifier = parser::OmpAllocatorSimpleModifier; - using OmpAllocatorComplexModifier = parser::OmpAllocatorComplexModifier; - - auto &modifiers{OmpGetModifiers(alloc->v)}; - bool hasAllocator{ - OmpGetUniqueModifier<OmpAllocatorSimpleModifier>(modifiers) || - OmpGetUniqueModifier<OmpAllocatorComplexModifier>(modifiers)}; - - // TODO: As with allocate directive, exclude the case when a requires - // directive with the dynamic_allocators clause is present in - // the same compilation unit (OMP5.0 2.11.3). 
- if (IsNestedInDirective(llvm::omp::Directive::OMPD_target) && - !hasAllocator) { - context_.Say(x.source, - "ALLOCATORS directives that appear in a TARGET region " - "must specify an allocator"_err_en_US); - } - } - } - } PopContext(); } @@ -3483,44 +3548,6 @@ void OmpAttributeVisitor::CheckLabelContext(const parser::CharBlock source, } } -// Goes through the names in an OmpObjectList and checks if each name appears -// in the given allocate statement -void OmpAttributeVisitor::CheckAllNamesInAllocateStmt( - const parser::CharBlock &source, const parser::OmpObjectList &ompObjectList, - const parser::AllocateStmt &allocate) { - for (const auto &obj : ompObjectList.v) { - if (const auto *d{std::get_if<parser::Designator>(&obj.u)}) { - if (const auto *ref{std::get_if<parser::DataRef>(&d->u)}) { - if (const auto *n{std::get_if<parser::Name>(&ref->u)}) { - CheckNameInAllocateStmt(source, *n, allocate); - } - } - } - } -} - -void OmpAttributeVisitor::CheckNameInAllocateStmt( - const parser::CharBlock &source, const parser::Name &name, - const parser::AllocateStmt &allocate) { - for (const auto &allocation : - std::get<std::list<parser::Allocation>>(allocate.t)) { - const auto &allocObj = std::get<parser::AllocateObject>(allocation.t); - if (const auto *n{std::get_if<parser::Name>(&allocObj.u)}) { - if (n->source == name.source) { - return; - } - } - } - unsigned version{context_.langOptions().OpenMPVersion}; - context_.Say(source, - "Object '%s' in %s directive not " - "found in corresponding ALLOCATE statement"_err_en_US, - name.ToString(), - parser::ToUpperCaseLetters( - llvm::omp::getOpenMPDirectiveName(GetContext().directive, version) - .str())); -} - void OmpAttributeVisitor::AddOmpRequiresToScope(Scope &scope, WithOmpDeclarative::RequiresFlags flags, std::optional<common::OmpMemoryOrderType> memOrder) { diff --git a/flang/test/HLFIR/simplify-hlfir-intrinsics-index.fir b/flang/test/HLFIR/simplify-hlfir-intrinsics-index.fir deleted file mode 100644 index 258a1d8..0000000 --- a/flang/test/HLFIR/simplify-hlfir-intrinsics-index.fir +++ /dev/null @@ -1,345 +0,0 @@ -// RUN: fir-opt %s --simplify-hlfir-intrinsics | FileCheck %s - -// Simplify should reduce hlfir.index to constant (5) -func.func @_QPt1() { -// CHECK-LABEL: func.func @_QPt1() { -// CHECK: %[[VAL_0:.*]] = arith.constant 5 : index -// CHECK: %[[VAL_1:.*]] = arith.constant 0 : index -// CHECK: %[[VAL_2:.*]] = arith.constant 3 : index -// CHECK: %[[VAL_3:.*]] = arith.constant 4 : index -// CHECK: %[[VAL_4:.*]] = fir.dummy_scope : !fir.dscope -// CHECK: %[[VAL_5:.*]] = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt1En"} -// CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFt1En"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) -// CHECK: %[[VAL_7:.*]] = fir.alloca !fir.char<1,4> {bindc_name = "s", uniq_name = "_QFt1Es"} -// CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] typeparams %[[VAL_3]] {uniq_name = "_QFt1Es"} : (!fir.ref<!fir.char<1,4>>, index) -> (!fir.ref<!fir.char<1,4>>, !fir.ref<!fir.char<1,4>>) -// CHECK: %[[VAL_9:.*]] = fir.address_of(@_QQclX616263) : !fir.ref<!fir.char<1,3>> -// CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_9]] typeparams %[[VAL_2]] {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQclX616263"} : (!fir.ref<!fir.char<1,3>>, index) -> (!fir.ref<!fir.char<1,3>>, !fir.ref<!fir.char<1,3>>) -// CHECK: hlfir.assign %[[VAL_10]]#0 to %[[VAL_8]]#0 : !fir.ref<!fir.char<1,3>>, !fir.ref<!fir.char<1,4>> -// CHECK: %[[VAL_11:.*]] = fir.address_of(@_QQclX) : !fir.ref<!fir.char<1,0>> 
-// CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_11]] typeparams %[[VAL_1]] {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQclX"} : (!fir.ref<!fir.char<1,0>>, index) -> (!fir.ref<!fir.char<1,0>>, !fir.ref<!fir.char<1,0>>) -// CHECK: %[[VAL_13:.*]] = fir.convert %[[VAL_0]] : (index) -> i32 -// CHECK: hlfir.assign %[[VAL_13]] to %[[VAL_6]]#0 : i32, !fir.ref<i32> -// CHECK: return -// CHECK: } - %0 = fir.dummy_scope : !fir.dscope - %1 = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt1En"} - %2:2 = hlfir.declare %1 {uniq_name = "_QFt1En"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) - %c4 = arith.constant 4 : index - %3 = fir.alloca !fir.char<1,4> {bindc_name = "s", uniq_name = "_QFt1Es"} - %4:2 = hlfir.declare %3 typeparams %c4 {uniq_name = "_QFt1Es"} : (!fir.ref<!fir.char<1,4>>, index) -> (!fir.ref<!fir.char<1,4>>, !fir.ref<!fir.char<1,4>>) - %5 = fir.address_of(@_QQclX616263) : !fir.ref<!fir.char<1,3>> - %c3 = arith.constant 3 : index - %6:2 = hlfir.declare %5 typeparams %c3 {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQclX616263"} : (!fir.ref<!fir.char<1,3>>, index) -> (!fir.ref<!fir.char<1,3>>, !fir.ref<!fir.char<1,3>>) - hlfir.assign %6#0 to %4#0 : !fir.ref<!fir.char<1,3>>, !fir.ref<!fir.char<1,4>> - %7 = fir.address_of(@_QQclX) : !fir.ref<!fir.char<1,0>> - %c0 = arith.constant 0 : index - %8:2 = hlfir.declare %7 typeparams %c0 {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQclX"} : (!fir.ref<!fir.char<1,0>>, index) -> (!fir.ref<!fir.char<1,0>>, !fir.ref<!fir.char<1,0>>) - %true = arith.constant true - %9 = hlfir.index %8#0 in %4#0 back %true : (!fir.ref<!fir.char<1,0>>, !fir.ref<!fir.char<1,4>>, i1) -> i32 - hlfir.assign %9 to %2#0 : i32, !fir.ref<i32> - return -} - -// ! 'back' is unknown at compile time, substring is zero length - generate select (back ? 
strlen+1 : 1) -func.func @_QPt2(%arg0: !fir.boxchar<2> {fir.bindc_name = "s"}, %arg1: !fir.ref<!fir.logical<4>> {fir.bindc_name = "b"}) { -// CHECK-LABEL: func.func @_QPt2( -// CHECK-SAME: %[[ARG0:.*]]: !fir.boxchar<2> {fir.bindc_name = "s"}, -// CHECK-SAME: %[[ARG1:.*]]: !fir.ref<!fir.logical<4>> {fir.bindc_name = "b"}) { -// CHECK: %[[VAL_0:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_1:.*]] = arith.constant 0 : index -// CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope -// CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_2]] {uniq_name = "_QFt2Eb"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) -// CHECK: %[[VAL_4:.*]] = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt2En"} -// CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] {uniq_name = "_QFt2En"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) -// CHECK: %[[VAL_6:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<2>) -> (!fir.ref<!fir.char<2,?>>, index) -// CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]]#0 typeparams %[[VAL_6]]#1 dummy_scope %[[VAL_2]] {uniq_name = "_QFt2Es"} : (!fir.ref<!fir.char<2,?>>, index, !fir.dscope) -> (!fir.boxchar<2>, !fir.ref<!fir.char<2,?>>) -// CHECK: %[[VAL_8:.*]] = fir.address_of(@_QQcl2X) : !fir.ref<!fir.char<2,0>> -// CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_8]] typeparams %[[VAL_1]] {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQcl2X"} : (!fir.ref<!fir.char<2,0>>, index) -> (!fir.ref<!fir.char<2,0>>, !fir.ref<!fir.char<2,0>>) -// CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<!fir.logical<4>> -// CHECK: %[[VAL_11:.*]] = arith.addi %[[VAL_6]]#1, %[[VAL_0]] : index -// CHECK: %[[VAL_12:.*]] = fir.convert %[[VAL_10]] : (!fir.logical<4>) -> i1 -// CHECK: %[[VAL_13:.*]] = arith.select %[[VAL_12]], %[[VAL_11]], %[[VAL_0]] : index -// CHECK: %[[VAL_14:.*]] = fir.convert %[[VAL_13]] : (index) -> i32 -// CHECK: hlfir.assign %[[VAL_14]] to %[[VAL_5]]#0 : i32, !fir.ref<i32> -// CHECK: return -// CHECK: } - %0 = fir.dummy_scope : !fir.dscope - %1:2 = hlfir.declare %arg1 dummy_scope %0 {uniq_name = "_QFt2Eb"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) - %2 = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt2En"} - %3:2 = hlfir.declare %2 {uniq_name = "_QFt2En"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) - %4:2 = fir.unboxchar %arg0 : (!fir.boxchar<2>) -> (!fir.ref<!fir.char<2,?>>, index) - %5:2 = hlfir.declare %4#0 typeparams %4#1 dummy_scope %0 {uniq_name = "_QFt2Es"} : (!fir.ref<!fir.char<2,?>>, index, !fir.dscope) -> (!fir.boxchar<2>, !fir.ref<!fir.char<2,?>>) - %6 = fir.address_of(@_QQcl2X) : !fir.ref<!fir.char<2,0>> - %c0 = arith.constant 0 : index - %7:2 = hlfir.declare %6 typeparams %c0 {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQcl2X"} : (!fir.ref<!fir.char<2,0>>, index) -> (!fir.ref<!fir.char<2,0>>, !fir.ref<!fir.char<2,0>>) - %8 = fir.load %1#0 : !fir.ref<!fir.logical<4>> - %9 = hlfir.index %7#0 in %5#0 back %8 : (!fir.ref<!fir.char<2,0>>, !fir.boxchar<2>, !fir.logical<4>) -> i32 - hlfir.assign %9 to %3#0 : i32, !fir.ref<i32> - return -} - -// inline as search loop (backward) -func.func @_QPt3(%arg0: !fir.boxchar<2> {fir.bindc_name = "s"}) { -// CHECK-LABEL: func.func @_QPt3( -// CHECK-SAME: %[[ARG0:.*]]: !fir.boxchar<2> {fir.bindc_name = "s"}) { -// CHECK: %[[VAL_0:.*]] = arith.constant 0 : index -// CHECK: %[[VAL_1:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_2:.*]] = fir.dummy_scope : 
!fir.dscope -// CHECK: %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt3En"} -// CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFt3En"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) -// CHECK: %[[VAL_5:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<2>) -> (!fir.ref<!fir.char<2,?>>, index) -// CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]]#0 typeparams %[[VAL_5]]#1 dummy_scope %[[VAL_2]] {uniq_name = "_QFt3Es"} : (!fir.ref<!fir.char<2,?>>, index, !fir.dscope) -> (!fir.boxchar<2>, !fir.ref<!fir.char<2,?>>) -// CHECK: %[[VAL_7:.*]] = fir.address_of(@_QQcl2X6500) : !fir.ref<!fir.char<2>> -// CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] typeparams %[[VAL_1]] {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQcl2X6500"} : (!fir.ref<!fir.char<2>>, index) -> (!fir.ref<!fir.char<2>>, !fir.ref<!fir.char<2>>) -// CHECK: %[[VAL_9:.*]] = hlfir.designate %[[VAL_8]]#0 substr %[[VAL_1]], %[[VAL_1]] typeparams %[[VAL_1]] : (!fir.ref<!fir.char<2>>, index, index, index) -> !fir.ref<!fir.char<2>> -// CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_9]] : !fir.ref<!fir.char<2>> -// CHECK: %[[VAL_11:.*]] = fir.extract_value %[[VAL_10]], [0 : index] : (!fir.char<2>) -> i16 -// CHECK: %[[VAL_12:.*]] = arith.addi %[[VAL_5]]#1, %[[VAL_1]] : index -// CHECK: %[[VAL_13:.*]] = fir.do_loop %[[VAL_14:.*]] = %[[VAL_1]] to %[[VAL_5]]#1 step %[[VAL_1]] iter_args(%[[VAL_15:.*]] = %[[VAL_0]]) -> (index) { -// CHECK: %[[VAL_16:.*]] = arith.cmpi eq, %[[VAL_15]], %[[VAL_0]] : index -// CHECK: %[[VAL_17:.*]] = fir.if %[[VAL_16]] -> (index) { -// CHECK: %[[VAL_18:.*]] = arith.subi %[[VAL_12]], %[[VAL_14]] : index -// CHECK: %[[VAL_19:.*]] = hlfir.designate %[[VAL_6]]#0 substr %[[VAL_18]], %[[VAL_18]] typeparams %[[VAL_1]] : (!fir.boxchar<2>, index, index, index) -> !fir.ref<!fir.char<2>> -// CHECK: %[[VAL_20:.*]] = fir.load %[[VAL_19]] : !fir.ref<!fir.char<2>> -// CHECK: %[[VAL_21:.*]] = fir.extract_value %[[VAL_20]], [0 : index] : (!fir.char<2>) -> i16 -// CHECK: %[[VAL_22:.*]] = arith.cmpi eq, %[[VAL_21]], %[[VAL_11]] : i16 -// CHECK: %[[VAL_23:.*]] = arith.select %[[VAL_22]], %[[VAL_18]], %[[VAL_15]] : index -// CHECK: fir.result %[[VAL_23]] : index -// CHECK: } else { -// CHECK: fir.result %[[VAL_15]] : index -// CHECK: } -// CHECK: fir.result %[[VAL_17]] : index -// CHECK: } -// CHECK: %[[VAL_24:.*]] = fir.convert %[[VAL_13]] : (index) -> i32 -// CHECK: hlfir.assign %[[VAL_24]] to %[[VAL_4]]#0 : i32, !fir.ref<i32> -// CHECK: return -// CHECK: } - %0 = fir.dummy_scope : !fir.dscope - %1 = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt3En"} - %2:2 = hlfir.declare %1 {uniq_name = "_QFt3En"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) - %3:2 = fir.unboxchar %arg0 : (!fir.boxchar<2>) -> (!fir.ref<!fir.char<2,?>>, index) - %4:2 = hlfir.declare %3#0 typeparams %3#1 dummy_scope %0 {uniq_name = "_QFt3Es"} : (!fir.ref<!fir.char<2,?>>, index, !fir.dscope) -> (!fir.boxchar<2>, !fir.ref<!fir.char<2,?>>) - %5 = fir.address_of(@_QQcl2X6500) : !fir.ref<!fir.char<2>> - %c1 = arith.constant 1 : index - %6:2 = hlfir.declare %5 typeparams %c1 {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQcl2X6500"} : (!fir.ref<!fir.char<2>>, index) -> (!fir.ref<!fir.char<2>>, !fir.ref<!fir.char<2>>) - %true = arith.constant true - %7 = hlfir.index %6#0 in %4#0 back %true : (!fir.ref<!fir.char<2>>, !fir.boxchar<2>, i1) -> i32 - hlfir.assign %7 to %2#0 : i32, !fir.ref<i32> - return -} - -//inline as search loop (forward) -func.func @_QPt4(%arg0: !fir.boxchar<2> {fir.bindc_name = 
"s"}) { -// CHECK-LABEL: func.func @_QPt4( -// CHECK-SAME: %[[ARG0:.*]]: !fir.boxchar<2> {fir.bindc_name = "s"}) { -// CHECK: %[[VAL_0:.*]] = arith.constant 0 : index -// CHECK: %[[VAL_1:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope -// CHECK: %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt4En"} -// CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFt4En"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) -// CHECK: %[[VAL_5:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<2>) -> (!fir.ref<!fir.char<2,?>>, index) -// CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]]#0 typeparams %[[VAL_5]]#1 dummy_scope %[[VAL_2]] {uniq_name = "_QFt4Es"} : (!fir.ref<!fir.char<2,?>>, index, !fir.dscope) -> (!fir.boxchar<2>, !fir.ref<!fir.char<2,?>>) -// CHECK: %[[VAL_7:.*]] = fir.address_of(@_QQcl2X6500) : !fir.ref<!fir.char<2>> -// CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] typeparams %[[VAL_1]] {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQcl2X6500"} : (!fir.ref<!fir.char<2>>, index) -> (!fir.ref<!fir.char<2>>, !fir.ref<!fir.char<2>>) -// CHECK: %[[VAL_9:.*]] = hlfir.designate %[[VAL_8]]#0 substr %[[VAL_1]], %[[VAL_1]] typeparams %[[VAL_1]] : (!fir.ref<!fir.char<2>>, index, index, index) -> !fir.ref<!fir.char<2>> -// CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_9]] : !fir.ref<!fir.char<2>> -// CHECK: %[[VAL_11:.*]] = fir.extract_value %[[VAL_10]], [0 : index] : (!fir.char<2>) -> i16 -// CHECK: %[[VAL_12:.*]] = fir.do_loop %[[VAL_13:.*]] = %[[VAL_1]] to %[[VAL_5]]#1 step %[[VAL_1]] iter_args(%[[VAL_14:.*]] = %[[VAL_0]]) -> (index) { -// CHECK: %[[VAL_15:.*]] = arith.cmpi eq, %[[VAL_14]], %[[VAL_0]] : index -// CHECK: %[[VAL_16:.*]] = fir.if %[[VAL_15]] -> (index) { -// CHECK: %[[VAL_17:.*]] = hlfir.designate %[[VAL_6]]#0 substr %[[VAL_13]], %[[VAL_13]] typeparams %[[VAL_1]] : (!fir.boxchar<2>, index, index, index) -> !fir.ref<!fir.char<2>> -// CHECK: %[[VAL_18:.*]] = fir.load %[[VAL_17]] : !fir.ref<!fir.char<2>> -// CHECK: %[[VAL_19:.*]] = fir.extract_value %[[VAL_18]], [0 : index] : (!fir.char<2>) -> i16 -// CHECK: %[[VAL_20:.*]] = arith.cmpi eq, %[[VAL_19]], %[[VAL_11]] : i16 -// CHECK: %[[VAL_21:.*]] = arith.select %[[VAL_20]], %[[VAL_13]], %[[VAL_14]] : index -// CHECK: fir.result %[[VAL_21]] : index -// CHECK: } else { -// CHECK: fir.result %[[VAL_14]] : index -// CHECK: } -// CHECK: fir.result %[[VAL_16]] : index -// CHECK: } -// CHECK: %[[VAL_22:.*]] = fir.convert %[[VAL_12]] : (index) -> i32 -// CHECK: hlfir.assign %[[VAL_22]] to %[[VAL_4]]#0 : i32, !fir.ref<i32> -// CHECK: return -// CHECK: } - %0 = fir.dummy_scope : !fir.dscope - %1 = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt4En"} - %2:2 = hlfir.declare %1 {uniq_name = "_QFt4En"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) - %3:2 = fir.unboxchar %arg0 : (!fir.boxchar<2>) -> (!fir.ref<!fir.char<2,?>>, index) - %4:2 = hlfir.declare %3#0 typeparams %3#1 dummy_scope %0 {uniq_name = "_QFt4Es"} : (!fir.ref<!fir.char<2,?>>, index, !fir.dscope) -> (!fir.boxchar<2>, !fir.ref<!fir.char<2,?>>) - %5 = fir.address_of(@_QQcl2X6500) : !fir.ref<!fir.char<2>> - %c1 = arith.constant 1 : index - %6:2 = hlfir.declare %5 typeparams %c1 {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQcl2X6500"} : (!fir.ref<!fir.char<2>>, index) -> (!fir.ref<!fir.char<2>>, !fir.ref<!fir.char<2>>) - %false = arith.constant false - %7 = hlfir.index %6#0 in %4#0 back %false : (!fir.ref<!fir.char<2>>, !fir.boxchar<2>, i1) -> i32 - hlfir.assign %7 to %2#0 : i32, 
!fir.ref<i32> - return -} - -// Same as t4 above but result kind=1 -func.func @_QPt5(%arg0: !fir.boxchar<2> {fir.bindc_name = "s"}) { -// CHECK-LABEL: func.func @_QPt5( -// CHECK-SAME: %[[ARG0:.*]]: !fir.boxchar<2> {fir.bindc_name = "s"}) { -// CHECK: %[[VAL_0:.*]] = arith.constant 0 : index -// CHECK: %[[VAL_1:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope -// CHECK: %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt5En"} -// CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFt5En"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) -// CHECK: %[[VAL_5:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<2>) -> (!fir.ref<!fir.char<2,?>>, index) -// CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]]#0 typeparams %[[VAL_5]]#1 dummy_scope %[[VAL_2]] {uniq_name = "_QFt5Es"} : (!fir.ref<!fir.char<2,?>>, index, !fir.dscope) -> (!fir.boxchar<2>, !fir.ref<!fir.char<2,?>>) -// CHECK: %[[VAL_7:.*]] = fir.address_of(@_QQcl2X6500) : !fir.ref<!fir.char<2>> -// CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] typeparams %[[VAL_1]] {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQcl2X6500"} : (!fir.ref<!fir.char<2>>, index) -> (!fir.ref<!fir.char<2>>, !fir.ref<!fir.char<2>>) -// CHECK: %[[VAL_9:.*]] = hlfir.designate %[[VAL_8]]#0 substr %[[VAL_1]], %[[VAL_1]] typeparams %[[VAL_1]] : (!fir.ref<!fir.char<2>>, index, index, index) -> !fir.ref<!fir.char<2>> -// CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_9]] : !fir.ref<!fir.char<2>> -// CHECK: %[[VAL_11:.*]] = fir.extract_value %[[VAL_10]], [0 : index] : (!fir.char<2>) -> i16 -// CHECK: %[[VAL_12:.*]] = fir.do_loop %[[VAL_13:.*]] = %[[VAL_1]] to %[[VAL_5]]#1 step %[[VAL_1]] iter_args(%[[VAL_14:.*]] = %[[VAL_0]]) -> (index) { -// CHECK: %[[VAL_15:.*]] = arith.cmpi eq, %[[VAL_14]], %[[VAL_0]] : index -// CHECK: %[[VAL_16:.*]] = fir.if %[[VAL_15]] -> (index) { -// CHECK: %[[VAL_17:.*]] = hlfir.designate %[[VAL_6]]#0 substr %[[VAL_13]], %[[VAL_13]] typeparams %[[VAL_1]] : (!fir.boxchar<2>, index, index, index) -> !fir.ref<!fir.char<2>> -// CHECK: %[[VAL_18:.*]] = fir.load %[[VAL_17]] : !fir.ref<!fir.char<2>> -// CHECK: %[[VAL_19:.*]] = fir.extract_value %[[VAL_18]], [0 : index] : (!fir.char<2>) -> i16 -// CHECK: %[[VAL_20:.*]] = arith.cmpi eq, %[[VAL_19]], %[[VAL_11]] : i16 -// CHECK: %[[VAL_21:.*]] = arith.select %[[VAL_20]], %[[VAL_13]], %[[VAL_14]] : index -// CHECK: fir.result %[[VAL_21]] : index -// CHECK: } else { -// CHECK: fir.result %[[VAL_14]] : index -// CHECK: } -// CHECK: fir.result %[[VAL_16]] : index -// CHECK: } -// CHECK: %[[VAL_22:.*]] = fir.convert %[[VAL_12]] : (index) -> i8 -// CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_22]] : (i8) -> i32 -// CHECK: hlfir.assign %[[VAL_23]] to %[[VAL_4]]#0 : i32, !fir.ref<i32> -// CHECK: return -// CHECK: } - %0 = fir.dummy_scope : !fir.dscope - %1 = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt5En"} - %2:2 = hlfir.declare %1 {uniq_name = "_QFt5En"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) - %3:2 = fir.unboxchar %arg0 : (!fir.boxchar<2>) -> (!fir.ref<!fir.char<2,?>>, index) - %4:2 = hlfir.declare %3#0 typeparams %3#1 dummy_scope %0 {uniq_name = "_QFt5Es"} : (!fir.ref<!fir.char<2,?>>, index, !fir.dscope) -> (!fir.boxchar<2>, !fir.ref<!fir.char<2,?>>) - %5 = fir.address_of(@_QQcl2X6500) : !fir.ref<!fir.char<2>> - %c1 = arith.constant 1 : index - %6:2 = hlfir.declare %5 typeparams %c1 {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQcl2X6500"} : (!fir.ref<!fir.char<2>>, index) -> (!fir.ref<!fir.char<2>>, 
!fir.ref<!fir.char<2>>) - %false = arith.constant false - %7 = hlfir.index %6#0 in %4#0 back %false : (!fir.ref<!fir.char<2>>, !fir.boxchar<2>, i1) -> i8 - %8 = fir.convert %7 : (i8) -> i32 - hlfir.assign %8 to %2#0 : i32, !fir.ref<i32> - return - } - -// Do no simplify - runtime call for forward search with character kind=1 is faster -func.func @_QPt6(%arg0: !fir.boxchar<1> {fir.bindc_name = "s"}) { -// CHECK-LABEL: func.func @_QPt6( -// CHECK-SAME: %[[ARG0:.*]]: !fir.boxchar<1> {fir.bindc_name = "s"}) { -// CHECK: %[[VAL_0:.*]] = arith.constant false -// CHECK: %[[VAL_1:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope -// CHECK: %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt6En"} -// CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFt6En"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) -// CHECK: %[[VAL_5:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) -// CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]]#0 typeparams %[[VAL_5]]#1 dummy_scope %[[VAL_2]] {uniq_name = "_QFt6Es"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) -// CHECK: %[[VAL_7:.*]] = fir.address_of(@_QQclX65) : !fir.ref<!fir.char<1>> -// CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] typeparams %[[VAL_1]] {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQclX65"} : (!fir.ref<!fir.char<1>>, index) -> (!fir.ref<!fir.char<1>>, !fir.ref<!fir.char<1>>) -// CHECK: %[[VAL_9:.*]] = hlfir.index %[[VAL_8]]#0 in %[[VAL_6]]#0 back %[[VAL_0]] : (!fir.ref<!fir.char<1>>, !fir.boxchar<1>, i1) -> i32 -// CHECK: hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : i32, !fir.ref<i32> -// CHECK: return -// CHECK: } - %0 = fir.dummy_scope : !fir.dscope - %1 = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt6En"} - %2:2 = hlfir.declare %1 {uniq_name = "_QFt6En"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) - %3:2 = fir.unboxchar %arg0 : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) - %4:2 = hlfir.declare %3#0 typeparams %3#1 dummy_scope %0 {uniq_name = "_QFt6Es"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) - %5 = fir.address_of(@_QQclX65) : !fir.ref<!fir.char<1>> - %c1 = arith.constant 1 : index - %6:2 = hlfir.declare %5 typeparams %c1 {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQclX65"} : (!fir.ref<!fir.char<1>>, index) -> (!fir.ref<!fir.char<1>>, !fir.ref<!fir.char<1>>) - %false = arith.constant false - %7 = hlfir.index %6#0 in %4#0 back %false : (!fir.ref<!fir.char<1>>, !fir.boxchar<1>, i1) -> i32 - hlfir.assign %7 to %2#0 : i32, !fir.ref<i32> - return -} - -// Do not simplify - runtime call for forward search with character kind=1 is faster -// Lookup direction is unknown at compile time, hence forward is pessimistically assumed -func.func @_QPt7(%arg0: !fir.boxchar<1> {fir.bindc_name = "s"}, %arg1: !fir.ref<!fir.logical<4>> {fir.bindc_name = "b"}) { -// CHECK-LABEL: func.func @_QPt7( -// CHECK-SAME: %[[ARG0:.*]]: !fir.boxchar<1> {fir.bindc_name = "s"}, -// CHECK-SAME: %[[ARG1:.*]]: !fir.ref<!fir.logical<4>> {fir.bindc_name = "b"}) { -// CHECK: %[[VAL_0:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope -// CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_1]] {uniq_name = "_QFt7Eb"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) -// CHECK: %[[VAL_3:.*]] = fir.alloca i32 
{bindc_name = "n", uniq_name = "_QFt7En"} -// CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFt7En"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) -// CHECK: %[[VAL_5:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) -// CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]]#0 typeparams %[[VAL_5]]#1 dummy_scope %[[VAL_1]] {uniq_name = "_QFt7Es"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) -// CHECK: %[[VAL_7:.*]] = fir.address_of(@_QQclX65) : !fir.ref<!fir.char<1>> -// CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] typeparams %[[VAL_0]] {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQclX65"} : (!fir.ref<!fir.char<1>>, index) -> (!fir.ref<!fir.char<1>>, !fir.ref<!fir.char<1>>) -// CHECK: %[[VAL_9:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<!fir.logical<4>> -// CHECK: %[[VAL_10:.*]] = hlfir.index %[[VAL_8]]#0 in %[[VAL_6]]#0 back %[[VAL_9]] : (!fir.ref<!fir.char<1>>, !fir.boxchar<1>, !fir.logical<4>) -> i32 -// CHECK: hlfir.assign %[[VAL_10]] to %[[VAL_4]]#0 : i32, !fir.ref<i32> -// CHECK: return -// CHECK: } - %0 = fir.dummy_scope : !fir.dscope - %1:2 = hlfir.declare %arg1 dummy_scope %0 {uniq_name = "_QFt7Eb"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) - %2 = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt7En"} - %3:2 = hlfir.declare %2 {uniq_name = "_QFt7En"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) - %4:2 = fir.unboxchar %arg0 : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) - %5:2 = hlfir.declare %4#0 typeparams %4#1 dummy_scope %0 {uniq_name = "_QFt7Es"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) - %6 = fir.address_of(@_QQclX65) : !fir.ref<!fir.char<1>> - %c1 = arith.constant 1 : index - %7:2 = hlfir.declare %6 typeparams %c1 {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQclX65"} : (!fir.ref<!fir.char<1>>, index) -> (!fir.ref<!fir.char<1>>, !fir.ref<!fir.char<1>>) - %8 = fir.load %1#0 : !fir.ref<!fir.logical<4>> - %9 = hlfir.index %7#0 in %5#0 back %8 : (!fir.ref<!fir.char<1>>, !fir.boxchar<1>, !fir.logical<4>) -> i32 - hlfir.assign %9 to %3#0 : i32, !fir.ref<i32> - return -} - -// Inline as backward search loop for character kind=1. -// The case similar to t7 but direction is known, so it is faster than runtime call. 
-func.func @_QPt8(%arg0: !fir.boxchar<1> {fir.bindc_name = "s"}) { -// CHECK-LABEL: func.func @_QPt8( -// CHECK-SAME: %[[ARG0:.*]]: !fir.boxchar<1> {fir.bindc_name = "s"}) { -// CHECK: %[[VAL_0:.*]] = arith.constant 0 : index -// CHECK: %[[VAL_1:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope -// CHECK: %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt8En"} -// CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFt8En"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) -// CHECK: %[[VAL_5:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) -// CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]]#0 typeparams %[[VAL_5]]#1 dummy_scope %[[VAL_2]] {uniq_name = "_QFt8Es"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) -// CHECK: %[[VAL_7:.*]] = fir.address_of(@_QQclX65) : !fir.ref<!fir.char<1>> -// CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] typeparams %[[VAL_1]] {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQclX65"} : (!fir.ref<!fir.char<1>>, index) -> (!fir.ref<!fir.char<1>>, !fir.ref<!fir.char<1>>) -// CHECK: %[[VAL_9:.*]] = hlfir.designate %[[VAL_8]]#0 substr %[[VAL_1]], %[[VAL_1]] typeparams %[[VAL_1]] : (!fir.ref<!fir.char<1>>, index, index, index) -> !fir.ref<!fir.char<1>> -// CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_9]] : !fir.ref<!fir.char<1>> -// CHECK: %[[VAL_11:.*]] = fir.extract_value %[[VAL_10]], [0 : index] : (!fir.char<1>) -> i8 -// CHECK: %[[VAL_12:.*]] = arith.addi %[[VAL_5]]#1, %[[VAL_1]] : index -// CHECK: %[[VAL_13:.*]] = fir.do_loop %[[VAL_14:.*]] = %[[VAL_1]] to %[[VAL_5]]#1 step %[[VAL_1]] iter_args(%[[VAL_15:.*]] = %[[VAL_0]]) -> (index) { -// CHECK: %[[VAL_16:.*]] = arith.cmpi eq, %[[VAL_15]], %[[VAL_0]] : index -// CHECK: %[[VAL_17:.*]] = fir.if %[[VAL_16]] -> (index) { -// CHECK: %[[VAL_18:.*]] = arith.subi %[[VAL_12]], %[[VAL_14]] : index -// CHECK: %[[VAL_19:.*]] = hlfir.designate %[[VAL_6]]#0 substr %[[VAL_18]], %[[VAL_18]] typeparams %[[VAL_1]] : (!fir.boxchar<1>, index, index, index) -> !fir.ref<!fir.char<1>> -// CHECK: %[[VAL_20:.*]] = fir.load %[[VAL_19]] : !fir.ref<!fir.char<1>> -// CHECK: %[[VAL_21:.*]] = fir.extract_value %[[VAL_20]], [0 : index] : (!fir.char<1>) -> i8 -// CHECK: %[[VAL_22:.*]] = arith.cmpi eq, %[[VAL_21]], %[[VAL_11]] : i8 -// CHECK: %[[VAL_23:.*]] = arith.select %[[VAL_22]], %[[VAL_18]], %[[VAL_15]] : index -// CHECK: fir.result %[[VAL_23]] : index -// CHECK: } else { -// CHECK: fir.result %[[VAL_15]] : index -// CHECK: } -// CHECK: fir.result %[[VAL_17]] : index -// CHECK: } -// CHECK: %[[VAL_24:.*]] = fir.convert %[[VAL_13]] : (index) -> i32 -// CHECK: hlfir.assign %[[VAL_24]] to %[[VAL_4]]#0 : i32, !fir.ref<i32> -// CHECK: return -// CHECK: } - %0 = fir.dummy_scope : !fir.dscope - %1 = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt8En"} - %2:2 = hlfir.declare %1 {uniq_name = "_QFt8En"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) - %3:2 = fir.unboxchar %arg0 : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) - %4:2 = hlfir.declare %3#0 typeparams %3#1 dummy_scope %0 {uniq_name = "_QFt8Es"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) - %5 = fir.address_of(@_QQclX65) : !fir.ref<!fir.char<1>> - %c1 = arith.constant 1 : index - %6:2 = hlfir.declare %5 typeparams %c1 {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQclX65"} : (!fir.ref<!fir.char<1>>, index) -> (!fir.ref<!fir.char<1>>, 
!fir.ref<!fir.char<1>>) - %true = arith.constant true - %7 = hlfir.index %6#0 in %4#0 back %true : (!fir.ref<!fir.char<1>>, !fir.boxchar<1>, i1) -> i32 - hlfir.assign %7 to %2#0 : i32, !fir.ref<i32> - return -} - diff --git a/flang/test/Integration/debug-complex-1.f90 b/flang/test/Integration/debug-complex-1.f90 index 1ec4b7f..48ea029 100644 --- a/flang/test/Integration/debug-complex-1.f90 +++ b/flang/test/Integration/debug-complex-1.f90 @@ -17,8 +17,8 @@ contains end program ! CHECK-DAG: ![[C4:.*]] = !DIBasicType(name: "complex", size: 64, encoding: DW_ATE_complex_float) -! CHECK-DAG: ![[C8:.*]] = !DIBasicType(name: "complex", size: 128, encoding: DW_ATE_complex_float) -! CHECK-DAG: ![[C16:.*]] = !DIBasicType(name: "complex", size: 256, encoding: DW_ATE_complex_float) +! CHECK-DAG: ![[C8:.*]] = !DIBasicType(name: "complex*8", size: 128, encoding: DW_ATE_complex_float) +! CHECK-DAG: ![[C16:.*]] = !DIBasicType(name: "complex*16", size: 256, encoding: DW_ATE_complex_float) ! CHECK-DAG: !DILocalVariable(name: "c4"{{.*}}type: ![[C4]]) ! CHECK-DAG: !DILocalVariable(name: "c8"{{.*}}type: ![[C8]]) ! CHECK-DAG: !DILocalVariable(name: "r"{{.*}}type: ![[C16]]) diff --git a/flang/test/Integration/debug-local-var-2.f90 b/flang/test/Integration/debug-local-var-2.f90 index 0ddac63..93659a5 100644 --- a/flang/test/Integration/debug-local-var-2.f90 +++ b/flang/test/Integration/debug-local-var-2.f90 @@ -40,11 +40,11 @@ program mn ! BOTH-DAG: ![[MAIN:.*]] = distinct !DISubprogram(name: "MN", {{.*}}) ! BOTH-DAG: ![[TYI32:.*]] = !DIBasicType(name: "integer", size: 32, encoding: DW_ATE_signed) -! BOTH-DAG: ![[TYI64:.*]] = !DIBasicType(name: "integer", size: 64, encoding: DW_ATE_signed) -! BOTH-DAG: ![[TYL8:.*]] = !DIBasicType(name: "logical", size: 8, encoding: DW_ATE_boolean) +! BOTH-DAG: ![[TYI64:.*]] = !DIBasicType(name: "integer*8", size: 64, encoding: DW_ATE_signed) +! BOTH-DAG: ![[TYL8:.*]] = !DIBasicType(name: "logical*1", size: 8, encoding: DW_ATE_boolean) ! BOTH-DAG: ![[TYL32:.*]] = !DIBasicType(name: "logical", size: 32, encoding: DW_ATE_boolean) ! BOTH-DAG: ![[TYR32:.*]] = !DIBasicType(name: "real", size: 32, encoding: DW_ATE_float) -! BOTH-DAG: ![[TYR64:.*]] = !DIBasicType(name: "real", size: 64, encoding: DW_ATE_float) +! BOTH-DAG: ![[TYR64:.*]] = !DIBasicType(name: "real*8", size: 64, encoding: DW_ATE_float) ! BOTH-DAG: ![[I4]] = !DILocalVariable(name: "i4", scope: ![[MAIN]], file: !{{.*}}, line: [[@LINE+6]], type: ![[TYI32]]) ! BOTH-DAG: ![[I8]] = !DILocalVariable(name: "i8", scope: ![[MAIN]], file: !{{.*}}, line: [[@LINE+6]], type: ![[TYI64]]) diff --git a/flang/test/Integration/debug-split-dwarf.f90 b/flang/test/Integration/debug-split-dwarf.f90 index 60373ef..ebfa040 100644 --- a/flang/test/Integration/debug-split-dwarf.f90 +++ b/flang/test/Integration/debug-split-dwarf.f90 @@ -2,20 +2,28 @@ ! Testing to ensure that setting only -split-dwarf-file allows to place ! .dwo sections into regular output object. -! RUN: %flang_fc1 -debug-info-kind=standalone -triple x86_64-unknown-linux \ -! RUN: -split-dwarf-file %t.o -emit-obj -o %t.o %s -! RUN: llvm-readobj -S %t.o | FileCheck --check-prefix=DWO %s +! RUN: %flang_fc1 -debug-info-kind=standalone -triple x86_64-unknown-linux \ +! RUN: -split-dwarf-file %t.o -emit-obj -o %t.o %s +! RUN: llvm-readobj -S %t.o | FileCheck --check-prefix=DWO %s ! Testing to ensure that setting both -split-dwarf-file and -split-dwarf-output ! does not place .dwo sections into regular output object but in a separate ! file. -! 
RUN: %flang_fc1 -debug-info-kind=standalone -triple x86_64-unknown-linux \ -! RUN: -split-dwarf-file %t.dwo -split-dwarf-output %t.dwo -emit-obj -o %t.o %s -! RUN: llvm-readobj -S %t.dwo | FileCheck --check-prefix=DWO %s -! RUN: llvm-readobj -S %t.o | FileCheck --check-prefix=SPLIT %s +! RUN: %flang_fc1 -debug-info-kind=standalone -triple x86_64-unknown-linux \ +! RUN: -split-dwarf-file %t.dwo -split-dwarf-output %t.dwo -emit-obj -o %t.o %s +! RUN: llvm-readobj -S %t.dwo | FileCheck --check-prefix=DWO %s +! RUN: llvm-readobj -S %t.o | FileCheck --check-prefix=SPLIT %s -! DWO: .dwo -! SPLIT-NOT: .dwo +! Test that the splitDebugFilename field of the DICompileUnit gets correctly +! generated. +! RUN: %flang_fc1 -debug-info-kind=standalone -triple x86_64-unknown-linux \ +! RUN: -split-dwarf-file %t.test_dwo -split-dwarf-output %t.test_dwo \ +! RUN: -emit-llvm %s -o - | FileCheck --check-prefix=CU %s + +! DWO: .dwo +! SPLIT-NOT: .dwo +! CU: !DICompileUnit +! CU-SAME: splitDebugFilename: "{{.*}}test_dwo" program test end program test diff --git a/flang/test/Semantics/OpenMP/allocate-align01.f90 b/flang/test/Semantics/OpenMP/allocate-align01.f90 index 508efa82..4967330 100644 --- a/flang/test/Semantics/OpenMP/allocate-align01.f90 +++ b/flang/test/Semantics/OpenMP/allocate-align01.f90 @@ -13,7 +13,7 @@ program allocate_align_tree z = 3 !ERROR: The alignment value should be a constant positive integer !$omp allocate(j) align(xx) - !WARNING: OpenMP directive ALLOCATE has been deprecated, please use ALLOCATORS instead. [-Wopen-mp-usage] + !WARNING: The executable form of the OpenMP ALLOCATE directive has been deprecated, please use ALLOCATORS instead [-Wopen-mp-usage] !ERROR: The alignment value should be a constant positive integer !$omp allocate(xarray) align(-32) allocator(omp_large_cap_mem_alloc) allocate(j(z), xarray(t)) diff --git a/flang/test/Semantics/OpenMP/allocate01.f90 b/flang/test/Semantics/OpenMP/allocate01.f90 index 5280d1b..1d99811 100644 --- a/flang/test/Semantics/OpenMP/allocate01.f90 +++ b/flang/test/Semantics/OpenMP/allocate01.f90 @@ -19,7 +19,7 @@ use omp_lib !$omp allocate(y) print *, a - !WARNING: OpenMP directive ALLOCATE has been deprecated, please use ALLOCATORS instead. [-Wopen-mp-usage] + !WARNING: The executable form of the OpenMP ALLOCATE directive has been deprecated, please use ALLOCATORS instead [-Wopen-mp-usage] !$omp allocate(x) allocator(omp_default_mem_alloc) allocate ( x(a), darray(a, b) ) end subroutine sema diff --git a/flang/test/Semantics/OpenMP/do08.f90 b/flang/test/Semantics/OpenMP/do08.f90 index 5143dff..bb3c1d0c 100644 --- a/flang/test/Semantics/OpenMP/do08.f90 +++ b/flang/test/Semantics/OpenMP/do08.f90 @@ -61,6 +61,7 @@ program omp !$omp end do + !ERROR: Canonical loop nest must be perfectly nested. !ERROR: The value of the parameter in the COLLAPSE or ORDERED clause must not be larger than the number of nested loops following the construct. !$omp do collapse(3) do 60 i=2,200,2 diff --git a/flang/test/Semantics/OpenMP/do13.f90 b/flang/test/Semantics/OpenMP/do13.f90 index 6e9d1dd..8f7844f 100644 --- a/flang/test/Semantics/OpenMP/do13.f90 +++ b/flang/test/Semantics/OpenMP/do13.f90 @@ -59,6 +59,7 @@ program omp !$omp end do + !ERROR: Canonical loop nest must be perfectly nested. !ERROR: The value of the parameter in the COLLAPSE or ORDERED clause must not be larger than the number of nested loops following the construct.
!$omp do collapse(3) do 60 i=1,10 diff --git a/flang/test/Semantics/OpenMP/do22.f90 b/flang/test/Semantics/OpenMP/do22.f90 new file mode 100644 index 0000000..9d96d3a --- /dev/null +++ b/flang/test/Semantics/OpenMP/do22.f90 @@ -0,0 +1,73 @@ +! RUN: %python %S/../test_errors.py %s %flang -fopenmp +! Check for existence of loop following a DO directive + +subroutine do_imperfectly_nested_before + integer i, j + + !ERROR: The value of the parameter in the COLLAPSE or ORDERED clause must not be larger than the number of nested loops following the construct. + !$omp do collapse(2) + do i = 1, 10 + print *, i + do j = 1, 10 + print *, i, j + end do + end do + !$omp end do +end subroutine + + +subroutine do_imperfectly_nested_behind + integer i, j + + !ERROR: Canonical loop nest must be perfectly nested. + !$omp do collapse(2) + do i = 1, 10 + do j = 1, 10 + print *, i, j + end do + print *, i + end do + !$omp end do +end subroutine + + +subroutine do_nonrectangular_lb + integer i, j + + !ERROR: Trip count must be computable and invariant + !$omp do collapse(2) + do i = 1, 10 + do j = i, 10 + print *, i, j + end do + end do + !$omp end do +end subroutine + + +subroutine do_nonrectangular_ub + integer i, j + + !ERROR: Trip count must be computable and invariant + !$omp do collapse(2) + do i = 1, 10 + do j = 0, i + print *, i, j + end do + end do + !$omp end do +end subroutine + + +subroutine do_nonrectangular_step + integer i, j + + !ERROR: Trip count must be computable and invariant + !$omp do collapse(2) + do i = 1, 10 + do j = 1, 10, i + print *, i, j + end do + end do + !$omp end do +end subroutine diff --git a/flang/test/Transforms/debug-complex-1.fir b/flang/test/Transforms/debug-complex-1.fir index f7be6b2..7a288fe 100644 --- a/flang/test/Transforms/debug-complex-1.fir +++ b/flang/test/Transforms/debug-complex-1.fir @@ -26,9 +26,9 @@ module { #loc3 = loc("./simple.f90":8:1) #loc4 = loc("./simple.f90":11:1) -// CHECK-DAG: #[[CMPX8:.*]] = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "complex", sizeInBits = 128, encoding = DW_ATE_complex_float> +// CHECK-DAG: #[[CMPX8:.*]] = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "complex*8", sizeInBits = 128, encoding = DW_ATE_complex_float> // CHECK-DAG: #[[CMPX4:.*]] = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "complex", sizeInBits = 64, encoding = DW_ATE_complex_float> -// CHECK-DAG: #[[CMPX16:.*]] = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "complex", sizeInBits = 256, encoding = DW_ATE_complex_float> +// CHECK-DAG: #[[CMPX16:.*]] = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "complex*16", sizeInBits = 256, encoding = DW_ATE_complex_float> // CHECK-DAG: #[[TY1:.*]] = #llvm.di_subroutine_type<{{.*}}types = #[[CMPX8]], #[[CMPX4]]> // CHECK-DAG: #[[TY2:.*]] = #llvm.di_subroutine_type<{{.*}}types = #[[CMPX16]], #[[CMPX4]]> diff --git a/flang/test/Transforms/debug-derived-type-1.fir b/flang/test/Transforms/debug-derived-type-1.fir index cfbd361..672b6cf 100644 --- a/flang/test/Transforms/debug-derived-type-1.fir +++ b/flang/test/Transforms/debug-derived-type-1.fir @@ -45,12 +45,12 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr<272>, d // CHECK-DAG: #[[INT_TY:.*]] = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "integer", sizeInBits = 32, encoding = DW_ATE_signed> -// CHECK-DAG: #[[INT8_TY:.*]] = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "integer", sizeInBits = 64, encoding = DW_ATE_signed> +// CHECK-DAG: #[[INT8_TY:.*]] = #llvm.di_basic_type<tag = DW_TAG_base_type, name = 
"integer*8", sizeInBits = 64, encoding = DW_ATE_signed> // CHECK-DAG: #[[REAL4_TY:.*]] = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "real", sizeInBits = 32, encoding = DW_ATE_float> // CHECK-DAG: #[[CMX8_TY:.*]] = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "complex", sizeInBits = 64, encoding = DW_ATE_complex_float> // CHECK-DAG: #[[CMX_ARR:.*]] = #llvm.di_composite_type<tag = DW_TAG_array_type, baseType = #[[CMX8_TY:.*]]{{.*}}> -// CHECK-DAG: #[[LOG_TY:.*]] = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "logical", sizeInBits = 8, encoding = DW_ATE_boolean> -// CHECK-DAG: #[[REAL8_TY:.*]] = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "real", sizeInBits = 64, encoding = DW_ATE_float> +// CHECK-DAG: #[[LOG_TY:.*]] = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "logical*1", sizeInBits = 8, encoding = DW_ATE_boolean> +// CHECK-DAG: #[[REAL8_TY:.*]] = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "real*8", sizeInBits = 64, encoding = DW_ATE_float> // CHECK-DAG: #[[STR_TY:.*]] = #llvm.di_string_type // CHECK-DAG: #[[MOD:.*]] = #llvm.di_module<{{.*}}name = "m_employee"{{.*}}> // CHECK-DAG: #[[MOD1:.*]] = #llvm.di_module<{{.*}}name = "t1"{{.*}}> diff --git a/flang/test/Transforms/debug-fn-info.fir b/flang/test/Transforms/debug-fn-info.fir index c02835b..d82cef1 100644 --- a/flang/test/Transforms/debug-fn-info.fir +++ b/flang/test/Transforms/debug-fn-info.fir @@ -64,10 +64,10 @@ module { #loc4 = loc("test2.f90":53:22) -// CHECK-DAG: #[[INT8:.*]] = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "integer", sizeInBits = 64, encoding = DW_ATE_signed> +// CHECK-DAG: #[[INT8:.*]] = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "integer*8", sizeInBits = 64, encoding = DW_ATE_signed> // CHECK-DAG: #[[INT4:.*]] = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "integer", sizeInBits = 32, encoding = DW_ATE_signed> -// CHECK-DAG: #[[REAL8:.*]] = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "real", sizeInBits = 64, encoding = DW_ATE_float> -// CHECK-DAG: #[[LOG1:.*]] = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "logical", sizeInBits = 8, encoding = DW_ATE_boolean> +// CHECK-DAG: #[[REAL8:.*]] = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "real*8", sizeInBits = 64, encoding = DW_ATE_float> +// CHECK-DAG: #[[LOG1:.*]] = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "logical*1", sizeInBits = 8, encoding = DW_ATE_boolean> // CHECK-DAG: #[[REAL4:.*]] = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "real", sizeInBits = 32, encoding = DW_ATE_float> // CHECK-DAG: #[[LOG4:.*]] = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "logical", sizeInBits = 32, encoding = DW_ATE_boolean> // CHECK: #[[TY0:.*]] = #llvm.di_subroutine_type<callingConvention = DW_CC_program, types = #di_null_type> diff --git a/flang/test/Transforms/debug-local-var.fir b/flang/test/Transforms/debug-local-var.fir index 06c9b01e..466f79c 100644 --- a/flang/test/Transforms/debug-local-var.fir +++ b/flang/test/Transforms/debug-local-var.fir @@ -71,10 +71,10 @@ module { #loc15 = loc("test.f90":21:24) #loc16 = loc("test.f90":22:5) -// CHECK-DAG: #[[INT8:.*]] = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "integer", sizeInBits = 64, encoding = DW_ATE_signed> +// CHECK-DAG: #[[INT8:.*]] = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "integer*8", sizeInBits = 64, encoding = DW_ATE_signed> // CHECK-DAG: #[[INT4:.*]] = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "integer", sizeInBits = 32, encoding = DW_ATE_signed> -// CHECK-DAG: #[[REAL8:.*]] = 
#llvm.di_basic_type<tag = DW_TAG_base_type, name = "real", sizeInBits = 64, encoding = DW_ATE_float> -// CHECK-DAG: #[[LOG1:.*]] = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "logical", sizeInBits = 8, encoding = DW_ATE_boolean> +// CHECK-DAG: #[[REAL8:.*]] = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "real*8", sizeInBits = 64, encoding = DW_ATE_float> +// CHECK-DAG: #[[LOG1:.*]] = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "logical*1", sizeInBits = 8, encoding = DW_ATE_boolean> // CHECK-DAG: #[[REAL4:.*]] = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "real", sizeInBits = 32, encoding = DW_ATE_float> // CHECK-DAG: #[[LOG4:.*]] = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "logical", sizeInBits = 32, encoding = DW_ATE_boolean> // CHECK-DAG: #[[MAIN:.*]] = #llvm.di_subprogram<{{.*}}name = "mn"{{.*}}> diff --git a/flang/test/Transforms/debug-ref-type.fir b/flang/test/Transforms/debug-ref-type.fir index 745aebe..2164a40 100644 --- a/flang/test/Transforms/debug-ref-type.fir +++ b/flang/test/Transforms/debug-ref-type.fir @@ -5,6 +5,6 @@ module { } #loc1 = loc("test.f90":5:1) -// CHECK: #[[INT8_TY:.*]] = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "integer", sizeInBits = 8, encoding = DW_ATE_signed> +// CHECK: #[[INT8_TY:.*]] = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "integer*1", sizeInBits = 8, encoding = DW_ATE_signed> // CHECK: #[[REF_TY:.*]] = #llvm.di_derived_type<tag = DW_TAG_pointer_type, name = "", baseType = #[[INT8_TY]]{{.*}}> // CHECK: #llvm.di_subroutine_type<{{.*}}types = #[[REF_TY]], #[[INT8_TY]]> diff --git a/flang/test/Transforms/debug-split-dwarf.fir b/flang/test/Transforms/debug-split-dwarf.fir new file mode 100644 index 0000000..9c09545 --- /dev/null +++ b/flang/test/Transforms/debug-split-dwarf.fir @@ -0,0 +1,12 @@ +// RUN: fir-opt --add-debug-info="split-dwarf-file=test.dwo" \ +// RUN: --mlir-print-debuginfo %s -o - | FileCheck %s + +module { + func.func @test() { + return + } loc(#loc1) +} +#loc1 = loc("test.f90":15:1) + +// CHECK: llvm.di_compile_unit +// CHECK-SAME: splitDebugFilename = "test.dwo" diff --git a/flang/test/Transforms/debug-tuple-type.fir b/flang/test/Transforms/debug-tuple-type.fir index e3b0baf..b865d49 100644 --- a/flang/test/Transforms/debug-tuple-type.fir +++ b/flang/test/Transforms/debug-tuple-type.fir @@ -5,7 +5,7 @@ module { func.func private @_FortranAioOutputDerivedType(!fir.ref<tuple<>>) } -// CHECK: #[[F64:.*]] = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "real", sizeInBits = 64, encoding = DW_ATE_float> +// CHECK: #[[F64:.*]] = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "real*8", sizeInBits = 64, encoding = DW_ATE_float> // CHECK: #[[CU:.*]] = #llvm.di_compile_unit<{{.*}}> // CHECK: #[[DTY1:.*]] = #llvm.di_derived_type<tag = DW_TAG_member, name = "", baseType = #[[F64]], sizeInBits = 64, alignInBits = {{.*}}> // CHECK: #[[DTY2:.*]] = #llvm.di_derived_type<tag = DW_TAG_member, name = "", baseType = #[[F64]], sizeInBits = 64, alignInBits = {{.*}}, offsetInBits = {{.*}}> diff --git a/flang/test/Transforms/debug-vector-type.fir b/flang/test/Transforms/debug-vector-type.fir index d3e1f6e..cfb97ea 100644 --- a/flang/test/Transforms/debug-vector-type.fir +++ b/flang/test/Transforms/debug-vector-type.fir @@ -2,22 +2,22 @@ module { func.func private @foo1(%arg0: !fir.vector<20:bf16>) -// CHECK-DAG: #[[F16:.*]] = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "real", sizeInBits = 16, encoding = DW_ATE_float> -// CHECK-DAG: #llvm.di_composite_type<tag = DW_TAG_array_type, name = 
"vector real (20)", baseType = #[[F16]], flags = Vector, sizeInBits = 320, elements = #llvm.di_subrange<count = 20 : i64>> +// CHECK-DAG: #[[F16:.*]] = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "real*2", sizeInBits = 16, encoding = DW_ATE_float> +// CHECK-DAG: #llvm.di_composite_type<tag = DW_TAG_array_type, name = "vector real*2 (20)", baseType = #[[F16]], flags = Vector, sizeInBits = 320, elements = #llvm.di_subrange<count = 20 : i64>> func.func private @foo2(%arg0: !fir.vector<30:f32>) // CHECK-DAG: #[[F32:.*]] = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "real", sizeInBits = 32, encoding = DW_ATE_float> // CHECK-DAG: #llvm.di_composite_type<tag = DW_TAG_array_type, name = "vector real (30)", baseType = #[[F32]], flags = Vector, sizeInBits = 960, elements = #llvm.di_subrange<count = 30 : i64>> func.func private @foo3(%arg0: !fir.vector<10:f64>) -// CHECK-DAG: #[[F64:.*]] = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "real", sizeInBits = 64, encoding = DW_ATE_float> -// CHECK-DAG: #llvm.di_composite_type<tag = DW_TAG_array_type, name = "vector real (10)", baseType = #[[F64]], flags = Vector, sizeInBits = 640, elements = #llvm.di_subrange<count = 10 : i64>> +// CHECK-DAG: #[[F64:.*]] = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "real*8", sizeInBits = 64, encoding = DW_ATE_float> +// CHECK-DAG: #llvm.di_composite_type<tag = DW_TAG_array_type, name = "vector real*8 (10)", baseType = #[[F64]], flags = Vector, sizeInBits = 640, elements = #llvm.di_subrange<count = 10 : i64>> func.func private @foo4(%arg0: !fir.vector<5:i32>) // CHECK-DAG: #[[I32:.*]] = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "integer", sizeInBits = 32, encoding = DW_ATE_signed> // CHECK-DAG: #llvm.di_composite_type<tag = DW_TAG_array_type, name = "vector integer (5)", baseType = #[[I32]], flags = Vector, sizeInBits = 160, elements = #llvm.di_subrange<count = 5 : i64>> func.func private @foo5(%arg0: !fir.vector<2:i64>) -// CHECK-DAG: #[[I64:.*]] = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "integer", sizeInBits = 64, encoding = DW_ATE_signed> -// CHECK-DAG: #llvm.di_composite_type<tag = DW_TAG_array_type, name = "vector integer (2)", baseType = #[[I64]], flags = Vector, sizeInBits = 128, elements = #llvm.di_subrange<count = 2 : i64>> +// CHECK-DAG: #[[I64:.*]] = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "integer*8", sizeInBits = 64, encoding = DW_ATE_signed> +// CHECK-DAG: #llvm.di_composite_type<tag = DW_TAG_array_type, name = "vector integer*8 (2)", baseType = #[[I64]], flags = Vector, sizeInBits = 128, elements = #llvm.di_subrange<count = 2 : i64>> } diff --git a/libcxx/docs/ReleaseNotes/22.rst b/libcxx/docs/ReleaseNotes/22.rst index 87d86c134..8d023a1 100644 --- a/libcxx/docs/ReleaseNotes/22.rst +++ b/libcxx/docs/ReleaseNotes/22.rst @@ -41,6 +41,7 @@ Implemented Papers - P2321R2: ``zip`` (`Github <https://llvm.org/PR105169>`__) (The paper is partially implemented. 
``zip_transform_view`` is implemented in this release) - P3044R2: sub-``string_view`` from ``string`` (`Github <https://llvm.org/PR148140>`__) +- P3223R2: Making ``std::istream::ignore`` less surprising (`Github <https://llvm.org/PR148178>`__) - P3168R2: Give ``std::optional`` Range Support (`Github <https://llvm.org/PR105430>`__) Improvements and New Features diff --git a/libcxx/docs/Status/Cxx2cPapers.csv b/libcxx/docs/Status/Cxx2cPapers.csv index 9e1678f..4e0918b 100644 --- a/libcxx/docs/Status/Cxx2cPapers.csv +++ b/libcxx/docs/Status/Cxx2cPapers.csv @@ -151,7 +151,7 @@ "`P3111R8 <https://wg21.link/P3111R8>`__","Atomic Reduction Operations","2025-06 (Sofia)","","","`#148174 <https://github.com/llvm/llvm-project/issues/148174>`__","" "`P3060R3 <https://wg21.link/P3060R3>`__","Add ``std::views::indices(n)``","2025-06 (Sofia)","","","`#148175 <https://github.com/llvm/llvm-project/issues/148175>`__","" "`P2319R5 <https://wg21.link/P2319R5>`__","Prevent ``path`` presentation problems","2025-06 (Sofia)","","","`#148177 <https://github.com/llvm/llvm-project/issues/148177>`__","" -"`P3223R2 <https://wg21.link/P3223R2>`__","Making ``std::istream::ignore`` less surprising","2025-06 (Sofia)","","","`#148178 <https://github.com/llvm/llvm-project/issues/148178>`__","" +"`P3223R2 <https://wg21.link/P3223R2>`__","Making ``std::istream::ignore`` less surprising","2025-06 (Sofia)","|Complete|","22","`#148178 <https://github.com/llvm/llvm-project/issues/148178>`__","" "`P2781R9 <https://wg21.link/P2781R9>`__","``std::constant_wrapper``","2025-06 (Sofia)","","","`#148179 <https://github.com/llvm/llvm-project/issues/148179>`__","" "`P3697R1 <https://wg21.link/P3697R1>`__","Minor additions to C++26 standard library hardening","2025-06 (Sofia)","","","`#148180 <https://github.com/llvm/llvm-project/issues/148180>`__","" "`P3552R3 <https://wg21.link/P3552R3>`__","Add a Coroutine Task Type","2025-06 (Sofia)","","","`#148182 <https://github.com/llvm/llvm-project/issues/148182>`__","" diff --git a/libcxx/include/__utility/default_three_way_comparator.h b/libcxx/include/__utility/default_three_way_comparator.h index 438ab55..92cdce6 100644 --- a/libcxx/include/__utility/default_three_way_comparator.h +++ b/libcxx/include/__utility/default_three_way_comparator.h @@ -40,13 +40,13 @@ struct __default_three_way_comparator<_LHS, } }; -#if _LIBCPP_STD_VER >= 20 && __has_builtin(__builtin_lt_synthesises_from_spaceship) +#if _LIBCPP_STD_VER >= 20 && __has_builtin(__builtin_lt_synthesizes_from_spaceship) template <class _LHS, class _RHS> struct __default_three_way_comparator< _LHS, _RHS, __enable_if_t<!(is_arithmetic<_LHS>::value && is_arithmetic<_RHS>::value) && - __builtin_lt_synthesises_from_spaceship(const _LHS&, const _RHS&)>> { + __builtin_lt_synthesizes_from_spaceship(const _LHS&, const _RHS&)>> { _LIBCPP_HIDE_FROM_ABI static int operator()(const _LHS& __lhs, const _RHS& __rhs) { auto __res = __lhs <=> __rhs; if (__res < 0) diff --git a/libcxx/include/istream b/libcxx/include/istream index 93def61..7f15521 100644 --- a/libcxx/include/istream +++ b/libcxx/include/istream @@ -70,6 +70,7 @@ public: basic_istream& getline(char_type* s, streamsize n, char_type delim); basic_istream& ignore(streamsize n = 1, int_type delim = traits_type::eof()); + basic_istream& ignore(streamsize n, char_type delim); // Since C++26, implemented as a DR int_type peek(); basic_istream& read (char_type* s, streamsize n); streamsize readsome(char_type* s, streamsize n); @@ -172,6 +173,7 @@ template <class Stream, class T> # include 
<__type_traits/conjunction.h> # include <__type_traits/enable_if.h> # include <__type_traits/is_base_of.h> +# include <__type_traits/is_same.h> # include <__type_traits/make_unsigned.h> # include <__utility/declval.h> # include <__utility/forward.h> @@ -292,6 +294,10 @@ public: basic_istream& getline(char_type* __s, streamsize __n, char_type __dlm); basic_istream& ignore(streamsize __n = 1, int_type __dlm = traits_type::eof()); + template <class _Tp = char_type, __enable_if_t<is_same<_Tp, char>::value, int> = 0> + _LIBCPP_HIDE_FROM_ABI basic_istream& ignore(streamsize __n, char_type __delim) { + return ignore(__n, traits_type::to_int_type(__delim)); + } int_type peek(); basic_istream& read(char_type* __s, streamsize __n); streamsize readsome(char_type* __s, streamsize __n); diff --git a/libcxx/include/string b/libcxx/include/string index 729a420..cfd6861 100644 --- a/libcxx/include/string +++ b/libcxx/include/string @@ -2552,7 +2552,7 @@ _LIBCPP_STRING_V1_EXTERN_TEMPLATE_LIST(_LIBCPP_DECLARE, wchar_t) # endif # undef _LIBCPP_DECLARE -# if _LIBCPP_STD_VER <= 17 || !__has_builtin(__builtin_lt_synthesises_from_spaceship) +# if _LIBCPP_STD_VER <= 17 || !__has_builtin(__builtin_lt_synthesizes_from_spaceship) template <class _CharT, class _Traits, class _Alloc> struct __default_three_way_comparator<basic_string<_CharT, _Traits, _Alloc>, basic_string<_CharT, _Traits, _Alloc> > { using __string_t _LIBCPP_NODEBUG = basic_string<_CharT, _Traits, _Alloc>; diff --git a/libcxx/test/libcxx/utilities/utility/has_default_three_way.compile.pass.cpp b/libcxx/test/libcxx/utilities/utility/has_default_three_way.compile.pass.cpp index 42b4855..625b194 100644 --- a/libcxx/test/libcxx/utilities/utility/has_default_three_way.compile.pass.cpp +++ b/libcxx/test/libcxx/utilities/utility/has_default_three_way.compile.pass.cpp @@ -18,7 +18,7 @@ static_assert(std::__has_default_three_way_comparator<long, int>::value); static_assert(std::__has_default_three_way_comparator<long, long>::value); static_assert(std::__has_default_three_way_comparator<std::string, std::string>::value); -#if __has_builtin(__builtin_lt_synthesises_from_spaceship) +#if __has_builtin(__builtin_lt_synthesizes_from_spaceship) static_assert(std::__has_default_three_way_comparator<const std::string&, const std::string&>::value); static_assert(std::__has_default_three_way_comparator<const std::string&, const std::string_view&>::value); static_assert(std::__has_default_three_way_comparator<std::string, std::string_view>::value); diff --git a/libcxx/test/std/containers/sequences/vector/common.h b/libcxx/test/std/containers/sequences/vector/common.h index 4af6559..34453f8 100644 --- a/libcxx/test/std/containers/sequences/vector/common.h +++ b/libcxx/test/std/containers/sequences/vector/common.h @@ -214,10 +214,10 @@ struct throwing_iterator { }; inline void check_new_delete_called() { - assert(globalMemCounter.new_called == globalMemCounter.delete_called); - assert(globalMemCounter.new_array_called == globalMemCounter.delete_array_called); - assert(globalMemCounter.aligned_new_called == globalMemCounter.aligned_delete_called); - assert(globalMemCounter.aligned_new_array_called == globalMemCounter.aligned_delete_array_called); + ASSERT_WITH_LIBRARY_INTERNAL_ALLOCATIONS(globalMemCounter.new_called == globalMemCounter.delete_called); + ASSERT_WITH_LIBRARY_INTERNAL_ALLOCATIONS(globalMemCounter.new_array_called == globalMemCounter.delete_array_called); + ASSERT_WITH_LIBRARY_INTERNAL_ALLOCATIONS(globalMemCounter.aligned_new_called == 
globalMemCounter.aligned_delete_called); + ASSERT_WITH_LIBRARY_INTERNAL_ALLOCATIONS(globalMemCounter.aligned_new_array_called == globalMemCounter.aligned_delete_array_called); } template <class T, typename Alloc> diff --git a/libcxx/test/std/input.output/filesystems/class.path/path.member/path.append.pass.cpp b/libcxx/test/std/input.output/filesystems/class.path/path.member/path.append.pass.cpp index 3442019..b3d96c2 100644 --- a/libcxx/test/std/input.output/filesystems/class.path/path.member/path.append.pass.cpp +++ b/libcxx/test/std/input.output/filesystems/class.path/path.member/path.append.pass.cpp @@ -12,6 +12,11 @@ // These tests require locale for non-char paths // UNSUPPORTED: no-localization +// In MinGW mode, with optimizations enabled with a DLL, the number of counted +// allocations mismatches, as some ctor/dtor calls are generated in the +// calling code, and some are called from the DLL. +// ADDITIONAL_COMPILE_FLAGS: -DALLOW_MISMATCHING_LIBRRARY_INTERNAL_ALLOCATIONS + // <filesystem> // class path diff --git a/libcxx/test/std/input.output/filesystems/class.path/path.member/path.concat.pass.cpp b/libcxx/test/std/input.output/filesystems/class.path/path.member/path.concat.pass.cpp index 5596de7..570d303 100644 --- a/libcxx/test/std/input.output/filesystems/class.path/path.member/path.concat.pass.cpp +++ b/libcxx/test/std/input.output/filesystems/class.path/path.member/path.concat.pass.cpp @@ -12,6 +12,11 @@ // These tests require locale for non-char paths // UNSUPPORTED: no-localization +// In MinGW mode, with optimizations enabled with a DLL, the number of counted +// allocations mismatches, as some ctor/dtor calls are generated in the +// calling code, and some are called from the DLL. +// ADDITIONAL_COMPILE_FLAGS: -DALLOW_MISMATCHING_LIBRRARY_INTERNAL_ALLOCATIONS + // <filesystem> // class path diff --git a/libcxx/test/std/input.output/iostream.format/input.streams/istream.unformatted/ignore.char_type.pass.cpp b/libcxx/test/std/input.output/iostream.format/input.streams/istream.unformatted/ignore.char_type.pass.cpp new file mode 100644 index 0000000..d0d174c --- /dev/null +++ b/libcxx/test/std/input.output/iostream.format/input.streams/istream.unformatted/ignore.char_type.pass.cpp @@ -0,0 +1,41 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// Requires 396145d in the built library. +// XFAIL: using-built-library-before-llvm-9 +// XFAIL: FROZEN-CXX03-HEADERS-FIXME + +// <istream> + +// basic_istream& ignore(streamsize n, char_type delim); + +#include <cassert> +#include <sstream> +#include <string> + +#include "test_macros.h" + +int main(int, char**) { + std::istringstream in("\xF0\x9F\xA4\xA1 Clown Face"); + in.ignore(100, '\xA1'); // Ignore up to '\xA1' delimiter, + // previously might have ignored to EOF. + + assert(in.gcount() == 4); // 4 bytes were ignored. + assert(in.peek() == ' '); // Next character is a space. + + std::string str; // Read the next word. + in >> str; + assert(str == "Clown"); + + // Parameter value "-1L" doesn't cause ambiguity with the char_type overload. + in.ignore(100, -1L); // Ignore up to EOF, which is the default behavior. + assert(in.eof()); // Stream should be at EOF now. 
+ assert(in.gcount() == 5); + + return 0; +} diff --git a/libcxx/test/support/count_new.h b/libcxx/test/support/count_new.h index c8169d3..f175bc2 100644 --- a/libcxx/test/support/count_new.h +++ b/libcxx/test/support/count_new.h @@ -626,7 +626,11 @@ struct RequireAllocationGuard { void requireExactly(std::size_t N) { m_req_alloc = N; m_exactly = true; } ~RequireAllocationGuard() { +#ifdef ALLOW_MISMATCHING_LIBRRARY_INTERNAL_ALLOCATIONS + ASSERT_WITH_LIBRARY_INTERNAL_ALLOCATIONS(globalMemCounter.checkOutstandingNewEq(static_cast<int>(m_outstanding_new_on_init))); +#else assert(globalMemCounter.checkOutstandingNewEq(static_cast<int>(m_outstanding_new_on_init))); +#endif std::size_t Expect = m_new_count_on_init + m_req_alloc; assert(globalMemCounter.checkNewCalledEq(static_cast<int>(Expect)) || (!m_exactly && globalMemCounter.checkNewCalledGreaterThan(static_cast<int>(Expect)))); diff --git a/lld/ELF/InputSection.cpp b/lld/ELF/InputSection.cpp index 1270f27..ff7ef2d 100644 --- a/lld/ELF/InputSection.cpp +++ b/lld/ELF/InputSection.cpp @@ -1357,6 +1357,8 @@ SyntheticSection *EhInputSection::getParent() const { // .eh_frame is a sequence of CIE or FDE records. // This function splits an input section into records and returns them. +// In rare cases (.eh_frame pieces are reordered by a linker script), the +// relocations may be unordered. template <class ELFT> void EhInputSection::split() { const RelsOrRelas<ELFT> elfRels = relsOrRelas<ELFT>(); if (elfRels.areRelocsCrel()) diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp index bd96c05..84b9b5e 100644 --- a/lld/ELF/Relocations.cpp +++ b/lld/ELF/Relocations.cpp @@ -6,37 +6,22 @@ // //===----------------------------------------------------------------------===// // -// This file contains platform-independent functions to process relocations. -// I'll describe the overview of this file here. +// This file implements the core relocation processing logic. It analyzes +// relocations and determines what auxiliary data structures (GOT, PLT, copy +// relocations) need to be created during linking. // -// Simple relocations are easy to handle for the linker. For example, -// for R_X86_64_PC64 relocs, the linker just has to fix up locations -// with the relative offsets to the target symbols. It would just be -// reading records from relocation sections and applying them to output. +// The main entry point is scanRelocations<ELFT>(), which calls scanSection() +// to process all relocations within an input section. For each relocation, +// scan() analyzes the type and target, and determines whether a synthetic +// section entry or dynamic relocation is needed. // -// But not all relocations are that easy to handle. For example, for -// R_386_GOTOFF relocs, the linker has to create new GOT entries for -// symbols if they don't exist, and fix up locations with GOT entry -// offsets from the beginning of GOT section. So there is more than -// fixing addresses in relocation processing. +// Note: This file analyzes what needs to be done but doesn't apply the +// actual relocations - that happens later in InputSection::writeTo(). +// Instead, it populates Relocation objects in InputSectionBase::relocations +// and creates necessary synthetic sections (GOT, PLT, etc.). // -// ELF defines a large number of complex relocations. -// -// The functions in this file analyze relocations and do whatever needs -// to be done. It includes, but not limited to, the following. 
-// -// - create GOT/PLT entries -// - create new relocations in .dynsym to let the dynamic linker resolve -// them at runtime (since ELF supports dynamic linking, not all -// relocations can be resolved at link-time) -// - create COPY relocs and reserve space in .bss -// - replace expensive relocs (in terms of runtime cost) with cheap ones -// - error out infeasible combinations such as PIC and non-relative relocs -// -// Note that the functions in this file don't actually apply relocations -// because it doesn't know about the output file nor the output file buffer. -// It instead stores Relocation objects to InputSection's Relocations -// vector to let it apply later in InputSection::writeTo. +// In addition, this file implements the core Thunk creation logic, called +// during finalizeAddressDependentContent(). // //===----------------------------------------------------------------------===// @@ -405,22 +390,17 @@ namespace { class OffsetGetter { public: OffsetGetter() = default; - explicit OffsetGetter(InputSectionBase &sec) { - if (auto *eh = dyn_cast<EhInputSection>(&sec)) { - cies = eh->cies; - fdes = eh->fdes; - i = cies.begin(); - j = fdes.begin(); - } + explicit OffsetGetter(EhInputSection &sec) { + cies = sec.cies; + fdes = sec.fdes; + i = cies.begin(); + j = fdes.begin(); } // Translates offsets in input sections to offsets in output sections. // Given offset must increase monotonically. We assume that Piece is // sorted by inputOff. uint64_t get(Ctx &ctx, uint64_t off) { - if (cies.empty()) - return off; - while (j != fdes.end() && j->inputOff <= off) ++j; auto it = j; @@ -450,13 +430,12 @@ private: class RelocationScanner { public: RelocationScanner(Ctx &ctx) : ctx(ctx) {} - template <class ELFT> - void scanSection(InputSectionBase &s, bool isEH = false); + template <class ELFT> void scanSection(InputSectionBase &s); + template <class ELFT> void scanEhSection(EhInputSection &s); private: Ctx &ctx; InputSectionBase *sec; - OffsetGetter getter; // End of relocations, used by Mips/PPC64. const void *end = nullptr; @@ -466,14 +445,14 @@ private: int64_t computeMipsAddend(const RelTy &rel, RelExpr expr, bool isLocal) const; bool isStaticLinkTimeConstant(RelExpr e, RelType type, const Symbol &sym, uint64_t relOff) const; - void processAux(RelExpr expr, RelType type, uint64_t offset, Symbol &sym, - int64_t addend) const; + void process(RelExpr expr, RelType type, uint64_t offset, Symbol &sym, + int64_t addend) const; unsigned handleTlsRelocation(RelExpr expr, RelType type, uint64_t offset, Symbol &sym, int64_t addend); template <class ELFT, class RelTy> - void scanOne(typename Relocs<RelTy>::const_iterator &i); - template <class ELFT, class RelTy> void scan(Relocs<RelTy> rels); + void scan(typename Relocs<RelTy>::const_iterator &i); + template <class ELFT, class RelTy> void scanSectionImpl(Relocs<RelTy> rels); }; } // namespace @@ -961,7 +940,7 @@ static bool canDefineSymbolInExecutable(Ctx &ctx, Symbol &sym) { } // Returns true if a given relocation can be computed at link-time. -// This only handles relocation types expected in processAux. +// This only handles relocation types expected in process(). // // For instance, we know the offset from a relocation to its target at // link-time if the relocation is PC-relative and refers a @@ -1052,8 +1031,8 @@ bool RelocationScanner::isStaticLinkTimeConstant(RelExpr e, RelType type, // sections. Given that it is ro, we will need an extra PT_LOAD. 
This // complicates things for the dynamic linker and means we would have to reserve // space for the extra PT_LOAD even if we end up not using it. -void RelocationScanner::processAux(RelExpr expr, RelType type, uint64_t offset, - Symbol &sym, int64_t addend) const { +void RelocationScanner::process(RelExpr expr, RelType type, uint64_t offset, + Symbol &sym, int64_t addend) const { // If non-ifunc non-preemptible, change PLT to direct call and optimize GOT // indirection. const bool isIfunc = sym.isGnuIFunc(); @@ -1493,7 +1472,7 @@ unsigned RelocationScanner::handleTlsRelocation(RelExpr expr, RelType type, } template <class ELFT, class RelTy> -void RelocationScanner::scanOne(typename Relocs<RelTy>::const_iterator &i) { +void RelocationScanner::scan(typename Relocs<RelTy>::const_iterator &i) { const RelTy &rel = *i; uint32_t symIndex = rel.getSymbol(ctx.arg.isMips64EL); Symbol &sym = sec->getFile<ELFT>()->getSymbol(symIndex); @@ -1511,9 +1490,7 @@ void RelocationScanner::scanOne(typename Relocs<RelTy>::const_iterator &i) { } } // Get an offset in an output section this relocation is applied to. - uint64_t offset = getter.get(ctx, rel.r_offset); - if (offset == uint64_t(-1)) - return; + uint64_t offset = rel.r_offset; RelExpr expr = ctx.target->getRelExpr(type, sym, sec->content().data() + offset); @@ -1587,7 +1564,7 @@ void RelocationScanner::scanOne(typename Relocs<RelTy>::const_iterator &i) { } // Process TLS relocations, including TLS optimizations. Note that - // R_TPREL and R_TPREL_NEG relocations are resolved in processAux. + // R_TPREL and R_TPREL_NEG relocations are resolved in process(). // // Some RISCV TLSDESC relocations reference a local NOTYPE symbol, // but we need to process them in handleTlsRelocation. @@ -1599,7 +1576,7 @@ void RelocationScanner::scanOne(typename Relocs<RelTy>::const_iterator &i) { } } - processAux(expr, type, offset, sym, addend); + process(expr, type, offset, sym, addend); } // R_PPC64_TLSGD/R_PPC64_TLSLD is required to mark `bl __tls_get_addr` for @@ -1642,30 +1619,27 @@ static void checkPPC64TLSRelax(InputSectionBase &sec, Relocs<RelTy> rels) { } template <class ELFT, class RelTy> -void RelocationScanner::scan(Relocs<RelTy> rels) { +void RelocationScanner::scanSectionImpl(Relocs<RelTy> rels) { // Not all relocations end up in Sec->Relocations, but a lot do. sec->relocations.reserve(rels.size()); if (ctx.arg.emachine == EM_PPC64) checkPPC64TLSRelax<RelTy>(*sec, rels); - // For EhInputSection, OffsetGetter expects the relocations to be sorted by - // r_offset. In rare cases (.eh_frame pieces are reordered by a linker - // script), the relocations may be unordered. // On SystemZ, all sections need to be sorted by r_offset, to allow TLS // relaxation to be handled correctly - see SystemZ::getTlsGdRelaxSkip. SmallVector<RelTy, 0> storage; - if (isa<EhInputSection>(sec) || ctx.arg.emachine == EM_S390) + if (ctx.arg.emachine == EM_S390) rels = sortRels(rels, storage); if constexpr (RelTy::IsCrel) { for (auto i = rels.begin(); i != rels.end();) - scanOne<ELFT, RelTy>(i); + scan<ELFT, RelTy>(i); } else { // The non-CREL code path has additional check for PPC64 TLS. 
end = static_cast<const void *>(rels.end()); for (auto i = rels.begin(); i != end;) - scanOne<ELFT, RelTy>(i); + scan<ELFT, RelTy>(i); } // Sort relocations by offset for more efficient searching for @@ -1680,17 +1654,36 @@ void RelocationScanner::scan(Relocs<RelTy> rels) { }); } -template <class ELFT> -void RelocationScanner::scanSection(InputSectionBase &s, bool isEH) { +template <class ELFT> void RelocationScanner::scanSection(InputSectionBase &s) { sec = &s; - getter = OffsetGetter(s); - const RelsOrRelas<ELFT> rels = s.template relsOrRelas<ELFT>(!isEH); + const RelsOrRelas<ELFT> rels = s.template relsOrRelas<ELFT>(); if (rels.areRelocsCrel()) - scan<ELFT>(rels.crels); + scanSectionImpl<ELFT>(rels.crels); else if (rels.areRelocsRel()) - scan<ELFT>(rels.rels); + scanSectionImpl<ELFT>(rels.rels); else - scan<ELFT>(rels.relas); + scanSectionImpl<ELFT>(rels.relas); +} + +template <class ELFT> void RelocationScanner::scanEhSection(EhInputSection &s) { + sec = &s; + OffsetGetter getter(s); + auto rels = s.rels; + s.relocations.reserve(rels.size()); + for (auto &r : rels) { + // Ignore R_*_NONE and other marker relocations. + if (r.expr == R_NONE) + continue; + uint64_t offset = getter.get(ctx, r.offset); + // Skip if the relocation offset is within a dead piece. + if (offset == uint64_t(-1)) + continue; + Symbol *sym = r.sym; + if (sym->isUndefined() && + maybeReportUndefined(ctx, cast<Undefined>(*sym), *sec, offset)) + continue; + process(r.expr, r.type, offset, *sym, r.addend); + } } template <class ELFT> void elf::scanRelocations(Ctx &ctx) { @@ -1725,7 +1718,7 @@ template <class ELFT> void elf::scanRelocations(Ctx &ctx) { RelocationScanner scanner(ctx); for (Partition &part : ctx.partitions) { for (EhInputSection *sec : part.ehFrame->sections) - scanner.template scanSection<ELFT>(*sec, /*isEH=*/true); + scanner.template scanEhSection<ELFT>(*sec); if (part.armExidx && part.armExidx->isLive()) for (InputSection *sec : part.armExidx->exidxSections) if (sec->isLive()) diff --git a/lld/MachO/SyntheticSections.cpp b/lld/MachO/SyntheticSections.cpp index 228b84d..5645d8a 100644 --- a/lld/MachO/SyntheticSections.cpp +++ b/lld/MachO/SyntheticSections.cpp @@ -1685,31 +1685,7 @@ void CStringSection::writeTo(uint8_t *buf) const { } } -void CStringSection::finalizeContents() { - uint64_t offset = 0; - // TODO: Call buildCStringPriorities() to support cstring ordering when - // deduplication is off, although this may negatively impact build - // performance. - for (CStringInputSection *isec : inputs) { - for (const auto &[i, piece] : llvm::enumerate(isec->pieces)) { - if (!piece.live) - continue; - // See comment above DeduplicatedCStringSection for how alignment is - // handled. - uint32_t pieceAlign = 1 - << llvm::countr_zero(isec->align | piece.inSecOff); - offset = alignToPowerOf2(offset, pieceAlign); - piece.outSecOff = offset; - isec->isFinal = true; - StringRef string = isec->getStringRef(i); - offset += string.size() + 1; // account for null terminator - } - } - size = offset; -} - -// Mergeable cstring literals are found under the __TEXT,__cstring section. In -// contrast to ELF, which puts strings that need different alignments into +// In contrast to ELF, which puts strings that need different alignments into // different sections, clang's Mach-O backend puts them all in one section. // Strings that need to be aligned have the .p2align directive emitted before // them, which simply translates into zero padding in the object file. 
In other @@ -1744,8 +1720,33 @@ void CStringSection::finalizeContents() { // requires its operand addresses to be 16-byte aligned). However, there will // typically also be other cstrings in the same file that aren't used via SIMD // and don't need this alignment. They will be emitted at some arbitrary address -// `A`, but ld64 will treat them as being 16-byte aligned with an offset of `16 -// % A`. +// `A`, but ld64 will treat them as being 16-byte aligned with an offset of +// `16 % A`. +static uint8_t getStringPieceAlignment(const CStringInputSection *isec, + const StringPiece &piece) { + return llvm::countr_zero(isec->align | piece.inSecOff); +} + +void CStringSection::finalizeContents() { + uint64_t offset = 0; + // TODO: Call buildCStringPriorities() to support cstring ordering when + // deduplication is off, although this may negatively impact build + // performance. + for (CStringInputSection *isec : inputs) { + for (const auto &[i, piece] : llvm::enumerate(isec->pieces)) { + if (!piece.live) + continue; + uint32_t pieceAlign = 1 << getStringPieceAlignment(isec, piece); + offset = alignToPowerOf2(offset, pieceAlign); + piece.outSecOff = offset; + isec->isFinal = true; + StringRef string = isec->getStringRef(i); + offset += string.size() + 1; // account for null terminator + } + } + size = offset; +} + void DeduplicatedCStringSection::finalizeContents() { // Find the largest alignment required for each string. for (const CStringInputSection *isec : inputs) { @@ -1754,7 +1755,7 @@ void DeduplicatedCStringSection::finalizeContents() { continue; auto s = isec->getCachedHashStringRef(i); assert(isec->align != 0); - uint8_t trailingZeros = llvm::countr_zero(isec->align | piece.inSecOff); + uint8_t trailingZeros = getStringPieceAlignment(isec, piece); auto it = stringOffsetMap.insert( std::make_pair(s, StringOffset(trailingZeros))); if (!it.second && it.first->second.trailingZeros < trailingZeros) diff --git a/lld/docs/ReleaseNotes.rst b/lld/docs/ReleaseNotes.rst index 6ea1ea0..566dde6 100644 --- a/lld/docs/ReleaseNotes.rst +++ b/lld/docs/ReleaseNotes.rst @@ -44,6 +44,9 @@ MinGW Improvements MachO Improvements ------------------ +* ``--separate-cstring-literal-sections`` emits cstring literal sections into sections defined by their section name. + (`#158720 <https://github.com/llvm/llvm-project/pull/158720>`_) + WebAssembly Improvements ------------------------ diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp index 1948f51..a5aaf1f 100644 --- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp +++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp @@ -5033,6 +5033,7 @@ lldb::Encoding TypeSystemClang::GetEncoding(lldb::opaque_compiler_type_t type, case clang::BuiltinType::VectorPair: case clang::BuiltinType::VectorQuad: case clang::BuiltinType::DMR1024: + case clang::BuiltinType::DMR2048: break; // ARM -- Scalable Vector Extension diff --git a/llvm/docs/CIBestPractices.rst b/llvm/docs/CIBestPractices.rst index 8301b95..da92ed3 100644 --- a/llvm/docs/CIBestPractices.rst +++ b/llvm/docs/CIBestPractices.rst @@ -9,11 +9,11 @@ This document contains a list of guidelines and best practices to use when working on LLVM's CI systems. These are intended to keep our actions reliable, consistent, and secure. 
-Github Actions Best Practices +GitHub Actions Best Practices ============================= This section contains information on best practices/guidelines when working on -LLVM's github actions workflows. +LLVM's GitHub actions workflows. Disabling Jobs In Forks ----------------------- @@ -35,7 +35,7 @@ jobs specified within a workflow: if: github.repository_owner == 'llvm' We choose to use ``github.repository_owner`` rather than ``github.repository`` -to enable these workflows to run in forks inside the LLVM organization such as +to enable these workflows to run in forks inside the LLVM organization, such as the ClangIR fork. There are some exceptions to this rule where ``github.repository`` might be @@ -46,7 +46,7 @@ release tasks, which should not run anywhere else. Hash Pinning Dependencies ------------------------- -Github Actions allows the use of actions from other repositories as steps in +GitHub Actions allows the use of actions from other repositories as steps in jobs. We take advantage of various actions for a variety of different tasks, but especially tasks like checking out the repository, and downloading/uploading build caches. These actions are typically versioned with @@ -59,9 +59,9 @@ just a release, which looks like the following: uses: actions/checkout@v4 However, it is best practice to specify an exact commit SHA from which to pull -the action from, noting the version in a comment: +the action, noting the version in a comment: -We plan on revisiting this recommendation once Github's immutable actions have +We plan on revisiting this recommendation once GitHub's immutable actions have been rolled out as GA. .. code-block:: yaml @@ -72,11 +72,11 @@ been rolled out as GA. This is beneficial for two reasons: reliability and security. Specifying an exact SHA rather than just a major version ensures we end up running the same -action originally specified when the workflow as authored and/or updated, +action originally specified when the workflow was authored and/or updated, and that no breaking changes sneak in from new versions of a workflow being released. However, this effect could also be achieved by specifying an exact dot release. The biggest reason to prefer hash pinned dependencies is security. -Release assets on Github are mutable, allowing an attacker to change the code +Release assets on GitHub are mutable, allowing an attacker to change the code within a specific version of an action after the fact, potentially stealing sensitive tokens and credentials. Hash pinning the dependencies prevents this as the hash would change with the code. @@ -84,10 +84,10 @@ as the hash would change with the code. Using Versioned Runner Images ----------------------------- -Github actions allows the use of either specifically versioned runner images +GitHub actions allows the use of either specifically versioned runner images (e.g., ``ubuntu-22.04``), or just the latest runner image (e.g., ``ubuntu-latest``). It is best practice to use explicitly versioned -runner images. This prevents breakages when Github rolls the latest runner +runner images. This prevents breakages when GitHub rolls the latest runner image to a new version with potentially breaking changes, instead allowing us to explicitly opt-in to using the new image when we have done sufficient testing to ensure that our existing workflows work as expected in the new @@ -112,7 +112,7 @@ the principle of least privilege. 
Ensuring Workflows Run on the Correct Events -------------------------------------------- -Github allows workflows to run on a multitude of events and it is important to +GitHub allows workflows to run on a multitude of events, and it is important to configure a workflow such that it triggers on the correct events. There are two main best practices around events that trigger workflows: diff --git a/llvm/docs/RISCVUsage.rst b/llvm/docs/RISCVUsage.rst index 7b1a6ce..f9e2e4a 100644 --- a/llvm/docs/RISCVUsage.rst +++ b/llvm/docs/RISCVUsage.rst @@ -120,6 +120,8 @@ on support follow. ``H`` Assembly Support ``M`` Supported ``Q`` Assembly Support + ``Sdext`` Assembly Support (`See note <#riscv-debug-specification-note>`__) + ``Sdtrig`` Assembly Support (`See note <#riscv-debug-specification-note>`__) ``Sha`` Supported ``Shcounterenw`` Assembly Support (`See note <#riscv-profiles-extensions-note>`__) ``Shgatpa`` Assembly Support (`See note <#riscv-profiles-extensions-note>`__) @@ -132,6 +134,7 @@ on support follow. ``Smcdeleg`` Supported ``Smcntrpmf`` Supported ``Smcsrind`` Supported + ``Smctr`` Assembly Support ``Smdbltrp`` Supported ``Smepmp`` Supported ``Smmpm`` Supported @@ -144,6 +147,7 @@ on support follow. ``Sscofpmf`` Assembly Support ``Sscounterenw`` Assembly Support (`See note <#riscv-profiles-extensions-note>`__) ``Sscsrind`` Supported + ``Ssctr`` Assembly Support ``Ssdbltrp`` Supported ``Ssnpm`` Supported ``Sspm`` Supported @@ -306,6 +310,10 @@ Supported ``Za128rs``, ``Za64rs``, ``Zama16b``, ``Zic64b``, ``Ziccamoa``, ``Ziccamoc``, ``Ziccif``, ``Zicclsm``, ``Ziccrse``, ``Shcounterenvw``, ``Shgatpa``, ``Shtvala``, ``Shvsatpa``, ``Shvstvala``, ``Shvstvecd``, ``Ssccptr``, ``Sscounterenw``, ``Ssstateen``, ``Ssstrict``, ``Sstvala``, ``Sstvecd``, ``Ssu64xl``, ``Svade``, ``Svbare`` These extensions are defined as part of the `RISC-V Profiles specification <https://github.com/riscv/riscv-profiles/releases/tag/v1.0>`__. They do not introduce any new features themselves, but instead describe existing hardware features. +.. _riscv-debug-specification-note: + +``Sdext``, ``Sdtrig`` `The RISC-V Debug Specification <https://github.com/riscv/riscv-debug-spec/releases/download/1.0/riscv-debug-specification.pdf>`__. + .. _riscv-zacas-note: ``Zacas`` @@ -337,12 +345,6 @@ The primary goal of experimental support is to assist in the process of ratifica ``experimental-zvbc32e``, ``experimental-zvkgs`` LLVM implements the `0.7 release specification <https://github.com/user-attachments/files/16450464/riscv-crypto-spec-vector-extra_v0.0.7.pdf>`__. -``experimental-sdext``, ``experimental-sdtrig`` - LLVM implements the `1.0-rc4 specification <https://github.com/riscv/riscv-debug-spec/releases/download/1.0.0-rc4/riscv-debug-specification.pdf>`__. - -``experimental-smctr``, ``experimental-ssctr`` - LLVM implements the `1.0-rc3 specification <https://github.com/riscv/riscv-control-transfer-records/releases/tag/v1.0_rc3>`__. - ``experimental-svukte`` LLVM implements the `0.3 draft specification <https://github.com/riscv/riscv-isa-manual/pull/1564>`__. diff --git a/llvm/docs/SPIRVUsage.rst b/llvm/docs/SPIRVUsage.rst index fdefc53..b6cd4b4 100644 --- a/llvm/docs/SPIRVUsage.rst +++ b/llvm/docs/SPIRVUsage.rst @@ -232,7 +232,7 @@ Below is a list of supported SPIR-V extensions, sorted alphabetically by their e * - ``SPV_INTEL_int4`` - Adds support for 4-bit integer type, and allow this type to be used in cooperative matrices. 
* - ``SPV_KHR_float_controls2`` - - Adds ability to specify the floating-point environment in shaders. It can be used on whole modules and individual instructions. + - Adds execution modes and decorations to control floating-point computations in both kernels and shaders. It can be used on whole modules and individual instructions. SPIR-V representation in LLVM IR ================================ @@ -589,3 +589,31 @@ Group and Subgroup Operations For workgroup and subgroup operations, LLVM uses function calls to represent SPIR-V's group-based instructions. These builtins facilitate group synchronization, data sharing, and collective operations essential for efficient parallel computation. + +SPIR-V Instructions Mapped to LLVM Metadata +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Some SPIR-V instructions don't have a direct equivalent in the LLVM IR language. To +address this, the SPIR-V Target uses specific LLVM named metadata to convey +the necessary information. The SPIR-V specification allows multiple module-scope +instructions, whereas LLVM named metadata must be unique. Therefore, the encoding of +such instructions has the following format: + +.. code-block:: llvm + + !spirv.<OpCodeName> = !{!<InstructionMetadata1>, !<InstructionMetadata2>, ..} + !<InstructionMetadata1> = !{<Operand1>, <Operand2>, ..} + !<InstructionMetadata2> = !{<Operand1>, <Operand2>, ..} + +Below, you will find the mappings between SPIR-V instructions and their corresponding +LLVM IR representations. + ++--------------------+---------------------------------------------------------+ +| SPIR-V instruction | LLVM IR | ++====================+=========================================================+ +| OpExecutionMode | .. code-block:: llvm | +| | | +| | !spirv.ExecutionMode = !{!0} | +| | !0 = !{void @worker, i32 30, i32 262149} | +| | ; Set execution mode with id 30 (VecTypeHint) and | +| | ; literal `262149` operand.
| ++--------------------+---------------------------------------------------------+ diff --git a/llvm/include/llvm/ADT/AllocatorList.h b/llvm/include/llvm/ADT/AllocatorList.h index 04d0afc..2716b83 100644 --- a/llvm/include/llvm/ADT/AllocatorList.h +++ b/llvm/include/llvm/ADT/AllocatorList.h @@ -155,8 +155,8 @@ public: std::swap(getAlloc(), RHS.getAlloc()); } - bool empty() { return List.empty(); } - size_t size() { return List.size(); } + [[nodiscard]] bool empty() const { return List.empty(); } + [[nodiscard]] size_t size() const { return List.size(); } iterator begin() { return iterator(List.begin()); } iterator end() { return iterator(List.end()); } diff --git a/llvm/include/llvm/ADT/ArrayRef.h b/llvm/include/llvm/ADT/ArrayRef.h index fb91690..448d100 100644 --- a/llvm/include/llvm/ADT/ArrayRef.h +++ b/llvm/include/llvm/ADT/ArrayRef.h @@ -547,7 +547,8 @@ namespace llvm { } template <typename T> - inline bool operator==(SmallVectorImpl<T> &LHS, ArrayRef<T> RHS) { + [[nodiscard]] inline bool operator==(const SmallVectorImpl<T> &LHS, + ArrayRef<T> RHS) { return ArrayRef<T>(LHS).equals(RHS); } @@ -557,7 +558,8 @@ namespace llvm { } template <typename T> - inline bool operator!=(SmallVectorImpl<T> &LHS, ArrayRef<T> RHS) { + [[nodiscard]] inline bool operator!=(const SmallVectorImpl<T> &LHS, + ArrayRef<T> RHS) { return !(LHS == RHS); } diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h index 141816c3..7fb0dbe 100644 --- a/llvm/include/llvm/ADT/GenericUniformityImpl.h +++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h @@ -408,15 +408,6 @@ public: const CycleT *); protected: - /// \brief Value/block pair representing a single phi input. - struct PhiInput { - ConstValueRefT value; - BlockT *predBlock; - - PhiInput(ConstValueRefT value, BlockT *predBlock) - : value(value), predBlock(predBlock) {} - }; - const ContextT &Context; const FunctionT &F; const CycleInfoT &CI; diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h index 52ab385..84b4ad7 100644 --- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h +++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h @@ -724,8 +724,9 @@ public: /// Return true if the block BB needs to be predicated in order for the loop /// to be vectorized. - LLVM_ABI static bool blockNeedsPredication(BasicBlock *BB, Loop *TheLoop, - DominatorTree *DT); + LLVM_ABI static bool blockNeedsPredication(const BasicBlock *BB, + const Loop *TheLoop, + const DominatorTree *DT); /// Returns true if value \p V is loop invariant. LLVM_ABI bool isInvariant(Value *V) const; diff --git a/llvm/include/llvm/CodeGen/ValueTypes.td b/llvm/include/llvm/CodeGen/ValueTypes.td index 9ea127d..300addd 100644 --- a/llvm/include/llvm/CodeGen/ValueTypes.td +++ b/llvm/include/llvm/CodeGen/ValueTypes.td @@ -367,11 +367,11 @@ def aarch64mfp8 : ValueType<8, 253>; // 8-bit value in FPR (AArch64) def c64 : VTCheriCapability<64, 254>; // 64-bit CHERI capability value def c128 : VTCheriCapability<128, 255>; // 128-bit CHERI capability value +let isNormalValueType = false in { // Pseudo valuetype mapped to the current CHERI capability pointer size. // Should only be used in TableGen. 
def cPTR : VTAny<503>; -let isNormalValueType = false in { def token : ValueType<0, 504>; // TokenTy def MetadataVT : ValueType<0, 505> { // Metadata let LLVMName = "Metadata"; diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td index 4af9ffc..81fbfbf 100644 --- a/llvm/include/llvm/IR/IntrinsicsX86.td +++ b/llvm/include/llvm/IR/IntrinsicsX86.td @@ -1919,62 +1919,62 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_avx2_vpdpbssd_128 : ClangBuiltin<"__builtin_ia32_vpdpbssd128">, DefaultAttrsIntrinsic<[llvm_v4i32_ty], - [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], + [llvm_v4i32_ty, llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; def int_x86_avx2_vpdpbssd_256 : ClangBuiltin<"__builtin_ia32_vpdpbssd256">, DefaultAttrsIntrinsic<[llvm_v8i32_ty], - [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty], + [llvm_v8i32_ty, llvm_v32i8_ty, llvm_v32i8_ty], [IntrNoMem]>; def int_x86_avx2_vpdpbssds_128 : ClangBuiltin<"__builtin_ia32_vpdpbssds128">, DefaultAttrsIntrinsic<[llvm_v4i32_ty], - [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], + [llvm_v4i32_ty, llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; def int_x86_avx2_vpdpbssds_256 : ClangBuiltin<"__builtin_ia32_vpdpbssds256">, DefaultAttrsIntrinsic<[llvm_v8i32_ty], - [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty], + [llvm_v8i32_ty, llvm_v32i8_ty, llvm_v32i8_ty], [IntrNoMem]>; def int_x86_avx2_vpdpbsud_128 : ClangBuiltin<"__builtin_ia32_vpdpbsud128">, DefaultAttrsIntrinsic<[llvm_v4i32_ty], - [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], + [llvm_v4i32_ty, llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; def int_x86_avx2_vpdpbsud_256 : ClangBuiltin<"__builtin_ia32_vpdpbsud256">, DefaultAttrsIntrinsic<[llvm_v8i32_ty], - [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty], + [llvm_v8i32_ty, llvm_v32i8_ty, llvm_v32i8_ty], [IntrNoMem]>; def int_x86_avx2_vpdpbsuds_128 : ClangBuiltin<"__builtin_ia32_vpdpbsuds128">, DefaultAttrsIntrinsic<[llvm_v4i32_ty], - [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], + [llvm_v4i32_ty, llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; def int_x86_avx2_vpdpbsuds_256 : ClangBuiltin<"__builtin_ia32_vpdpbsuds256">, DefaultAttrsIntrinsic<[llvm_v8i32_ty], - [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty], + [llvm_v8i32_ty, llvm_v32i8_ty, llvm_v32i8_ty], [IntrNoMem]>; def int_x86_avx2_vpdpbuud_128 : ClangBuiltin<"__builtin_ia32_vpdpbuud128">, DefaultAttrsIntrinsic<[llvm_v4i32_ty], - [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], + [llvm_v4i32_ty, llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; def int_x86_avx2_vpdpbuud_256 : ClangBuiltin<"__builtin_ia32_vpdpbuud256">, DefaultAttrsIntrinsic<[llvm_v8i32_ty], - [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty], + [llvm_v8i32_ty, llvm_v32i8_ty, llvm_v32i8_ty], [IntrNoMem]>; def int_x86_avx2_vpdpbuuds_128 : ClangBuiltin<"__builtin_ia32_vpdpbuuds128">, DefaultAttrsIntrinsic<[llvm_v4i32_ty], - [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], + [llvm_v4i32_ty, llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; def int_x86_avx2_vpdpbuuds_256 : ClangBuiltin<"__builtin_ia32_vpdpbuuds256">, DefaultAttrsIntrinsic<[llvm_v8i32_ty], - [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty], + [llvm_v8i32_ty, llvm_v32i8_ty, llvm_v32i8_ty], [IntrNoMem]>; def int_x86_avx2_vpdpwsud_128 @@ -5000,32 +5000,32 @@ let TargetPrefix = "x86" in { def int_x86_avx10_vpdpbssd_512 : ClangBuiltin<"__builtin_ia32_vpdpbssd512">, DefaultAttrsIntrinsic<[llvm_v16i32_ty], - [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty], + [llvm_v16i32_ty, llvm_v64i8_ty, llvm_v64i8_ty], [IntrNoMem]>; def 
int_x86_avx10_vpdpbssds_512 : ClangBuiltin<"__builtin_ia32_vpdpbssds512">, DefaultAttrsIntrinsic<[llvm_v16i32_ty], - [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty], + [llvm_v16i32_ty, llvm_v64i8_ty, llvm_v64i8_ty], [IntrNoMem]>; def int_x86_avx10_vpdpbsud_512 : ClangBuiltin<"__builtin_ia32_vpdpbsud512">, DefaultAttrsIntrinsic<[llvm_v16i32_ty], - [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty], + [llvm_v16i32_ty, llvm_v64i8_ty, llvm_v64i8_ty], [IntrNoMem]>; def int_x86_avx10_vpdpbsuds_512 : ClangBuiltin<"__builtin_ia32_vpdpbsuds512">, DefaultAttrsIntrinsic<[llvm_v16i32_ty], - [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty], + [llvm_v16i32_ty, llvm_v64i8_ty, llvm_v64i8_ty], [IntrNoMem]>; def int_x86_avx10_vpdpbuud_512 : ClangBuiltin<"__builtin_ia32_vpdpbuud512">, DefaultAttrsIntrinsic<[llvm_v16i32_ty], - [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty], + [llvm_v16i32_ty, llvm_v64i8_ty, llvm_v64i8_ty], [IntrNoMem]>; def int_x86_avx10_vpdpbuuds_512 : ClangBuiltin<"__builtin_ia32_vpdpbuuds512">, DefaultAttrsIntrinsic<[llvm_v16i32_ty], - [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty], + [llvm_v16i32_ty, llvm_v64i8_ty, llvm_v64i8_ty], [IntrNoMem]>; // VNNI INT16 def int_x86_avx10_vpdpwsud_512 : diff --git a/llvm/include/llvm/Support/Mustache.h b/llvm/include/llvm/Support/Mustache.h index 781ec55..ee9f406 100644 --- a/llvm/include/llvm/Support/Mustache.h +++ b/llvm/include/llvm/Support/Mustache.h @@ -85,6 +85,14 @@ using SectionLambda = std::function<llvm::json::Value(std::string)>; class ASTNode; using AstPtr = std::unique_ptr<ASTNode>; +using EscapeMap = DenseMap<char, std::string>; + +struct MustacheContext { + StringMap<AstPtr> Partials; + StringMap<Lambda> Lambdas; + StringMap<SectionLambda> SectionLambdas; + EscapeMap Escapes; +}; // A Template represents the container for the AST and the partials // and Lambdas that are registered with it. @@ -118,10 +126,7 @@ public: LLVM_ABI void overrideEscapeCharacters(DenseMap<char, std::string> Escapes); private: - StringMap<AstPtr> Partials; - StringMap<Lambda> Lambdas; - StringMap<SectionLambda> SectionLambdas; - DenseMap<char, std::string> Escapes; + MustacheContext Ctx; AstPtr Tree; }; } // namespace llvm::mustache diff --git a/llvm/include/llvm/Support/TrailingObjects.h b/llvm/include/llvm/Support/TrailingObjects.h index 3eb7c0b..dc03285 100644 --- a/llvm/include/llvm/Support/TrailingObjects.h +++ b/llvm/include/llvm/Support/TrailingObjects.h @@ -284,11 +284,8 @@ public: /// (which must be one of those specified in the class template). The /// array may have zero or more elements in it. template <typename T> T *getTrailingObjects() { - verifyTrailingObjectsAssertions<true>(); - // Forwards to an impl function with overloads, since member - // function templates can't be specialized. - return this->getTrailingObjectsImpl( - static_cast<BaseTy *>(this), TrailingObjectsBase::OverloadToken<T>()); + return const_cast<T *>( + static_cast<const TrailingObjects *>(this)->getTrailingObjects<T>()); } // getTrailingObjects() specialization for a single trailing type. 
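Aside on the TrailingObjects.h hunks above and below: the non-const getTrailingObjects() overloads are rewritten to forward to their const counterparts and const_cast the result back, so the verification logic lives in a single implementation. A minimal sketch of that delegation pattern on a hypothetical standalone class (names are illustrative and not taken from the patch):

    // Sketch only: mirrors the const-delegation idiom used in the hunks, not the real TrailingObjects machinery.
    struct Buffer {
      int Storage[4] = {0, 1, 2, 3};

      // Single implementation; any checks or offset math would live here.
      const int *data() const { return Storage; }

      // Non-const overload delegates to the const one and casts the result back.
      int *data() {
        return const_cast<int *>(static_cast<const Buffer *>(this)->data());
      }
    };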
@@ -306,13 +303,8 @@ public: } FirstTrailingType *getTrailingObjects() { - static_assert(sizeof...(TrailingTys) == 1, - "Can use non-templated getTrailingObjects() only when there " - "is a single trailing type"); - verifyTrailingObjectsAssertions<false>(); - return this->getTrailingObjectsImpl( - static_cast<BaseTy *>(this), - TrailingObjectsBase::OverloadToken<FirstTrailingType>()); + return const_cast<FirstTrailingType *>( + static_cast<const TrailingObjects *>(this)->getTrailingObjects()); } // Functions that return the trailing objects as ArrayRefs. @@ -342,9 +334,8 @@ public: } template <typename T> T *getTrailingObjectsNonStrict() { - verifyTrailingObjectsAssertions<false>(); - return this->getTrailingObjectsImpl( - static_cast<BaseTy *>(this), TrailingObjectsBase::OverloadToken<T>()); + return const_cast<T *>(static_cast<const TrailingObjects *>(this) + ->getTrailingObjectsNonStrict<T>()); } template <typename T> diff --git a/llvm/include/llvm/Support/TypeSize.h b/llvm/include/llvm/Support/TypeSize.h index 29d1c68..0a7ae15 100644 --- a/llvm/include/llvm/Support/TypeSize.h +++ b/llvm/include/llvm/Support/TypeSize.h @@ -179,7 +179,7 @@ public: /// This function tells the caller whether the element count is known at /// compile time to be a multiple of the scalar value RHS. constexpr bool isKnownMultipleOf(ScalarTy RHS) const { - return getKnownMinValue() % RHS == 0; + return RHS != 0 && getKnownMinValue() % RHS == 0; } /// Returns whether or not the callee is known to be a multiple of RHS. @@ -191,7 +191,8 @@ public: // x % y == 0 !=> x % (vscale * y) == 0 if (!isScalable() && RHS.isScalable()) return false; - return getKnownMinValue() % RHS.getKnownMinValue() == 0; + return RHS.getKnownMinValue() != 0 && + getKnownMinValue() % RHS.getKnownMinValue() == 0; } // Return the minimum value with the assumption that the count is exact. diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index 87fae92..05f7ac6 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -2856,8 +2856,9 @@ void LoopAccessInfo::emitUnsafeDependenceRemark() { } } -bool LoopAccessInfo::blockNeedsPredication(BasicBlock *BB, Loop *TheLoop, - DominatorTree *DT) { +bool LoopAccessInfo::blockNeedsPredication(const BasicBlock *BB, + const Loop *TheLoop, + const DominatorTree *DT) { assert(TheLoop->contains(BB) && "Unknown block used"); // Blocks that do not dominate the latch need predication. diff --git a/llvm/lib/CodeGen/PeepholeOptimizer.cpp b/llvm/lib/CodeGen/PeepholeOptimizer.cpp index fb3e648..729a57e 100644 --- a/llvm/lib/CodeGen/PeepholeOptimizer.cpp +++ b/llvm/lib/CodeGen/PeepholeOptimizer.cpp @@ -1203,6 +1203,18 @@ bool PeepholeOptimizer::optimizeCoalescableCopyImpl(Rewriter &&CpyRewriter) { if (!NewSrc.Reg) continue; + if (NewSrc.SubReg) { + // Verify the register class supports the subregister index. ARM's + // copy-like queries return register:subreg pairs where the register's + // current class does not directly support the subregister index. + const TargetRegisterClass *RC = MRI->getRegClass(NewSrc.Reg); + const TargetRegisterClass *WithSubRC = + TRI->getSubClassWithSubReg(RC, NewSrc.SubReg); + if (!MRI->constrainRegClass(NewSrc.Reg, WithSubRC)) + continue; + Changed = true; + } + // Rewrite source. if (CpyRewriter.RewriteCurrentSource(NewSrc.Reg, NewSrc.SubReg)) { // We may have extended the live-range of NewSrc, account for that. 
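The TypeSize.h hunk above also hardens isKnownMultipleOf against a zero divisor. A standalone sketch of the same guard on plain integers (hypothetical helper, not part of the patch):

    #include <cstdint>

    // Returns false for RHS == 0 instead of evaluating a modulo by zero.
    static bool isKnownMultipleOf(uint64_t KnownMinValue, uint64_t RHS) {
      return RHS != 0 && KnownMinValue % RHS == 0;
    }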
@@ -1275,6 +1287,18 @@ MachineInstr &PeepholeOptimizer::rewriteSource(MachineInstr &CopyLike, const TargetRegisterClass *DefRC = MRI->getRegClass(Def.Reg); Register NewVReg = MRI->createVirtualRegister(DefRC); + if (NewSrc.SubReg) { + const TargetRegisterClass *NewSrcRC = MRI->getRegClass(NewSrc.Reg); + const TargetRegisterClass *WithSubRC = + TRI->getSubClassWithSubReg(NewSrcRC, NewSrc.SubReg); + + // The new source may not directly support the subregister, but we should be + // able to assume it is constrainable to support the subregister (otherwise + // ValueTracker was lying and reported a useless value). + if (!MRI->constrainRegClass(NewSrc.Reg, WithSubRC)) + llvm_unreachable("replacement register cannot support subregister"); + } + MachineInstr *NewCopy = BuildMI(*CopyLike.getParent(), &CopyLike, CopyLike.getDebugLoc(), TII->get(TargetOpcode::COPY), NewVReg) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 77df4b4..204e1f0 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -11849,9 +11849,7 @@ static bool isLegalToCombineMinNumMaxNum(SelectionDAG &DAG, SDValue LHS, if (!VT.isFloatingPoint()) return false; - const TargetOptions &Options = DAG.getTarget().Options; - - return (Flags.hasNoSignedZeros() || Options.NoSignedZerosFPMath) && + return Flags.hasNoSignedZeros() && TLI.isProfitableToCombineMinNumMaxNum(VT) && (Flags.hasNoNaNs() || (DAG.isKnownNeverNaN(RHS) && DAG.isKnownNeverNaN(LHS))); @@ -17351,7 +17349,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { // Always prefer FMAD to FMA for precision. unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA; bool Aggressive = TLI.enableAggressiveFMAFusion(VT); - bool NoSignedZero = Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros(); + bool NoSignedZero = Flags.hasNoSignedZeros(); // Is the node an FMUL and contractable either due to global flags or // SDNodeFlags. @@ -18327,11 +18325,9 @@ template <class MatchContextClass> SDValue DAGCombiner::visitFMA(SDNode *N) { return matcher.getNode(ISD::FMA, DL, VT, NegN0, NegN1, N2); } - // FIXME: use fast math flags instead of Options.UnsafeFPMath - // TODO: Finally migrate away from global TargetOptions. 
if ((Options.NoNaNsFPMath && Options.NoInfsFPMath) || (N->getFlags().hasNoNaNs() && N->getFlags().hasNoInfs())) { - if (Options.NoSignedZerosFPMath || N->getFlags().hasNoSignedZeros() || + if (N->getFlags().hasNoSignedZeros() || (N2CFP && !N2CFP->isExactlyValue(-0.0))) { if (N0CFP && N0CFP->isZero()) return N2; @@ -18636,8 +18632,7 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) { } // Fold X/Sqrt(X) -> Sqrt(X) - if ((Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros()) && - Flags.hasAllowReassociation()) + if (Flags.hasNoSignedZeros() && Flags.hasAllowReassociation()) if (N1.getOpcode() == ISD::FSQRT && N0 == N1.getOperand(0)) return N1; diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF_loongarch.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF_loongarch.cpp index f23fb34..5f956b1 100644 --- a/llvm/lib/ExecutionEngine/JITLink/ELF_loongarch.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/ELF_loongarch.cpp @@ -365,6 +365,10 @@ private: uint32_t Type = Rel.getType(false); int64_t Addend = Rel.r_addend; + // ignore + if (Type == ELF::R_LARCH_MARK_LA) + return Error::success(); + if (Type == ELF::R_LARCH_RELAX) { if (BlockToFix.edges_empty()) return make_error<StringError>( diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp index d626803..dd1b1d3 100644 --- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp +++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp @@ -781,6 +781,9 @@ void RuntimeDyldELF::resolveLoongArch64Relocation(const SectionEntry &Section, default: report_fatal_error("Relocation type not implemented yet!"); break; + case ELF::R_LARCH_MARK_LA: + // ignore + break; case ELF::R_LARCH_32: support::ulittle32_t::ref{TargetPtr} = static_cast<uint32_t>(Value + Addend); diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index 5385b1f..f28b989 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -594,6 +594,42 @@ static bool upgradeX86IntrinsicFunction(Function *F, StringRef Name, return false; // No other 'x86.avx512.*'. 
} + if (Name.consume_front("avx2.vpdpb")) { + // Added in 21.1 + ID = StringSwitch<Intrinsic::ID>(Name) + .Case("ssd.128", Intrinsic::x86_avx2_vpdpbssd_128) + .Case("ssd.256", Intrinsic::x86_avx2_vpdpbssd_256) + .Case("ssds.128", Intrinsic::x86_avx2_vpdpbssds_128) + .Case("ssds.256", Intrinsic::x86_avx2_vpdpbssds_256) + .Case("sud.128", Intrinsic::x86_avx2_vpdpbsud_128) + .Case("sud.256", Intrinsic::x86_avx2_vpdpbsud_256) + .Case("suds.128", Intrinsic::x86_avx2_vpdpbsuds_128) + .Case("suds.256", Intrinsic::x86_avx2_vpdpbsuds_256) + .Case("uud.128", Intrinsic::x86_avx2_vpdpbuud_128) + .Case("uud.256", Intrinsic::x86_avx2_vpdpbuud_256) + .Case("uuds.128", Intrinsic::x86_avx2_vpdpbuuds_128) + .Case("uuds.256", Intrinsic::x86_avx2_vpdpbuuds_256) + .Default(Intrinsic::not_intrinsic); + if (ID != Intrinsic::not_intrinsic) + return upgradeX86MultiplyAddBytes(F, ID, NewFn); + return false; // No other 'x86.avx2.*' + } + + if (Name.consume_front("avx10.vpdpb")) { + // Added in 21.1 + ID = StringSwitch<Intrinsic::ID>(Name) + .Case("ssd.512", Intrinsic::x86_avx10_vpdpbssd_512) + .Case("ssds.512", Intrinsic::x86_avx10_vpdpbssds_512) + .Case("sud.512", Intrinsic::x86_avx10_vpdpbsud_512) + .Case("suds.512", Intrinsic::x86_avx10_vpdpbsuds_512) + .Case("uud.512", Intrinsic::x86_avx10_vpdpbuud_512) + .Case("uuds.512", Intrinsic::x86_avx10_vpdpbuuds_512) + .Default(Intrinsic::not_intrinsic); + if (ID != Intrinsic::not_intrinsic) + return upgradeX86MultiplyAddBytes(F, ID, NewFn); + return false; // No other 'x86.avx10.*' + } + if (Name.consume_front("avx512bf16.")) { // Added in 9.0 ID = StringSwitch<Intrinsic::ID>(Name) @@ -5224,7 +5260,25 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { case Intrinsic::x86_avx512_vpdpbusd_512: case Intrinsic::x86_avx512_vpdpbusds_128: case Intrinsic::x86_avx512_vpdpbusds_256: - case Intrinsic::x86_avx512_vpdpbusds_512: { + case Intrinsic::x86_avx512_vpdpbusds_512: + case Intrinsic::x86_avx2_vpdpbssd_128: + case Intrinsic::x86_avx2_vpdpbssd_256: + case Intrinsic::x86_avx10_vpdpbssd_512: + case Intrinsic::x86_avx2_vpdpbssds_128: + case Intrinsic::x86_avx2_vpdpbssds_256: + case Intrinsic::x86_avx10_vpdpbssds_512: + case Intrinsic::x86_avx2_vpdpbsud_128: + case Intrinsic::x86_avx2_vpdpbsud_256: + case Intrinsic::x86_avx10_vpdpbsud_512: + case Intrinsic::x86_avx2_vpdpbsuds_128: + case Intrinsic::x86_avx2_vpdpbsuds_256: + case Intrinsic::x86_avx10_vpdpbsuds_512: + case Intrinsic::x86_avx2_vpdpbuud_128: + case Intrinsic::x86_avx2_vpdpbuud_256: + case Intrinsic::x86_avx10_vpdpbuud_512: + case Intrinsic::x86_avx2_vpdpbuuds_128: + case Intrinsic::x86_avx2_vpdpbuuds_256: + case Intrinsic::x86_avx10_vpdpbuuds_512: { unsigned NumElts = CI->getType()->getPrimitiveSizeInBits() / 8; Value *Args[] = {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2)}; diff --git a/llvm/lib/IR/Value.cpp b/llvm/lib/IR/Value.cpp index 4e8f359..e5e062d 100644 --- a/llvm/lib/IR/Value.cpp +++ b/llvm/lib/IR/Value.cpp @@ -1000,14 +1000,12 @@ Align Value::getPointerAlignment(const DataLayout &DL) const { ConstantInt *CI = mdconst::extract<ConstantInt>(MD->getOperand(0)); return Align(CI->getLimitedValue()); } - } else if (auto *CstPtr = dyn_cast<Constant>(this)) { - // Strip pointer casts to avoid creating unnecessary ptrtoint expression - // if the only "reduction" is combining a bitcast + ptrtoint. 
- CstPtr = CstPtr->stripPointerCasts(); - if (auto *CstInt = dyn_cast_or_null<ConstantInt>(ConstantExpr::getPtrToInt( - const_cast<Constant *>(CstPtr), DL.getIntPtrType(getType()), - /*OnlyIfReduced=*/true))) { - size_t TrailingZeros = CstInt->getValue().countr_zero(); + } else if (auto *CE = dyn_cast<ConstantExpr>(this)) { + // Determine the alignment of inttoptr(C). + if (CE->getOpcode() == Instruction::IntToPtr && + isa<ConstantInt>(CE->getOperand(0))) { + ConstantInt *IntPtr = cast<ConstantInt>(CE->getOperand(0)); + size_t TrailingZeros = IntPtr->getValue().countr_zero(); // While the actual alignment may be large, elsewhere we have // an arbitrary upper alignmet limit, so let's clamp to it. return Align(TrailingZeros < Value::MaxAlignmentExponent diff --git a/llvm/lib/Support/Mustache.cpp b/llvm/lib/Support/Mustache.cpp index 8da6fdb..646d7a0 100644 --- a/llvm/lib/Support/Mustache.cpp +++ b/llvm/lib/Support/Mustache.cpp @@ -7,9 +7,14 @@ //===----------------------------------------------------------------------===// #include "llvm/Support/Mustache.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include <cctype> +#include <optional> #include <sstream> +#define DEBUG_TYPE "mustache" + using namespace llvm; using namespace llvm::mustache; @@ -62,6 +67,7 @@ public: InvertSectionOpen, UnescapeVariable, Comment, + SetDelimiter, }; Token(std::string Str) @@ -102,6 +108,8 @@ public: return Type::Partial; case '&': return Type::UnescapeVariable; + case '=': + return Type::SetDelimiter; default: return Type::Variable; } @@ -130,26 +138,17 @@ public: InvertSection, }; - ASTNode(llvm::StringMap<AstPtr> &Partials, llvm::StringMap<Lambda> &Lambdas, - llvm::StringMap<SectionLambda> &SectionLambdas, EscapeMap &Escapes) - : Partials(Partials), Lambdas(Lambdas), SectionLambdas(SectionLambdas), - Escapes(Escapes), Ty(Type::Root), Parent(nullptr), - ParentContext(nullptr) {} + ASTNode(MustacheContext &Ctx) + : Ctx(Ctx), Ty(Type::Root), Parent(nullptr), ParentContext(nullptr) {} - ASTNode(std::string Body, ASTNode *Parent, llvm::StringMap<AstPtr> &Partials, - llvm::StringMap<Lambda> &Lambdas, - llvm::StringMap<SectionLambda> &SectionLambdas, EscapeMap &Escapes) - : Partials(Partials), Lambdas(Lambdas), SectionLambdas(SectionLambdas), - Escapes(Escapes), Ty(Type::Text), Body(std::move(Body)), Parent(Parent), + ASTNode(MustacheContext &Ctx, std::string Body, ASTNode *Parent) + : Ctx(Ctx), Ty(Type::Text), Body(std::move(Body)), Parent(Parent), ParentContext(nullptr) {} // Constructor for Section/InvertSection/Variable/UnescapeVariable Nodes - ASTNode(Type Ty, Accessor Accessor, ASTNode *Parent, - llvm::StringMap<AstPtr> &Partials, llvm::StringMap<Lambda> &Lambdas, - llvm::StringMap<SectionLambda> &SectionLambdas, EscapeMap &Escapes) - : Partials(Partials), Lambdas(Lambdas), SectionLambdas(SectionLambdas), - Escapes(Escapes), Ty(Ty), Parent(Parent), - AccessorValue(std::move(Accessor)), ParentContext(nullptr) {} + ASTNode(MustacheContext &Ctx, Type Ty, Accessor Accessor, ASTNode *Parent) + : Ctx(Ctx), Ty(Ty), Parent(Parent), AccessorValue(std::move(Accessor)), + ParentContext(nullptr) {} void addChild(AstPtr Child) { Children.emplace_back(std::move(Child)); }; @@ -173,10 +172,15 @@ private: const llvm::json::Value *findContext(); - StringMap<AstPtr> &Partials; - StringMap<Lambda> &Lambdas; - StringMap<SectionLambda> &SectionLambdas; - EscapeMap &Escapes; + void renderRoot(const json::Value &CurrentCtx, raw_ostream &OS); + void 
renderText(raw_ostream &OS); + void renderPartial(const json::Value &CurrentCtx, raw_ostream &OS); + void renderVariable(const json::Value &CurrentCtx, raw_ostream &OS); + void renderUnescapeVariable(const json::Value &CurrentCtx, raw_ostream &OS); + void renderSection(const json::Value &CurrentCtx, raw_ostream &OS); + void renderInvertSection(const json::Value &CurrentCtx, raw_ostream &OS); + + MustacheContext &Ctx; Type Ty; size_t Indentation = 0; std::string RawBody; @@ -189,29 +193,18 @@ private: }; // A wrapper for arena allocator for ASTNodes -AstPtr createRootNode(llvm::StringMap<AstPtr> &Partials, - llvm::StringMap<Lambda> &Lambdas, - llvm::StringMap<SectionLambda> &SectionLambdas, - EscapeMap &Escapes) { - return std::make_unique<ASTNode>(Partials, Lambdas, SectionLambdas, Escapes); +static AstPtr createRootNode(MustacheContext &Ctx) { + return std::make_unique<ASTNode>(Ctx); } -AstPtr createNode(ASTNode::Type T, Accessor A, ASTNode *Parent, - llvm::StringMap<AstPtr> &Partials, - llvm::StringMap<Lambda> &Lambdas, - llvm::StringMap<SectionLambda> &SectionLambdas, - EscapeMap &Escapes) { - return std::make_unique<ASTNode>(T, std::move(A), Parent, Partials, Lambdas, - SectionLambdas, Escapes); +static AstPtr createNode(MustacheContext &Ctx, ASTNode::Type T, Accessor A, + ASTNode *Parent) { + return std::make_unique<ASTNode>(Ctx, T, std::move(A), Parent); } -AstPtr createTextNode(std::string Body, ASTNode *Parent, - llvm::StringMap<AstPtr> &Partials, - llvm::StringMap<Lambda> &Lambdas, - llvm::StringMap<SectionLambda> &SectionLambdas, - EscapeMap &Escapes) { - return std::make_unique<ASTNode>(std::move(Body), Parent, Partials, Lambdas, - SectionLambdas, Escapes); +static AstPtr createTextNode(MustacheContext &Ctx, std::string Body, + ASTNode *Parent) { + return std::make_unique<ASTNode>(Ctx, std::move(Body), Parent); } // Function to check if there is meaningful text behind. @@ -226,7 +219,7 @@ AstPtr createTextNode(std::string Body, ASTNode *Parent, // and the current token is the second token. // For example: // "{{#Section}}" -bool hasTextBehind(size_t Idx, const ArrayRef<Token> &Tokens) { +static bool hasTextBehind(size_t Idx, const ArrayRef<Token> &Tokens) { if (Idx == 0) return true; @@ -242,7 +235,7 @@ bool hasTextBehind(size_t Idx, const ArrayRef<Token> &Tokens) { // Function to check if there's no meaningful text ahead. // We determine if a token has text ahead if the left of previous // token does not start with a newline. -bool hasTextAhead(size_t Idx, const ArrayRef<Token> &Tokens) { +static bool hasTextAhead(size_t Idx, const ArrayRef<Token> &Tokens) { if (Idx >= Tokens.size() - 1) return true; @@ -255,11 +248,11 @@ bool hasTextAhead(size_t Idx, const ArrayRef<Token> &Tokens) { return !TokenBody.starts_with("\r\n") && !TokenBody.starts_with("\n"); } -bool requiresCleanUp(Token::Type T) { +static bool requiresCleanUp(Token::Type T) { // We must clean up all the tokens that could contain child nodes. return T == Token::Type::SectionOpen || T == Token::Type::InvertSectionOpen || T == Token::Type::SectionClose || T == Token::Type::Comment || - T == Token::Type::Partial; + T == Token::Type::Partial || T == Token::Type::SetDelimiter; } // Adjust next token body if there is no text ahead. @@ -268,7 +261,7 @@ bool requiresCleanUp(Token::Type T) { // "{{! 
Comment }} \nLine 2" // would be considered as no text ahead and should be rendered as // " Line 2" -void stripTokenAhead(SmallVectorImpl<Token> &Tokens, size_t Idx) { +static void stripTokenAhead(SmallVectorImpl<Token> &Tokens, size_t Idx) { Token &NextToken = Tokens[Idx + 1]; StringRef NextTokenBody = NextToken.TokenBody; // Cut off the leading newline which could be \n or \r\n. @@ -294,57 +287,128 @@ void stripTokenBefore(SmallVectorImpl<Token> &Tokens, size_t Idx, CurrentToken.setIndentation(Indentation); } +struct Tag { + enum class Kind { + None, + Normal, // {{...}} + Triple, // {{{...}}} + }; + + Kind TagKind = Kind::None; + StringRef Content; // The content between the delimiters. + StringRef FullMatch; // The entire tag, including delimiters. + size_t StartPosition = StringRef::npos; +}; + +static Tag findNextTag(StringRef Template, size_t StartPos, StringRef Open, + StringRef Close) { + const StringLiteral TripleOpen("{{{"); + const StringLiteral TripleClose("}}}"); + + size_t NormalOpenPos = Template.find(Open, StartPos); + size_t TripleOpenPos = Template.find(TripleOpen, StartPos); + + Tag Result; + + // Determine which tag comes first. + if (TripleOpenPos != StringRef::npos && + (NormalOpenPos == StringRef::npos || TripleOpenPos <= NormalOpenPos)) { + // Found a triple mustache tag. + size_t EndPos = + Template.find(TripleClose, TripleOpenPos + TripleOpen.size()); + if (EndPos == StringRef::npos) + return Result; // No closing tag found. + + Result.TagKind = Tag::Kind::Triple; + Result.StartPosition = TripleOpenPos; + size_t ContentStart = TripleOpenPos + TripleOpen.size(); + Result.Content = Template.substr(ContentStart, EndPos - ContentStart); + Result.FullMatch = Template.substr( + TripleOpenPos, (EndPos + TripleClose.size()) - TripleOpenPos); + } else if (NormalOpenPos != StringRef::npos) { + // Found a normal mustache tag. + size_t EndPos = Template.find(Close, NormalOpenPos + Open.size()); + if (EndPos == StringRef::npos) + return Result; // No closing tag found. + + Result.TagKind = Tag::Kind::Normal; + Result.StartPosition = NormalOpenPos; + size_t ContentStart = NormalOpenPos + Open.size(); + Result.Content = Template.substr(ContentStart, EndPos - ContentStart); + Result.FullMatch = + Template.substr(NormalOpenPos, (EndPos + Close.size()) - NormalOpenPos); + } + + return Result; +} + +static std::optional<std::pair<StringRef, StringRef>> +processTag(const Tag &T, SmallVectorImpl<Token> &Tokens) { + LLVM_DEBUG(dbgs() << " Found tag: \"" << T.FullMatch << "\", Content: \"" + << T.Content << "\"\n"); + if (T.TagKind == Tag::Kind::Triple) { + Tokens.emplace_back(T.FullMatch.str(), "&" + T.Content.str(), '&'); + LLVM_DEBUG(dbgs() << " Created UnescapeVariable token.\n"); + return std::nullopt; + } + StringRef Interpolated = T.Content; + std::string RawBody = T.FullMatch.str(); + if (!Interpolated.trim().starts_with("=")) { + char Front = Interpolated.empty() ? ' ' : Interpolated.trim().front(); + Tokens.emplace_back(RawBody, Interpolated.str(), Front); + LLVM_DEBUG(dbgs() << " Created tag token of type '" << Front << "'\n"); + return std::nullopt; + } + Tokens.emplace_back(RawBody, Interpolated.str(), '='); + StringRef DelimSpec = Interpolated.trim(); + DelimSpec = DelimSpec.drop_front(1); + DelimSpec = DelimSpec.take_until([](char C) { return C == '='; }); + DelimSpec = DelimSpec.trim(); + + std::pair<StringRef, StringRef> Ret = DelimSpec.split(' '); + LLVM_DEBUG(dbgs() << " Found Set Delimiter tag. 
NewOpen='" << Ret.first + << "', NewClose='" << Ret.second << "'\n"); + return Ret; +} + // Simple tokenizer that splits the template into tokens. // The mustache spec allows {{{ }}} to unescape variables, // but we don't support that here. An unescape variable // is represented only by {{& variable}}. -SmallVector<Token> tokenize(StringRef Template) { +static SmallVector<Token> tokenize(StringRef Template) { + LLVM_DEBUG(dbgs() << "Tokenizing template: \"" << Template << "\"\n"); SmallVector<Token> Tokens; - StringLiteral Open("{{"); - StringLiteral Close("}}"); - StringLiteral TripleOpen("{{{"); - StringLiteral TripleClose("}}}"); + SmallString<8> Open("{{"); + SmallString<8> Close("}}"); size_t Start = 0; - size_t DelimiterStart = Template.find(Open); - if (DelimiterStart == StringRef::npos) { - Tokens.emplace_back(Template.str()); - return Tokens; - } - while (DelimiterStart != StringRef::npos) { - if (DelimiterStart != Start) - Tokens.emplace_back(Template.substr(Start, DelimiterStart - Start).str()); - - if (Template.substr(DelimiterStart).starts_with(TripleOpen)) { - size_t DelimiterEnd = Template.find(TripleClose, DelimiterStart); - if (DelimiterEnd == StringRef::npos) - break; - size_t BodyStart = DelimiterStart + TripleOpen.size(); - std::string Body = - Template.substr(BodyStart, DelimiterEnd - BodyStart).str(); - std::string RawBody = - Template.substr(DelimiterStart, DelimiterEnd - DelimiterStart + 3) - .str(); - Tokens.emplace_back(RawBody, "&" + Body, '&'); - Start = DelimiterEnd + TripleClose.size(); - } else { - size_t DelimiterEnd = Template.find(Close, DelimiterStart); - if (DelimiterEnd == StringRef::npos) - break; - - // Extract the Interpolated variable without delimiters. - size_t InterpolatedStart = DelimiterStart + Open.size(); - size_t InterpolatedEnd = DelimiterEnd - DelimiterStart - Close.size(); - std::string Interpolated = - Template.substr(InterpolatedStart, InterpolatedEnd).str(); - std::string RawBody = Open.str() + Interpolated + Close.str(); - Tokens.emplace_back(RawBody, Interpolated, Interpolated[0]); - Start = DelimiterEnd + Close.size(); + + while (Start < Template.size()) { + LLVM_DEBUG(dbgs() << "Loop start. Start=" << Start << ", Open='" << Open + << "', Close='" << Close << "'\n"); + Tag T = findNextTag(Template, Start, Open, Close); + + if (T.TagKind == Tag::Kind::None) { + // No more tags, the rest is text. + Tokens.emplace_back(Template.substr(Start).str()); + LLVM_DEBUG(dbgs() << " No more tags. Created final Text token: \"" + << Template.substr(Start) << "\"\n"); + break; + } + + // Add the text before the tag. + if (T.StartPosition > Start) { + StringRef Text = Template.substr(Start, T.StartPosition - Start); + Tokens.emplace_back(Text.str()); + LLVM_DEBUG(dbgs() << " Created Text token: \"" << Text << "\"\n"); } - DelimiterStart = Template.find(Open, Start); - } - if (Start < Template.size()) - Tokens.emplace_back(Template.substr(Start).str()); + if (auto NewDelims = processTag(T, Tokens)) { + std::tie(Open, Close) = *NewDelims; + } + + // Move past the tag. 
+ Start = T.StartPosition + T.FullMatch.size(); + } // Fix up white spaces for: // - open sections @@ -386,6 +450,7 @@ SmallVector<Token> tokenize(StringRef Template) { if ((!HasTextBehind && !HasTextAhead) || (!HasTextBehind && Idx == LastIdx)) stripTokenBefore(Tokens, Idx, CurrentToken, CurrentType); } + LLVM_DEBUG(dbgs() << "Tokenizing finished.\n"); return Tokens; } @@ -468,39 +533,43 @@ private: class Parser { public: - Parser(StringRef TemplateStr) : TemplateStr(TemplateStr) {} + Parser(StringRef TemplateStr, MustacheContext &Ctx) + : Ctx(Ctx), TemplateStr(TemplateStr) {} - AstPtr parse(llvm::StringMap<AstPtr> &Partials, - llvm::StringMap<Lambda> &Lambdas, - llvm::StringMap<SectionLambda> &SectionLambdas, - EscapeMap &Escapes); + AstPtr parse(); private: - void parseMustache(ASTNode *Parent, llvm::StringMap<AstPtr> &Partials, - llvm::StringMap<Lambda> &Lambdas, - llvm::StringMap<SectionLambda> &SectionLambdas, - EscapeMap &Escapes); + void parseMustache(ASTNode *Parent); + void parseSection(ASTNode *Parent, ASTNode::Type Ty, const Accessor &A); + MustacheContext &Ctx; SmallVector<Token> Tokens; size_t CurrentPtr; StringRef TemplateStr; }; -AstPtr Parser::parse(llvm::StringMap<AstPtr> &Partials, - llvm::StringMap<Lambda> &Lambdas, - llvm::StringMap<SectionLambda> &SectionLambdas, - EscapeMap &Escapes) { +void Parser::parseSection(ASTNode *Parent, ASTNode::Type Ty, + const Accessor &A) { + AstPtr CurrentNode = createNode(Ctx, Ty, A, Parent); + size_t Start = CurrentPtr; + parseMustache(CurrentNode.get()); + const size_t End = CurrentPtr - 1; + std::string RawBody; + for (std::size_t I = Start; I < End; I++) + RawBody += Tokens[I].RawBody; + CurrentNode->setRawBody(std::move(RawBody)); + Parent->addChild(std::move(CurrentNode)); +} + +AstPtr Parser::parse() { Tokens = tokenize(TemplateStr); CurrentPtr = 0; - AstPtr RootNode = createRootNode(Partials, Lambdas, SectionLambdas, Escapes); - parseMustache(RootNode.get(), Partials, Lambdas, SectionLambdas, Escapes); + AstPtr RootNode = createRootNode(Ctx); + parseMustache(RootNode.get()); return RootNode; } -void Parser::parseMustache(ASTNode *Parent, llvm::StringMap<AstPtr> &Partials, - llvm::StringMap<Lambda> &Lambdas, - llvm::StringMap<SectionLambda> &SectionLambdas, - EscapeMap &Escapes) { +void Parser::parseMustache(ASTNode *Parent) { while (CurrentPtr < Tokens.size()) { Token CurrentToken = Tokens[CurrentPtr]; @@ -510,66 +579,45 @@ void Parser::parseMustache(ASTNode *Parent, llvm::StringMap<AstPtr> &Partials, switch (CurrentToken.getType()) { case Token::Type::Text: { - CurrentNode = createTextNode(std::move(CurrentToken.TokenBody), Parent, - Partials, Lambdas, SectionLambdas, Escapes); + CurrentNode = + createTextNode(Ctx, std::move(CurrentToken.TokenBody), Parent); Parent->addChild(std::move(CurrentNode)); break; } case Token::Type::Variable: { - CurrentNode = createNode(ASTNode::Variable, std::move(A), Parent, - Partials, Lambdas, SectionLambdas, Escapes); + CurrentNode = createNode(Ctx, ASTNode::Variable, std::move(A), Parent); Parent->addChild(std::move(CurrentNode)); break; } case Token::Type::UnescapeVariable: { - CurrentNode = createNode(ASTNode::UnescapeVariable, std::move(A), Parent, - Partials, Lambdas, SectionLambdas, Escapes); + CurrentNode = + createNode(Ctx, ASTNode::UnescapeVariable, std::move(A), Parent); Parent->addChild(std::move(CurrentNode)); break; } case Token::Type::Partial: { - CurrentNode = createNode(ASTNode::Partial, std::move(A), Parent, Partials, - Lambdas, SectionLambdas, Escapes); + CurrentNode = 
createNode(Ctx, ASTNode::Partial, std::move(A), Parent); CurrentNode->setIndentation(CurrentToken.getIndentation()); Parent->addChild(std::move(CurrentNode)); break; } case Token::Type::SectionOpen: { - CurrentNode = createNode(ASTNode::Section, A, Parent, Partials, Lambdas, - SectionLambdas, Escapes); - size_t Start = CurrentPtr; - parseMustache(CurrentNode.get(), Partials, Lambdas, SectionLambdas, - Escapes); - const size_t End = CurrentPtr - 1; - std::string RawBody; - for (std::size_t I = Start; I < End; I++) - RawBody += Tokens[I].RawBody; - CurrentNode->setRawBody(std::move(RawBody)); - Parent->addChild(std::move(CurrentNode)); + parseSection(Parent, ASTNode::Section, A); break; } case Token::Type::InvertSectionOpen: { - CurrentNode = createNode(ASTNode::InvertSection, A, Parent, Partials, - Lambdas, SectionLambdas, Escapes); - size_t Start = CurrentPtr; - parseMustache(CurrentNode.get(), Partials, Lambdas, SectionLambdas, - Escapes); - const size_t End = CurrentPtr - 1; - std::string RawBody; - for (size_t Idx = Start; Idx < End; Idx++) - RawBody += Tokens[Idx].RawBody; - CurrentNode->setRawBody(std::move(RawBody)); - Parent->addChild(std::move(CurrentNode)); + parseSection(Parent, ASTNode::InvertSection, A); break; } case Token::Type::Comment: + case Token::Type::SetDelimiter: break; case Token::Type::SectionClose: return; } } } -void toMustacheString(const json::Value &Data, raw_ostream &OS) { +static void toMustacheString(const json::Value &Data, raw_ostream &OS) { switch (Data.kind()) { case json::Value::Null: return; @@ -601,74 +649,96 @@ void toMustacheString(const json::Value &Data, raw_ostream &OS) { } } +void ASTNode::renderRoot(const json::Value &CurrentCtx, raw_ostream &OS) { + renderChild(CurrentCtx, OS); +} + +void ASTNode::renderText(raw_ostream &OS) { OS << Body; } + +void ASTNode::renderPartial(const json::Value &CurrentCtx, raw_ostream &OS) { + auto Partial = Ctx.Partials.find(AccessorValue[0]); + if (Partial != Ctx.Partials.end()) + renderPartial(CurrentCtx, OS, Partial->getValue().get()); +} + +void ASTNode::renderVariable(const json::Value &CurrentCtx, raw_ostream &OS) { + auto Lambda = Ctx.Lambdas.find(AccessorValue[0]); + if (Lambda != Ctx.Lambdas.end()) { + renderLambdas(CurrentCtx, OS, Lambda->getValue()); + } else if (const json::Value *ContextPtr = findContext()) { + EscapeStringStream ES(OS, Ctx.Escapes); + toMustacheString(*ContextPtr, ES); + } +} + +void ASTNode::renderUnescapeVariable(const json::Value &CurrentCtx, + raw_ostream &OS) { + auto Lambda = Ctx.Lambdas.find(AccessorValue[0]); + if (Lambda != Ctx.Lambdas.end()) { + renderLambdas(CurrentCtx, OS, Lambda->getValue()); + } else if (const json::Value *ContextPtr = findContext()) { + toMustacheString(*ContextPtr, OS); + } +} + +void ASTNode::renderSection(const json::Value &CurrentCtx, raw_ostream &OS) { + auto SectionLambda = Ctx.SectionLambdas.find(AccessorValue[0]); + if (SectionLambda != Ctx.SectionLambdas.end()) { + renderSectionLambdas(CurrentCtx, OS, SectionLambda->getValue()); + return; + } + + const json::Value *ContextPtr = findContext(); + if (isContextFalsey(ContextPtr)) + return; + + if (const json::Array *Arr = ContextPtr->getAsArray()) { + for (const json::Value &V : *Arr) + renderChild(V, OS); + return; + } + renderChild(*ContextPtr, OS); +} + +void ASTNode::renderInvertSection(const json::Value &CurrentCtx, + raw_ostream &OS) { + bool IsLambda = Ctx.SectionLambdas.contains(AccessorValue[0]); + const json::Value *ContextPtr = findContext(); + if (isContextFalsey(ContextPtr) && 
!IsLambda) { + renderChild(CurrentCtx, OS); + } +} + void ASTNode::render(const json::Value &CurrentCtx, raw_ostream &OS) { + if (Ty != Root && Ty != Text && AccessorValue.empty()) + return; // Set the parent context to the incoming context so that we // can walk up the context tree correctly in findContext(). ParentContext = &CurrentCtx; - const json::Value *ContextPtr = Ty == Root ? ParentContext : findContext(); switch (Ty) { case Root: - renderChild(CurrentCtx, OS); + renderRoot(CurrentCtx, OS); return; case Text: - OS << Body; + renderText(OS); return; - case Partial: { - auto Partial = Partials.find(AccessorValue[0]); - if (Partial != Partials.end()) - renderPartial(CurrentCtx, OS, Partial->getValue().get()); + case Partial: + renderPartial(CurrentCtx, OS); return; - } - case Variable: { - auto Lambda = Lambdas.find(AccessorValue[0]); - if (Lambda != Lambdas.end()) { - renderLambdas(CurrentCtx, OS, Lambda->getValue()); - } else if (ContextPtr) { - EscapeStringStream ES(OS, Escapes); - toMustacheString(*ContextPtr, ES); - } + case Variable: + renderVariable(CurrentCtx, OS); return; - } - case UnescapeVariable: { - auto Lambda = Lambdas.find(AccessorValue[0]); - if (Lambda != Lambdas.end()) { - renderLambdas(CurrentCtx, OS, Lambda->getValue()); - } else if (ContextPtr) { - toMustacheString(*ContextPtr, OS); - } + case UnescapeVariable: + renderUnescapeVariable(CurrentCtx, OS); return; - } - case Section: { - auto SectionLambda = SectionLambdas.find(AccessorValue[0]); - bool IsLambda = SectionLambda != SectionLambdas.end(); - - if (IsLambda) { - renderSectionLambdas(CurrentCtx, OS, SectionLambda->getValue()); - return; - } - - if (isContextFalsey(ContextPtr)) - return; - - if (const json::Array *Arr = ContextPtr->getAsArray()) { - for (const json::Value &V : *Arr) - renderChild(V, OS); - return; - } - renderChild(*ContextPtr, OS); + case Section: + renderSection(CurrentCtx, OS); return; - } - case InvertSection: { - bool IsLambda = SectionLambdas.contains(AccessorValue[0]); - if (isContextFalsey(ContextPtr) && !IsLambda) { - // The context for the children remains unchanged from the parent's, so - // we pass this node's original incoming context. 
- renderChild(CurrentCtx, OS); - } + case InvertSection: + renderInvertSection(CurrentCtx, OS); return; } - } llvm_unreachable("Invalid ASTNode type"); } @@ -728,10 +798,10 @@ void ASTNode::renderLambdas(const json::Value &Contexts, llvm::raw_ostream &OS, std::string LambdaStr; raw_string_ostream Output(LambdaStr); toMustacheString(LambdaResult, Output); - Parser P = Parser(LambdaStr); - AstPtr LambdaNode = P.parse(Partials, Lambdas, SectionLambdas, Escapes); + Parser P(LambdaStr, Ctx); + AstPtr LambdaNode = P.parse(); - EscapeStringStream ES(OS, Escapes); + EscapeStringStream ES(OS, Ctx.Escapes); if (Ty == Variable) { LambdaNode->render(Contexts, ES); return; @@ -747,8 +817,8 @@ void ASTNode::renderSectionLambdas(const json::Value &Contexts, std::string LambdaStr; raw_string_ostream Output(LambdaStr); toMustacheString(Return, Output); - Parser P = Parser(LambdaStr); - AstPtr LambdaNode = P.parse(Partials, Lambdas, SectionLambdas, Escapes); + Parser P(LambdaStr, Ctx); + AstPtr LambdaNode = P.parse(); LambdaNode->render(Contexts, OS); } @@ -757,22 +827,26 @@ void Template::render(const json::Value &Data, llvm::raw_ostream &OS) { } void Template::registerPartial(std::string Name, std::string Partial) { - Parser P = Parser(Partial); - AstPtr PartialTree = P.parse(Partials, Lambdas, SectionLambdas, Escapes); - Partials.insert(std::make_pair(Name, std::move(PartialTree))); + Parser P(Partial, Ctx); + AstPtr PartialTree = P.parse(); + Ctx.Partials.insert(std::make_pair(Name, std::move(PartialTree))); } -void Template::registerLambda(std::string Name, Lambda L) { Lambdas[Name] = L; } +void Template::registerLambda(std::string Name, Lambda L) { + Ctx.Lambdas[Name] = L; +} void Template::registerLambda(std::string Name, SectionLambda L) { - SectionLambdas[Name] = L; + Ctx.SectionLambdas[Name] = L; } -void Template::overrideEscapeCharacters(EscapeMap E) { Escapes = std::move(E); } +void Template::overrideEscapeCharacters(EscapeMap E) { + Ctx.Escapes = std::move(E); +} Template::Template(StringRef TemplateStr) { - Parser P = Parser(TemplateStr); - Tree = P.parse(Partials, Lambdas, SectionLambdas, Escapes); + Parser P(TemplateStr, Ctx); + Tree = P.parse(); // The default behavior is to escape html entities. 
const EscapeMap HtmlEntities = {{'&', "&"}, {'<', "<"}, @@ -783,21 +857,18 @@ Template::Template(StringRef TemplateStr) { } Template::Template(Template &&Other) noexcept - : Partials(std::move(Other.Partials)), Lambdas(std::move(Other.Lambdas)), - SectionLambdas(std::move(Other.SectionLambdas)), - Escapes(std::move(Other.Escapes)), Tree(std::move(Other.Tree)) {} + : Ctx(std::move(Other.Ctx)), Tree(std::move(Other.Tree)) {} Template::~Template() = default; Template &Template::operator=(Template &&Other) noexcept { if (this != &Other) { - Partials = std::move(Other.Partials); - Lambdas = std::move(Other.Lambdas); - SectionLambdas = std::move(Other.SectionLambdas); - Escapes = std::move(Other.Escapes); + Ctx = std::move(Other.Ctx); Tree = std::move(Other.Tree); Other.Tree = nullptr; } return *this; } } // namespace llvm::mustache + +#undef DEBUG_TYPE diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 899baa9..45f5235 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -18867,21 +18867,25 @@ performActiveLaneMaskCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, (!ST->hasSVE2p1() && !(ST->hasSME2() && ST->isStreaming()))) return SDValue(); - unsigned NumUses = N->use_size(); + // Count the number of users which are extract_vectors. + unsigned NumExts = count_if(N->users(), [](SDNode *Use) { + return Use->getOpcode() == ISD::EXTRACT_SUBVECTOR; + }); + auto MaskEC = N->getValueType(0).getVectorElementCount(); - if (!MaskEC.isKnownMultipleOf(NumUses)) + if (!MaskEC.isKnownMultipleOf(NumExts)) return SDValue(); - ElementCount ExtMinEC = MaskEC.divideCoefficientBy(NumUses); + ElementCount ExtMinEC = MaskEC.divideCoefficientBy(NumExts); if (ExtMinEC.getKnownMinValue() < 2) return SDValue(); - SmallVector<SDNode *> Extracts(NumUses, nullptr); + SmallVector<SDNode *> Extracts(NumExts, nullptr); for (SDNode *Use : N->users()) { if (Use->getOpcode() != ISD::EXTRACT_SUBVECTOR) - return SDValue(); + continue; - // Ensure the extract type is correct (e.g. if NumUses is 4 and + // Ensure the extract type is correct (e.g. if NumExts is 4 and // the mask return type is nxv8i1, each extract should be nxv2i1. 
if (Use->getValueType(0).getVectorElementCount() != ExtMinEC) return SDValue(); @@ -18902,32 +18906,39 @@ performActiveLaneMaskCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SDValue Idx = N->getOperand(0); SDValue TC = N->getOperand(1); - EVT OpVT = Idx.getValueType(); - if (OpVT != MVT::i64) { + if (Idx.getValueType() != MVT::i64) { Idx = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Idx); TC = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, TC); } // Create the whilelo_x2 intrinsics from each pair of extracts EVT ExtVT = Extracts[0]->getValueType(0); + EVT DoubleExtVT = ExtVT.getDoubleNumVectorElementsVT(*DAG.getContext()); auto R = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, {ExtVT, ExtVT}, {ID, Idx, TC}); DCI.CombineTo(Extracts[0], R.getValue(0)); DCI.CombineTo(Extracts[1], R.getValue(1)); + SmallVector<SDValue> Concats = {DAG.getNode( + ISD::CONCAT_VECTORS, DL, DoubleExtVT, R.getValue(0), R.getValue(1))}; - if (NumUses == 2) - return SDValue(N, 0); + if (NumExts == 2) { + assert(N->getValueType(0) == DoubleExtVT); + return Concats[0]; + } - auto Elts = DAG.getElementCount(DL, OpVT, ExtVT.getVectorElementCount() * 2); - for (unsigned I = 2; I < NumUses; I += 2) { + auto Elts = + DAG.getElementCount(DL, MVT::i64, ExtVT.getVectorElementCount() * 2); + for (unsigned I = 2; I < NumExts; I += 2) { // After the first whilelo_x2, we need to increment the starting value. - Idx = DAG.getNode(ISD::UADDSAT, DL, OpVT, Idx, Elts); + Idx = DAG.getNode(ISD::UADDSAT, DL, MVT::i64, Idx, Elts); R = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, {ExtVT, ExtVT}, {ID, Idx, TC}); DCI.CombineTo(Extracts[I], R.getValue(0)); DCI.CombineTo(Extracts[I + 1], R.getValue(1)); + Concats.push_back(DAG.getNode(ISD::CONCAT_VECTORS, DL, DoubleExtVT, + R.getValue(0), R.getValue(1))); } - return SDValue(N, 0); + return DAG.getNode(ISD::CONCAT_VECTORS, DL, N->getValueType(0), Concats); } // Turn a v8i8/v16i8 extended vecreduce into a udot/sdot and vecreduce @@ -25512,6 +25523,32 @@ SDValue performCONDCombine(SDNode *N, CmpIndex, CC)) return Val; + // X & M ?= C --> (C << clz(M)) ?= (X << clz(M)) where M is a non-empty + // sequence of ones starting at the least significant bit with the remainder + // zero and C is a constant s.t. (C & ~M) == 0 that cannot be materialised + // into a SUBS (immediate). The transformed form can be matched into a SUBS + // (shifted register). 
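+  // Illustrative example (values not taken from a test): for a 64-bit X with
+  // M = 0xffff and C = 0x1234, C does not fit a 12-bit SUBS immediate, but
+  // clz(M) = 48, so "(X & 0xffff) == 0x1234" is rewritten as
+  // "(X << 48) == (0x1234 << 48)", i.e. a SUBS of a single MOVZ-materialisable
+  // constant against X, LSL #48.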
+ if ((CC == AArch64CC::EQ || CC == AArch64CC::NE) && AndNode->hasOneUse() && + isa<ConstantSDNode>(AndNode->getOperand(1)) && + isa<ConstantSDNode>(SubsNode->getOperand(1))) { + SDValue X = AndNode->getOperand(0); + APInt M = AndNode->getConstantOperandAPInt(1); + APInt C = SubsNode->getConstantOperandAPInt(1); + + if (M.isMask() && C.isSubsetOf(M) && !isLegalArithImmed(C.getZExtValue())) { + SDLoc DL(SubsNode); + EVT VT = SubsNode->getValueType(0); + unsigned ShiftAmt = M.countl_zero(); + SDValue ShiftedX = DAG.getNode( + ISD::SHL, DL, VT, X, DAG.getShiftAmountConstant(ShiftAmt, VT, DL)); + SDValue ShiftedC = DAG.getConstant(C << ShiftAmt, DL, VT); + SDValue NewSubs = DAG.getNode(AArch64ISD::SUBS, DL, SubsNode->getVTList(), + ShiftedC, ShiftedX); + DCI.CombineTo(SubsNode, NewSubs, NewSubs.getValue(1)); + return SDValue(N, 0); + } + } + if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(AndNode->getOperand(1))) { uint32_t CNV = CN->getZExtValue(); if (CNV == 255) diff --git a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp index 7947469..09b3643 100644 --- a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp +++ b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp @@ -541,6 +541,13 @@ void AArch64PrologueEmitter::emitPrologue() { // to determine the end of the prologue. DebugLoc DL; + // In some cases, particularly with CallingConv::SwiftTail, it is possible to + // have a tail-call where the caller only needs to adjust the stack pointer in + // the epilogue. In this case, we still need to emit a SEH prologue sequence. + // See `seh-minimal-prologue-epilogue.ll` test cases. + if (AFI->getArgumentStackToRestore()) + HasWinCFI = true; + if (AFI->shouldSignReturnAddress(MF)) { // If pac-ret+leaf is in effect, PAUTH_PROLOGUE pseudo instructions // are inserted by emitPacRetPlusLeafHardening(). diff --git a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp index cced0fa..4749748 100644 --- a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp +++ b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp @@ -22,7 +22,7 @@ // To handle ZA state across control flow, we make use of edge bundling. This // assigns each block an "incoming" and "outgoing" edge bundle (representing // incoming and outgoing edges). Initially, these are unique to each block; -// then, in the process of forming bundles, the outgoing block of a block is +// then, in the process of forming bundles, the outgoing bundle of a block is // joined with the incoming bundle of all successors. 
The result is that each // bundle can be assigned a single ZA state, which ensures the state required by // all a blocks' successors is the same, and that each basic block will always diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index 0776d14..f413bbc 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -840,7 +840,9 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, .Any({{B128, Ptr32}, {{}, {VgprB128, VgprPtr32}}}); // clang-format on - addRulesForGOpcs({G_AMDGPU_BUFFER_LOAD}, StandardB) + addRulesForGOpcs({G_AMDGPU_BUFFER_LOAD, G_AMDGPU_BUFFER_LOAD_FORMAT, + G_AMDGPU_TBUFFER_LOAD_FORMAT}, + StandardB) .Div(B32, {{VgprB32}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}) .Uni(B32, {{UniInVgprB32}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}) .Div(B64, {{VgprB64}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 31a2d55..c2252af 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -1006,9 +1006,8 @@ public: Opcode == AMDGPU::S_BARRIER_INIT_M0 || Opcode == AMDGPU::S_BARRIER_INIT_IMM || Opcode == AMDGPU::S_BARRIER_JOIN_IMM || - Opcode == AMDGPU::S_BARRIER_LEAVE || - Opcode == AMDGPU::S_BARRIER_LEAVE_IMM || - Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_BARRIER; + Opcode == AMDGPU::S_BARRIER_LEAVE || Opcode == AMDGPU::DS_GWS_INIT || + Opcode == AMDGPU::DS_GWS_BARRIER; } static bool isF16PseudoScalarTrans(unsigned Opcode) { diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index 296ce5a..b3fd8c7 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -1616,7 +1616,8 @@ def S_BARRIER_WAIT : SOPP_Pseudo <"s_barrier_wait", (ins i16imm:$simm16), "$simm let isConvergent = 1; } -def S_BARRIER_LEAVE : SOPP_Pseudo <"s_barrier_leave", (ins)> { + def S_BARRIER_LEAVE : SOPP_Pseudo <"s_barrier_leave", + (ins), "", [(int_amdgcn_s_barrier_leave (i16 srcvalue))] > { let SchedRW = [WriteBarrier]; let simm16 = 0; let fixed_imm = 1; @@ -1624,9 +1625,6 @@ def S_BARRIER_LEAVE : SOPP_Pseudo <"s_barrier_leave", (ins)> { let Defs = [SCC]; } -def S_BARRIER_LEAVE_IMM : SOPP_Pseudo <"s_barrier_leave", - (ins i16imm:$simm16), "$simm16", [(int_amdgcn_s_barrier_leave timm:$simm16)]>; - def S_WAKEUP : SOPP_Pseudo <"s_wakeup", (ins) > { let SubtargetPredicate = isGFX8Plus; let simm16 = 0; diff --git a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp index e94220a..2e8a676 100644 --- a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -960,17 +960,3 @@ bool ARMBaseRegisterInfo::shouldCoalesce(MachineInstr *MI, } return false; } - -bool ARMBaseRegisterInfo::shouldRewriteCopySrc(const TargetRegisterClass *DefRC, - unsigned DefSubReg, - const TargetRegisterClass *SrcRC, - unsigned SrcSubReg) const { - // We can't extract an SPR from an arbitary DPR (as opposed to a DPR_VFP2). 
- if (DefRC == &ARM::SPRRegClass && DefSubReg == 0 && - SrcRC == &ARM::DPRRegClass && - (SrcSubReg == ARM::ssub_0 || SrcSubReg == ARM::ssub_1)) - return false; - - return TargetRegisterInfo::shouldRewriteCopySrc(DefRC, DefSubReg, - SrcRC, SrcSubReg); -} diff --git a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h index 5b67b34..03b0fa0 100644 --- a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h +++ b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h @@ -158,11 +158,6 @@ public: const TargetRegisterClass *NewRC, LiveIntervals &LIS) const override; - bool shouldRewriteCopySrc(const TargetRegisterClass *DefRC, - unsigned DefSubReg, - const TargetRegisterClass *SrcRC, - unsigned SrcSubReg) const override; - int getSEHRegNum(unsigned i) const { return getEncodingValue(i); } }; diff --git a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp index 5be4713..9b11201 100644 --- a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp +++ b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp @@ -957,8 +957,10 @@ void LoongArchAsmParser::emitLoadAddressAbs(MCInst &Inst, SMLoc IDLoc, : Inst.getOperand(2).getExpr(); InstSeq Insts; + // To distinguish between la.abs and %abs_hi20, la.abs will generate + // R_LARCH_MARK_LA and R_LARCH_ABS_HI20 relocations. Insts.push_back( - LoongArchAsmParser::Inst(LoongArch::LU12I_W, ELF::R_LARCH_ABS_HI20)); + LoongArchAsmParser::Inst(LoongArch::LU12I_W, ELF::R_LARCH_MARK_LA)); Insts.push_back( LoongArchAsmParser::Inst(LoongArch::ORI, ELF::R_LARCH_ABS_LO12)); diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index 098bcfa..4cfbfca 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -2319,6 +2319,53 @@ static SDValue lowerVECTOR_SHUFFLE_XVPICKOD(const SDLoc &DL, ArrayRef<int> Mask, return DAG.getNode(LoongArchISD::VPICKOD, DL, VT, V2, V1); } +/// Lower VECTOR_SHUFFLE into XVINSVE0 (if possible). +static SDValue +lowerVECTOR_SHUFFLE_XVINSVE0(const SDLoc &DL, ArrayRef<int> Mask, MVT VT, + SDValue V1, SDValue V2, SelectionDAG &DAG, + const LoongArchSubtarget &Subtarget) { + // LoongArch LASX only supports xvinsve0.{w/d}. + if (VT != MVT::v8i32 && VT != MVT::v8f32 && VT != MVT::v4i64 && + VT != MVT::v4f64) + return SDValue(); + + MVT GRLenVT = Subtarget.getGRLenVT(); + int MaskSize = Mask.size(); + assert(MaskSize == (int)VT.getVectorNumElements() && "Unexpected mask size"); + + // Check if exactly one element of the Mask is replaced by 'Replaced', while + // all other elements are either 'Base + i' or undef (-1). On success, return + // the index of the replaced element. Otherwise, just return -1. + auto checkReplaceOne = [&](int Base, int Replaced) -> int { + int Idx = -1; + for (int i = 0; i < MaskSize; ++i) { + if (Mask[i] == Base + i || Mask[i] == -1) + continue; + if (Mask[i] != Replaced) + return -1; + if (Idx == -1) + Idx = i; + else + return -1; + } + return Idx; + }; + + // Case 1: the lowest element of V2 replaces one element in V1. + int Idx = checkReplaceOne(0, MaskSize); + if (Idx != -1) + return DAG.getNode(LoongArchISD::XVINSVE0, DL, VT, V1, V2, + DAG.getConstant(Idx, DL, GRLenVT)); + + // Case 2: the lowest element of V1 replaces one element in V2. 
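+  // Illustrative example (made-up mask, not from a test): for v8i32,
+  // Mask = <8,9,10,0,12,13,14,15> is V2 with its lane 3 replaced by lane 0 of
+  // V1, so checkReplaceOne(8, 0) returns 3 and the shuffle becomes a single
+  // xvinsve0.w with the source operands swapped; Case 1 above matches the
+  // mirrored mask <0,1,2,8,4,5,6,7> the same way, without the swap.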
+ Idx = checkReplaceOne(MaskSize, 0); + if (Idx != -1) + return DAG.getNode(LoongArchISD::XVINSVE0, DL, VT, V2, V1, + DAG.getConstant(Idx, DL, GRLenVT)); + + return SDValue(); +} + /// Lower VECTOR_SHUFFLE into XVSHUF (if possible). static SDValue lowerVECTOR_SHUFFLE_XVSHUF(const SDLoc &DL, ArrayRef<int> Mask, MVT VT, SDValue V1, SDValue V2, @@ -2595,6 +2642,9 @@ static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT, if ((Result = lowerVECTOR_SHUFFLEAsShift(DL, Mask, VT, V1, V2, DAG, Subtarget, Zeroable))) return Result; + if ((Result = + lowerVECTOR_SHUFFLE_XVINSVE0(DL, Mask, VT, V1, V2, DAG, Subtarget))) + return Result; if ((Result = lowerVECTOR_SHUFFLEAsByteRotate(DL, Mask, VT, V1, V2, DAG, Subtarget))) return Result; @@ -7453,6 +7503,7 @@ const char *LoongArchTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(XVPERM) NODE_NAME_CASE(XVREPLVE0) NODE_NAME_CASE(XVREPLVE0Q) + NODE_NAME_CASE(XVINSVE0) NODE_NAME_CASE(VPICK_SEXT_ELT) NODE_NAME_CASE(VPICK_ZEXT_ELT) NODE_NAME_CASE(VREPLVE) diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h index 9b60a9f..8a4d774 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h @@ -151,6 +151,7 @@ enum NodeType : unsigned { XVPERM, XVREPLVE0, XVREPLVE0Q, + XVINSVE0, // Extended vector element extraction VPICK_SEXT_ELT, diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td index bbc0489..5143d53 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td @@ -20,6 +20,7 @@ def loongarch_xvpermi: SDNode<"LoongArchISD::XVPERMI", SDT_LoongArchV1RUimm>; def loongarch_xvperm: SDNode<"LoongArchISD::XVPERM", SDT_LoongArchXVPERM>; def loongarch_xvreplve0: SDNode<"LoongArchISD::XVREPLVE0", SDT_LoongArchXVREPLVE0>; def loongarch_xvreplve0q: SDNode<"LoongArchISD::XVREPLVE0Q", SDT_LoongArchXVREPLVE0>; +def loongarch_xvinsve0 : SDNode<"LoongArchISD::XVINSVE0", SDT_LoongArchV2RUimm>; def loongarch_xvmskltz: SDNode<"LoongArchISD::XVMSKLTZ", SDT_LoongArchVMSKCOND>; def loongarch_xvmskgez: SDNode<"LoongArchISD::XVMSKGEZ", SDT_LoongArchVMSKCOND>; def loongarch_xvmskeqz: SDNode<"LoongArchISD::XVMSKEQZ", SDT_LoongArchVMSKCOND>; @@ -1708,6 +1709,14 @@ def : Pat<(vector_insert v4f64:$xd, (f64(bitconvert i64:$rj)), uimm2:$imm), (XVINSGR2VR_D v4f64:$xd, GPR:$rj, uimm2:$imm)>; // XVINSVE0_{W/D} +def : Pat<(loongarch_xvinsve0 v8i32:$xd, v8i32:$xj, uimm3:$imm), + (XVINSVE0_W v8i32:$xd, v8i32:$xj, uimm3:$imm)>; +def : Pat<(loongarch_xvinsve0 v4i64:$xd, v4i64:$xj, uimm2:$imm), + (XVINSVE0_D v4i64:$xd, v4i64:$xj, uimm2:$imm)>; +def : Pat<(loongarch_xvinsve0 v8f32:$xd, v8f32:$xj, uimm3:$imm), + (XVINSVE0_W v8f32:$xd, v8f32:$xj, uimm3:$imm)>; +def : Pat<(loongarch_xvinsve0 v4f64:$xd, v4f64:$xj, uimm2:$imm), + (XVINSVE0_D v4f64:$xd, v4f64:$xj, uimm2:$imm)>; def : Pat<(vector_insert v8f32:$xd, FPR32:$fj, uimm3:$imm), (XVINSVE0_W v8f32:$xd, (SUBREG_TO_REG(i64 0), FPR32:$fj, sub_32), uimm3:$imm)>; diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCAsmInfo.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCAsmInfo.cpp index 0d77617..8ecb62d 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCAsmInfo.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCAsmInfo.cpp @@ -32,6 +32,7 @@ static StringRef getLoongArchSpecifierName(uint16_t S) { return "b16"; case 
ELF::R_LARCH_B21: return "b21"; + case ELF::R_LARCH_MARK_LA: case ELF::R_LARCH_ABS_HI20: return "abs_hi20"; case ELF::R_LARCH_ABS_LO12: diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp index b7ead5e..f0e2bc4 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp @@ -161,6 +161,13 @@ LoongArchMCCodeEmitter::getExprOpValue(const MCInst &MI, const MCOperand &MO, case ELF::R_LARCH_B26: FixupKind = LoongArch::fixup_loongarch_b26; break; + case ELF::R_LARCH_MARK_LA: + // Match gas behavior: generate `R_LARCH_MARK_LA` relocation when using + // `la.abs`. + Fixups.push_back( + MCFixup::create(0, MCConstantExpr::create(0, Ctx), + FirstLiteralRelocationKind + ELF::R_LARCH_MARK_LA)); + [[fallthrough]]; case ELF::R_LARCH_ABS_HI20: FixupKind = LoongArch::fixup_loongarch_abs_hi20; break; diff --git a/llvm/lib/Target/PowerPC/PPCInstrFuture.td b/llvm/lib/Target/PowerPC/PPCInstrFuture.td index c3ab965..1aefea1 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrFuture.td +++ b/llvm/lib/Target/PowerPC/PPCInstrFuture.td @@ -182,10 +182,113 @@ class XX3Form_XTAB6<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr, let Inst{31} = XT{5}; } +class XX3Form_XTAB6_S<bits<5> xo, dag OOL, dag IOL, string asmstr, + list<dag> pattern> + : I<59, OOL, IOL, asmstr, NoItinerary> { + bits<6> XT; + bits<6> XA; + bits<6> XB; + + let Pattern = pattern; + + let Inst{6...10} = XT{4...0}; + let Inst{11...15} = XA{4...0}; + let Inst{16...20} = XB{4...0}; + let Inst{24...28} = xo; + let Inst{29} = XA{5}; + let Inst{30} = XB{5}; + let Inst{31} = XT{5}; +} + +class XX3Form_XTAB6_S3<bits<5> xo, dag OOL, dag IOL, string asmstr, + list<dag> pattern> + : XX3Form_XTAB6_S<xo, OOL, IOL, asmstr, pattern> { + + bits<3> S; + let Inst{21...23} = S; +} + +class XX3Form_XTAB6_3S1<bits<5> xo, dag OOL, dag IOL, string asmstr, + list<dag> pattern> + : XX3Form_XTAB6_S<xo, OOL, IOL, asmstr, pattern> { + + bits<1> S0; + bits<1> S1; + bits<1> S2; + + let Inst{21} = S0; + let Inst{22} = S1; + let Inst{23} = S2; +} + +class XX3Form_XTAB6_2S1<bits<5> xo, dag OOL, dag IOL, string asmstr, + list<dag> pattern> + : XX3Form_XTAB6_S<xo, OOL, IOL, asmstr, pattern> { + + bits<1> S1; + bits<1> S2; + + let Inst{21} = 0; + let Inst{22} = S1; + let Inst{23} = S2; +} + +class XX3Form_XTAB6_P<bits<7> xo, dag OOL, dag IOL, string asmstr, + list<dag> pattern> + : I<59, OOL, IOL, asmstr, NoItinerary> { + + bits<6> XT; + bits<6> XA; + bits<6> XB; + bits<1> P; + + let Pattern = pattern; + + let Inst{6...10} = XT{4...0}; + let Inst{11...15} = XA{4...0}; + let Inst{16...20} = XB{4...0}; + let Inst{21} = P; + let Inst{22...28} = xo; + let Inst{29} = XA{5}; + let Inst{30} = XB{5}; + let Inst{31} = XT{5}; +} + +// Prefix instruction classes. + +class 8RR_XX4Form_XTABC6_P<bits<6> opcode, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : PI<1, opcode, OOL, IOL, asmstr, itin> { + bits<6> XT; + bits<6> XA; + bits<6> XB; + bits<6> XC; + bits<1> P; + + let Pattern = pattern; + + // The prefix. + let Inst{6...7} = 1; + let Inst{8...11} = 0; + + // The instruction. 
+ let Inst{38...42} = XT{4...0}; + let Inst{43...47} = XA{4...0}; + let Inst{48...52} = XB{4...0}; + let Inst{53...57} = XC{4...0}; + let Inst{58} = 1; + let Inst{59} = P; + let Inst{60} = XC{5}; + let Inst{61} = XA{5}; + let Inst{62} = XB{5}; + let Inst{63} = XT{5}; +} + //-------------------------- Instruction definitions -------------------------// // Predicate combinations available: // [IsISAFuture] // [HasVSX, IsISAFuture] +// [HasVSX, PrefixInstrs, IsISAFuture] let Predicates = [IsISAFuture] in { defm SUBFUS : XOForm_RTAB5_L1r<31, 72, (outs g8rc:$RT), @@ -294,6 +397,78 @@ let Predicates = [HasVSX, IsISAFuture] in { "xvmulhuw $XT, $XA, $XB", []>; def XVMULHUH: XX3Form_XTAB6<60, 122, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), "xvmulhuh $XT, $XA, $XB", []>; + + // Elliptic Curve Cryptography Acceleration Instructions. + def XXMULMUL + : XX3Form_XTAB6_S3<1, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB, u3imm:$S), + "xxmulmul $XT, $XA, $XB, $S", []>; + def XXMULMULHIADD + : XX3Form_XTAB6_3S1<9, (outs vsrc:$XT), + (ins vsrc:$XA, vsrc:$XB, u1imm:$S0, u1imm:$S1, + u1imm:$S2), + "xxmulmulhiadd $XT, $XA, $XB, $S0, $S1, $S2", []>; + def XXMULMULLOADD + : XX3Form_XTAB6_2S1<17, (outs vsrc:$XT), + (ins vsrc:$XA, vsrc:$XB, u1imm:$S1, u1imm:$S2), + "xxmulmulloadd $XT, $XA, $XB, $S1, $S2", []>; + def XXSSUMUDM + : XX3Form_XTAB6_P<25, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB, u1imm:$P), + "xxssumudm $XT, $XA, $XB, $P", []>; + def XXSSUMUDMC + : XX3Form_XTAB6_P<57, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB, u1imm:$P), + "xxssumudmc $XT, $XA, $XB, $P", []>; + def XSADDADDUQM + : XX3Form_XTAB6<59, 96, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xsaddadduqm $XT, $XA, $XB", []>; + def XSADDADDSUQM + : XX3Form_XTAB6<59, 104, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xsaddaddsuqm $XT, $XA, $XB", []>; + def XSADDSUBUQM + : XX3Form_XTAB6<59, 112, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xsaddsubuqm $XT, $XA, $XB", []>; + def XSADDSUBSUQM + : XX3Form_XTAB6<59, 224, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xsaddsubsuqm $XT, $XA, $XB", []>; + def XSMERGE2T1UQM + : XX3Form_XTAB6<59, 232, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xsmerge2t1uqm $XT, $XA, $XB", []>; + def XSMERGE2T2UQM + : XX3Form_XTAB6<59, 240, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xsmerge2t2uqm $XT, $XA, $XB", []>; + def XSMERGE2T3UQM + : XX3Form_XTAB6<59, 89, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xsmerge2t3uqm $XT, $XA, $XB", []>; + def XSMERGE3T1UQM + : XX3Form_XTAB6<59, 121, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xsmerge3t1uqm $XT, $XA, $XB", []>; + def XSREBASE2T1UQM + : XX3Form_XTAB6<59, 145, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xsrebase2t1uqm $XT, $XA, $XB", []>; + def XSREBASE2T2UQM + : XX3Form_XTAB6<59, 177, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xsrebase2t2uqm $XT, $XA, $XB", []>; + def XSREBASE2T3UQM + : XX3Form_XTAB6<59, 209, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xsrebase2t3uqm $XT, $XA, $XB", []>; + def XSREBASE2T4UQM + : XX3Form_XTAB6<59, 217, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xsrebase2t4uqm $XT, $XA, $XB", []>; + def XSREBASE3T1UQM + : XX3Form_XTAB6<59, 241, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xsrebase3t1uqm $XT, $XA, $XB", []>; + def XSREBASE3T2UQM + : XX3Form_XTAB6<59, 249, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xsrebase3t2uqm $XT, $XA, $XB", []>; + def XSREBASE3T3UQM + : XX3Form_XTAB6<59, 195, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xsrebase3t3uqm $XT, $XA, $XB", []>; +} + +let Predicates = [HasVSX, PrefixInstrs, IsISAFuture] in { + 
def XXSSUMUDMCEXT + : 8RR_XX4Form_XTABC6_P< + 34, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB, vsrc:$XC, u1imm:$P), + "xxssumudmcext $XT, $XA, $XB, $XC, $P", IIC_VecGeneral, []>; } //---------------------------- Anonymous Patterns ----------------------------// diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index 6d418fd..70b6c7e 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -1023,6 +1023,37 @@ static void parseCondBranch(MachineInstr &LastInst, MachineBasicBlock *&Target, Cond.push_back(LastInst.getOperand(1)); } +static unsigned getInverseXqcicmOpcode(unsigned Opcode) { + switch (Opcode) { + default: + llvm_unreachable("Unexpected Opcode"); + case RISCV::QC_MVEQ: + return RISCV::QC_MVNE; + case RISCV::QC_MVNE: + return RISCV::QC_MVEQ; + case RISCV::QC_MVLT: + return RISCV::QC_MVGE; + case RISCV::QC_MVGE: + return RISCV::QC_MVLT; + case RISCV::QC_MVLTU: + return RISCV::QC_MVGEU; + case RISCV::QC_MVGEU: + return RISCV::QC_MVLTU; + case RISCV::QC_MVEQI: + return RISCV::QC_MVNEI; + case RISCV::QC_MVNEI: + return RISCV::QC_MVEQI; + case RISCV::QC_MVLTI: + return RISCV::QC_MVGEI; + case RISCV::QC_MVGEI: + return RISCV::QC_MVLTI; + case RISCV::QC_MVLTUI: + return RISCV::QC_MVGEUI; + case RISCV::QC_MVGEUI: + return RISCV::QC_MVLTUI; + } +} + unsigned RISCVCC::getBrCond(RISCVCC::CondCode CC, unsigned SelectOpc) { switch (SelectOpc) { default: @@ -3762,6 +3793,19 @@ bool RISCVInstrInfo::findCommutedOpIndices(const MachineInstr &MI, return false; // Operands 1 and 2 are commutable, if we switch the opcode. return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1, 2); + case RISCV::QC_MVEQ: + case RISCV::QC_MVNE: + case RISCV::QC_MVLT: + case RISCV::QC_MVGE: + case RISCV::QC_MVLTU: + case RISCV::QC_MVGEU: + case RISCV::QC_MVEQI: + case RISCV::QC_MVNEI: + case RISCV::QC_MVLTI: + case RISCV::QC_MVGEI: + case RISCV::QC_MVLTUI: + case RISCV::QC_MVGEUI: + return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1, 4); case RISCV::TH_MULA: case RISCV::TH_MULAW: case RISCV::TH_MULAH: @@ -3974,6 +4018,23 @@ MachineInstr *RISCVInstrInfo::commuteInstructionImpl(MachineInstr &MI, return TargetInstrInfo::commuteInstructionImpl(WorkingMI, false, OpIdx1, OpIdx2); } + case RISCV::QC_MVEQ: + case RISCV::QC_MVNE: + case RISCV::QC_MVLT: + case RISCV::QC_MVGE: + case RISCV::QC_MVLTU: + case RISCV::QC_MVGEU: + case RISCV::QC_MVEQI: + case RISCV::QC_MVNEI: + case RISCV::QC_MVLTI: + case RISCV::QC_MVGEI: + case RISCV::QC_MVLTUI: + case RISCV::QC_MVGEUI: { + auto &WorkingMI = cloneIfNew(MI); + WorkingMI.setDesc(get(getInverseXqcicmOpcode(MI.getOpcode()))); + return TargetInstrInfo::commuteInstructionImpl(WorkingMI, false, OpIdx1, + OpIdx2); + } case RISCV::PseudoCCMOVGPRNoX0: case RISCV::PseudoCCMOVGPR: { // CCMOV can be commuted by inverting the condition. 
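The QC_MVcc changes above make these conditional moves commutable by swapping the tied old-destination value (operand 1) with rs3 (operand 4) and flipping the opcode to its inverse condition. A minimal standalone sketch of why that is sound, assuming the usual Xqcicm semantics rd = cond(rs1, rs2) ? rs3 : rd (the helper names and sample values below are illustrative, not part of the patch):

#include <cassert>

// Models qc.mveq/qc.mvne as "rd = (rs1 cond rs2) ? rs3 : rd".
static int mveq(int rd, int rs1, int rs2, int rs3) { return rs1 == rs2 ? rs3 : rd; }
static int mvne(int rd, int rs1, int rs2, int rs3) { return rs1 != rs2 ? rs3 : rd; }

int main() {
  // Swapping the old-rd and rs3 operands gives the same result only when the
  // condition is inverted, which is what getInverseXqcicmOpcode provides.
  for (int rs1 : {3, 5})
    for (int rd : {10, 20})
      for (int rs3 : {30, 40})
        assert(mveq(rd, rs1, 5, rs3) == mvne(rs3, rs1, 5, rd));
  return 0;
}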
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td index 13b02d1..ff4a040 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td @@ -604,7 +604,7 @@ class QCILICC<bits<3> funct3, bits<2> funct2, DAGOperand InTyRs2, string opcodes let Inst{31-25} = {simm, funct2}; } -let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in +let hasSideEffects = 0, mayLoad = 0, mayStore = 0, isCommutable = 1 in class QCIMVCC<bits<3> funct3, string opcodestr> : RVInstR4<0b00, funct3, OPC_CUSTOM_2, (outs GPRNoX0:$rd_wb), (ins GPRNoX0:$rd, GPRNoX0:$rs1, GPRNoX0:$rs2, GPRNoX0:$rs3), @@ -612,7 +612,7 @@ class QCIMVCC<bits<3> funct3, string opcodestr> let Constraints = "$rd = $rd_wb"; } -let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in +let hasSideEffects = 0, mayLoad = 0, mayStore = 0, isCommutable = 1 in class QCIMVCCI<bits<3> funct3, string opcodestr, DAGOperand immType> : RVInstR4<0b10, funct3, OPC_CUSTOM_2, (outs GPRNoX0:$rd_wb), (ins GPRNoX0:$rd, GPRNoX0:$rs1, immType:$imm, GPRNoX0:$rs3), diff --git a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp index c2a6e51..b765fec 100644 --- a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp @@ -81,6 +81,7 @@ public: void outputExecutionMode(const Module &M); void outputAnnotations(const Module &M); void outputModuleSections(); + void outputFPFastMathDefaultInfo(); bool isHidden() { return MF->getFunction() .getFnAttribute(SPIRV_BACKEND_SERVICE_FUN_NAME) @@ -498,11 +499,27 @@ void SPIRVAsmPrinter::outputExecutionMode(const Module &M) { NamedMDNode *Node = M.getNamedMetadata("spirv.ExecutionMode"); if (Node) { for (unsigned i = 0; i < Node->getNumOperands(); i++) { + // If SPV_KHR_float_controls2 is enabled and we find any of + // FPFastMathDefault, ContractionOff or SignedZeroInfNanPreserve execution + // modes, skip it, it'll be done somewhere else. + if (ST->canUseExtension(SPIRV::Extension::SPV_KHR_float_controls2)) { + const auto EM = + cast<ConstantInt>( + cast<ConstantAsMetadata>((Node->getOperand(i))->getOperand(1)) + ->getValue()) + ->getZExtValue(); + if (EM == SPIRV::ExecutionMode::FPFastMathDefault || + EM == SPIRV::ExecutionMode::ContractionOff || + EM == SPIRV::ExecutionMode::SignedZeroInfNanPreserve) + continue; + } + MCInst Inst; Inst.setOpcode(SPIRV::OpExecutionMode); addOpsFromMDNode(cast<MDNode>(Node->getOperand(i)), Inst, MAI); outputMCInst(Inst); } + outputFPFastMathDefaultInfo(); } for (auto FI = M.begin(), E = M.end(); FI != E; ++FI) { const Function &F = *FI; @@ -552,12 +569,84 @@ void SPIRVAsmPrinter::outputExecutionMode(const Module &M) { } if (ST->isKernel() && !M.getNamedMetadata("spirv.ExecutionMode") && !M.getNamedMetadata("opencl.enable.FP_CONTRACT")) { - MCInst Inst; - Inst.setOpcode(SPIRV::OpExecutionMode); - Inst.addOperand(MCOperand::createReg(FReg)); - unsigned EM = static_cast<unsigned>(SPIRV::ExecutionMode::ContractionOff); - Inst.addOperand(MCOperand::createImm(EM)); - outputMCInst(Inst); + if (ST->canUseExtension(SPIRV::Extension::SPV_KHR_float_controls2)) { + // When SPV_KHR_float_controls2 is enabled, ContractionOff is + // deprecated. We need to use FPFastMathDefault with the appropriate + // flags instead. Since FPFastMathDefault takes a target type, we need + // to emit it for each floating-point type that exists in the module + // to match the effect of ContractionOff. As of now, there are 3 FP + // types: fp16, fp32 and fp64. 
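+      // Illustrative output (names made up, not from a test): for a kernel %k
+      // in a module whose only FP type is 32-bit float, this path emits
+      //   OpExecutionModeId %k FPFastMathDefault %float %uint_0
+      // where %float is the 32-bit float type and %uint_0 is an i32 constant
+      // with value 0, so AllowContract (and every other fast-math flag)
+      // defaults to off, matching the effect of ContractionOff.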
+ + // We only end up here because there is no "spirv.ExecutionMode" + // metadata, so that means no FPFastMathDefault. Therefore, we only + // need to make sure AllowContract is set to 0, as the rest of flags. + // We still need to emit the OpExecutionMode instruction, otherwise + // it's up to the client API to define the flags. Therefore, we need + // to find the constant with 0 value. + + // Collect the SPIRVTypes for fp16, fp32, and fp64 and the constant of + // type int32 with 0 value to represent the FP Fast Math Mode. + std::vector<const MachineInstr *> SPIRVFloatTypes; + const MachineInstr *ConstZero = nullptr; + for (const MachineInstr *MI : + MAI->getMSInstrs(SPIRV::MB_TypeConstVars)) { + // Skip if the instruction is not OpTypeFloat or OpConstant. + unsigned OpCode = MI->getOpcode(); + if (OpCode != SPIRV::OpTypeFloat && OpCode != SPIRV::OpConstantNull) + continue; + + // Collect the SPIRV type if it's a float. + if (OpCode == SPIRV::OpTypeFloat) { + // Skip if the target type is not fp16, fp32, fp64. + const unsigned OpTypeFloatSize = MI->getOperand(1).getImm(); + if (OpTypeFloatSize != 16 && OpTypeFloatSize != 32 && + OpTypeFloatSize != 64) { + continue; + } + SPIRVFloatTypes.push_back(MI); + } else { + // Check if the constant is int32, if not skip it. + const MachineRegisterInfo &MRI = MI->getMF()->getRegInfo(); + MachineInstr *TypeMI = MRI.getVRegDef(MI->getOperand(1).getReg()); + if (!TypeMI || TypeMI->getOperand(1).getImm() != 32) + continue; + + ConstZero = MI; + } + } + + // When SPV_KHR_float_controls2 is enabled, ContractionOff is + // deprecated. We need to use FPFastMathDefault with the appropriate + // flags instead. Since FPFastMathDefault takes a target type, we need + // to emit it for each floating-point type that exists in the module + // to match the effect of ContractionOff. As of now, there are 3 FP + // types: fp16, fp32 and fp64. + for (const MachineInstr *MI : SPIRVFloatTypes) { + MCInst Inst; + Inst.setOpcode(SPIRV::OpExecutionModeId); + Inst.addOperand(MCOperand::createReg(FReg)); + unsigned EM = + static_cast<unsigned>(SPIRV::ExecutionMode::FPFastMathDefault); + Inst.addOperand(MCOperand::createImm(EM)); + const MachineFunction *MF = MI->getMF(); + MCRegister TypeReg = + MAI->getRegisterAlias(MF, MI->getOperand(0).getReg()); + Inst.addOperand(MCOperand::createReg(TypeReg)); + assert(ConstZero && "There should be a constant zero."); + MCRegister ConstReg = MAI->getRegisterAlias( + ConstZero->getMF(), ConstZero->getOperand(0).getReg()); + Inst.addOperand(MCOperand::createReg(ConstReg)); + outputMCInst(Inst); + } + } else { + MCInst Inst; + Inst.setOpcode(SPIRV::OpExecutionMode); + Inst.addOperand(MCOperand::createReg(FReg)); + unsigned EM = + static_cast<unsigned>(SPIRV::ExecutionMode::ContractionOff); + Inst.addOperand(MCOperand::createImm(EM)); + outputMCInst(Inst); + } } } } @@ -606,6 +695,101 @@ void SPIRVAsmPrinter::outputAnnotations(const Module &M) { } } +void SPIRVAsmPrinter::outputFPFastMathDefaultInfo() { + // Collect the SPIRVTypes that are OpTypeFloat and the constants of type + // int32, that might be used as FP Fast Math Mode. + std::vector<const MachineInstr *> SPIRVFloatTypes; + // Hashtable to associate immediate values with the constant holding them. + std::unordered_map<int, const MachineInstr *> ConstMap; + for (const MachineInstr *MI : MAI->getMSInstrs(SPIRV::MB_TypeConstVars)) { + // Skip if the instruction is not OpTypeFloat or OpConstant. 
+ unsigned OpCode = MI->getOpcode(); + if (OpCode != SPIRV::OpTypeFloat && OpCode != SPIRV::OpConstantI && + OpCode != SPIRV::OpConstantNull) + continue; + + // Collect the SPIRV type if it's a float. + if (OpCode == SPIRV::OpTypeFloat) { + SPIRVFloatTypes.push_back(MI); + } else { + // Check if the constant is int32, if not skip it. + const MachineRegisterInfo &MRI = MI->getMF()->getRegInfo(); + MachineInstr *TypeMI = MRI.getVRegDef(MI->getOperand(1).getReg()); + if (!TypeMI || TypeMI->getOpcode() != SPIRV::OpTypeInt || + TypeMI->getOperand(1).getImm() != 32) + continue; + + if (OpCode == SPIRV::OpConstantI) + ConstMap[MI->getOperand(2).getImm()] = MI; + else + ConstMap[0] = MI; + } + } + + for (const auto &[Func, FPFastMathDefaultInfoVec] : + MAI->FPFastMathDefaultInfoMap) { + if (FPFastMathDefaultInfoVec.empty()) + continue; + + for (const MachineInstr *MI : SPIRVFloatTypes) { + unsigned OpTypeFloatSize = MI->getOperand(1).getImm(); + unsigned Index = SPIRV::FPFastMathDefaultInfoVector:: + computeFPFastMathDefaultInfoVecIndex(OpTypeFloatSize); + assert(Index < FPFastMathDefaultInfoVec.size() && + "Index out of bounds for FPFastMathDefaultInfoVec"); + const auto &FPFastMathDefaultInfo = FPFastMathDefaultInfoVec[Index]; + assert(FPFastMathDefaultInfo.Ty && + "Expected target type for FPFastMathDefaultInfo"); + assert(FPFastMathDefaultInfo.Ty->getScalarSizeInBits() == + OpTypeFloatSize && + "Mismatched float type size"); + MCInst Inst; + Inst.setOpcode(SPIRV::OpExecutionModeId); + MCRegister FuncReg = MAI->getFuncReg(Func); + assert(FuncReg.isValid()); + Inst.addOperand(MCOperand::createReg(FuncReg)); + Inst.addOperand( + MCOperand::createImm(SPIRV::ExecutionMode::FPFastMathDefault)); + MCRegister TypeReg = + MAI->getRegisterAlias(MI->getMF(), MI->getOperand(0).getReg()); + Inst.addOperand(MCOperand::createReg(TypeReg)); + unsigned Flags = FPFastMathDefaultInfo.FastMathFlags; + if (FPFastMathDefaultInfo.ContractionOff && + (Flags & SPIRV::FPFastMathMode::AllowContract)) + report_fatal_error( + "Conflicting FPFastMathFlags: ContractionOff and AllowContract"); + + if (FPFastMathDefaultInfo.SignedZeroInfNanPreserve && + !(Flags & + (SPIRV::FPFastMathMode::NotNaN | SPIRV::FPFastMathMode::NotInf | + SPIRV::FPFastMathMode::NSZ))) { + if (FPFastMathDefaultInfo.FPFastMathDefault) + report_fatal_error("Conflicting FPFastMathFlags: " + "SignedZeroInfNanPreserve but at least one of " + "NotNaN/NotInf/NSZ is enabled."); + } + + // Don't emit if none of the execution modes was used. + if (Flags == SPIRV::FPFastMathMode::None && + !FPFastMathDefaultInfo.ContractionOff && + !FPFastMathDefaultInfo.SignedZeroInfNanPreserve && + !FPFastMathDefaultInfo.FPFastMathDefault) + continue; + + // Retrieve the constant instruction for the immediate value. + auto It = ConstMap.find(Flags); + if (It == ConstMap.end()) + report_fatal_error("Expected constant instruction for FP Fast Math " + "Mode operand of FPFastMathDefault execution mode."); + const MachineInstr *ConstMI = It->second; + MCRegister ConstReg = MAI->getRegisterAlias( + ConstMI->getMF(), ConstMI->getOperand(0).getReg()); + Inst.addOperand(MCOperand::createReg(ConstReg)); + outputMCInst(Inst); + } + } +} + void SPIRVAsmPrinter::outputModuleSections() { const Module *M = MMI->getModule(); // Get the global subtarget to output module-level info. 
@@ -614,7 +798,8 @@ void SPIRVAsmPrinter::outputModuleSections() { MAI = &SPIRVModuleAnalysis::MAI; assert(ST && TII && MAI && M && "Module analysis is required"); // Output instructions according to the Logical Layout of a Module: - // 1,2. All OpCapability instructions, then optional OpExtension instructions. + // 1,2. All OpCapability instructions, then optional OpExtension + // instructions. outputGlobalRequirements(); // 3. Optional OpExtInstImport instructions. outputOpExtInstImports(*M); @@ -622,7 +807,8 @@ void SPIRVAsmPrinter::outputModuleSections() { outputOpMemoryModel(); // 5. All entry point declarations, using OpEntryPoint. outputEntryPoints(); - // 6. Execution-mode declarations, using OpExecutionMode or OpExecutionModeId. + // 6. Execution-mode declarations, using OpExecutionMode or + // OpExecutionModeId. outputExecutionMode(*M); // 7a. Debug: all OpString, OpSourceExtension, OpSource, and // OpSourceContinued, without forward references. diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp index f704d3a..0e0c454 100644 --- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp @@ -1162,11 +1162,24 @@ static unsigned getNumSizeComponents(SPIRVType *imgType) { static bool generateExtInst(const SPIRV::IncomingCall *Call, MachineIRBuilder &MIRBuilder, - SPIRVGlobalRegistry *GR) { + SPIRVGlobalRegistry *GR, const CallBase &CB) { // Lookup the extended instruction number in the TableGen records. const SPIRV::DemangledBuiltin *Builtin = Call->Builtin; uint32_t Number = SPIRV::lookupExtendedBuiltin(Builtin->Name, Builtin->Set)->Number; + // fmin_common and fmax_common are now deprecated, and we should use fmin and + // fmax with NotInf and NotNaN flags instead. Keep original number to add + // later the NoNans and NoInfs flags. + uint32_t OrigNumber = Number; + const SPIRVSubtarget &ST = + cast<SPIRVSubtarget>(MIRBuilder.getMF().getSubtarget()); + if (ST.canUseExtension(SPIRV::Extension::SPV_KHR_float_controls2) && + (Number == SPIRV::OpenCLExtInst::fmin_common || + Number == SPIRV::OpenCLExtInst::fmax_common)) { + Number = (Number == SPIRV::OpenCLExtInst::fmin_common) + ? SPIRV::OpenCLExtInst::fmin + : SPIRV::OpenCLExtInst::fmax; + } // Build extended instruction. auto MIB = @@ -1178,6 +1191,13 @@ static bool generateExtInst(const SPIRV::IncomingCall *Call, for (auto Argument : Call->Arguments) MIB.addUse(Argument); + MIB.getInstr()->copyIRFlags(CB); + if (OrigNumber == SPIRV::OpenCLExtInst::fmin_common || + OrigNumber == SPIRV::OpenCLExtInst::fmax_common) { + // Add NoNans and NoInfs flags to fmin/fmax instruction. + MIB.getInstr()->setFlag(MachineInstr::MIFlag::FmNoNans); + MIB.getInstr()->setFlag(MachineInstr::MIFlag::FmNoInfs); + } return true; } @@ -2908,7 +2928,7 @@ std::optional<bool> lowerBuiltin(const StringRef DemangledCall, MachineIRBuilder &MIRBuilder, const Register OrigRet, const Type *OrigRetTy, const SmallVectorImpl<Register> &Args, - SPIRVGlobalRegistry *GR) { + SPIRVGlobalRegistry *GR, const CallBase &CB) { LLVM_DEBUG(dbgs() << "Lowering builtin call: " << DemangledCall << "\n"); // Lookup the builtin in the TableGen records. @@ -2931,7 +2951,7 @@ std::optional<bool> lowerBuiltin(const StringRef DemangledCall, // Match the builtin with implementation based on the grouping. 
switch (Call->Builtin->Group) { case SPIRV::Extended: - return generateExtInst(Call.get(), MIRBuilder, GR); + return generateExtInst(Call.get(), MIRBuilder, GR, CB); case SPIRV::Relational: return generateRelationalInst(Call.get(), MIRBuilder, GR); case SPIRV::Group: diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.h b/llvm/lib/Target/SPIRV/SPIRVBuiltins.h index 1a8641a..f6a5234 100644 --- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.h +++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.h @@ -39,7 +39,7 @@ std::optional<bool> lowerBuiltin(const StringRef DemangledCall, MachineIRBuilder &MIRBuilder, const Register OrigRet, const Type *OrigRetTy, const SmallVectorImpl<Register> &Args, - SPIRVGlobalRegistry *GR); + SPIRVGlobalRegistry *GR, const CallBase &CB); /// Helper function for finding a builtin function attributes /// by a demangled function name. Defined in SPIRVBuiltins.cpp. diff --git a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp index a412887..1a7c02c 100644 --- a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp @@ -641,9 +641,9 @@ bool SPIRVCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, GR->getPointerSize())); } } - if (auto Res = - SPIRV::lowerBuiltin(DemangledName, ST->getPreferredInstructionSet(), - MIRBuilder, ResVReg, OrigRetTy, ArgVRegs, GR)) + if (auto Res = SPIRV::lowerBuiltin( + DemangledName, ST->getPreferredInstructionSet(), MIRBuilder, + ResVReg, OrigRetTy, ArgVRegs, GR, *Info.CB)) return *Res; } diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp index 704edd3..9f2e075 100644 --- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp @@ -25,6 +25,7 @@ #include "llvm/IR/TypedPointerType.h" #include "llvm/Transforms/Utils/Local.h" +#include <cassert> #include <queue> #include <unordered_set> @@ -152,6 +153,7 @@ class SPIRVEmitIntrinsics void insertPtrCastOrAssignTypeInstr(Instruction *I, IRBuilder<> &B); bool shouldTryToAddMemAliasingDecoration(Instruction *Inst); void insertSpirvDecorations(Instruction *I, IRBuilder<> &B); + void insertConstantsForFPFastMathDefault(Module &M); void processGlobalValue(GlobalVariable &GV, IRBuilder<> &B); void processParamTypes(Function *F, IRBuilder<> &B); void processParamTypesByFunHeader(Function *F, IRBuilder<> &B); @@ -2249,6 +2251,198 @@ void SPIRVEmitIntrinsics::insertSpirvDecorations(Instruction *I, } } +static SPIRV::FPFastMathDefaultInfoVector &getOrCreateFPFastMathDefaultInfoVec( + const Module &M, + DenseMap<Function *, SPIRV::FPFastMathDefaultInfoVector> + &FPFastMathDefaultInfoMap, + Function *F) { + auto it = FPFastMathDefaultInfoMap.find(F); + if (it != FPFastMathDefaultInfoMap.end()) + return it->second; + + // If the map does not contain the entry, create a new one. Initialize it to + // contain all 3 elements sorted by bit width of target type: {half, float, + // double}. 
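+ // That is, index 0 holds the half entry, index 1 the float entry and
+ // index 2 the double entry, matching
+ // FPFastMathDefaultInfoVector::computeFPFastMathDefaultInfoVecIndex().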
+ SPIRV::FPFastMathDefaultInfoVector FPFastMathDefaultInfoVec; + FPFastMathDefaultInfoVec.emplace_back(Type::getHalfTy(M.getContext()), + SPIRV::FPFastMathMode::None); + FPFastMathDefaultInfoVec.emplace_back(Type::getFloatTy(M.getContext()), + SPIRV::FPFastMathMode::None); + FPFastMathDefaultInfoVec.emplace_back(Type::getDoubleTy(M.getContext()), + SPIRV::FPFastMathMode::None); + return FPFastMathDefaultInfoMap[F] = std::move(FPFastMathDefaultInfoVec); +} + +static SPIRV::FPFastMathDefaultInfo &getFPFastMathDefaultInfo( + SPIRV::FPFastMathDefaultInfoVector &FPFastMathDefaultInfoVec, + const Type *Ty) { + size_t BitWidth = Ty->getScalarSizeInBits(); + int Index = + SPIRV::FPFastMathDefaultInfoVector::computeFPFastMathDefaultInfoVecIndex( + BitWidth); + assert(Index >= 0 && Index < 3 && + "Expected FPFastMathDefaultInfo for half, float, or double"); + assert(FPFastMathDefaultInfoVec.size() == 3 && + "Expected FPFastMathDefaultInfoVec to have exactly 3 elements"); + return FPFastMathDefaultInfoVec[Index]; +} + +void SPIRVEmitIntrinsics::insertConstantsForFPFastMathDefault(Module &M) { + const SPIRVSubtarget *ST = TM->getSubtargetImpl(); + if (!ST->canUseExtension(SPIRV::Extension::SPV_KHR_float_controls2)) + return; + + // Store the FPFastMathDefaultInfo in the FPFastMathDefaultInfoMap. + // We need the entry point (function) as the key, and the target + // type and flags as the value. + // We also need to check ContractionOff and SignedZeroInfNanPreserve + // execution modes, as they are now deprecated and must be replaced + // with FPFastMathDefaultInfo. + auto Node = M.getNamedMetadata("spirv.ExecutionMode"); + if (!Node) { + if (!M.getNamedMetadata("opencl.enable.FP_CONTRACT")) { + // This requires emitting ContractionOff. However, because + // ContractionOff is now deprecated, we need to replace it with + // FPFastMathDefaultInfo with FP Fast Math Mode bitmask set to all 0. + // We need to create the constant for that. + + // Create constant instruction with the bitmask flags. + Constant *InitValue = + ConstantInt::get(Type::getInt32Ty(M.getContext()), 0); + // TODO: Reuse constant if there is one already with the required + // value. + [[maybe_unused]] GlobalVariable *GV = + new GlobalVariable(M, // Module + Type::getInt32Ty(M.getContext()), // Type + true, // isConstant + GlobalValue::InternalLinkage, // Linkage + InitValue // Initializer + ); + } + return; + } + + // The table maps function pointers to their default FP fast math info. It + // can be assumed that the SmallVector is sorted by the bit width of the + // type. The first element is the smallest bit width, and the last element + // is the largest bit width, therefore, we will have {half, float, double} + // in the order of their bit widths. 
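+ // Each "spirv.ExecutionMode" metadata operand handled below is assumed to
+ // have the form {entry point, execution mode, extra operands}; for
+ // FPFastMathDefault the extras are the target type and the flags constant,
+ // and for SignedZeroInfNanPreserve the extra is the target bit width.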
+ DenseMap<Function *, SPIRV::FPFastMathDefaultInfoVector> + FPFastMathDefaultInfoMap; + + for (unsigned i = 0; i < Node->getNumOperands(); i++) { + MDNode *MDN = cast<MDNode>(Node->getOperand(i)); + assert(MDN->getNumOperands() >= 2 && "Expected at least 2 operands"); + Function *F = cast<Function>( + cast<ConstantAsMetadata>(MDN->getOperand(0))->getValue()); + const auto EM = + cast<ConstantInt>( + cast<ConstantAsMetadata>(MDN->getOperand(1))->getValue()) + ->getZExtValue(); + if (EM == SPIRV::ExecutionMode::FPFastMathDefault) { + assert(MDN->getNumOperands() == 4 && + "Expected 4 operands for FPFastMathDefault"); + const Type *T = cast<ValueAsMetadata>(MDN->getOperand(2))->getType(); + unsigned Flags = + cast<ConstantInt>( + cast<ConstantAsMetadata>(MDN->getOperand(3))->getValue()) + ->getZExtValue(); + SPIRV::FPFastMathDefaultInfoVector &FPFastMathDefaultInfoVec = + getOrCreateFPFastMathDefaultInfoVec(M, FPFastMathDefaultInfoMap, F); + SPIRV::FPFastMathDefaultInfo &Info = + getFPFastMathDefaultInfo(FPFastMathDefaultInfoVec, T); + Info.FastMathFlags = Flags; + Info.FPFastMathDefault = true; + } else if (EM == SPIRV::ExecutionMode::ContractionOff) { + assert(MDN->getNumOperands() == 2 && + "Expected no operands for ContractionOff"); + + // We need to save this info for every possible FP type, i.e. {half, + // float, double, fp128}. + SPIRV::FPFastMathDefaultInfoVector &FPFastMathDefaultInfoVec = + getOrCreateFPFastMathDefaultInfoVec(M, FPFastMathDefaultInfoMap, F); + for (SPIRV::FPFastMathDefaultInfo &Info : FPFastMathDefaultInfoVec) { + Info.ContractionOff = true; + } + } else if (EM == SPIRV::ExecutionMode::SignedZeroInfNanPreserve) { + assert(MDN->getNumOperands() == 3 && + "Expected 1 operand for SignedZeroInfNanPreserve"); + unsigned TargetWidth = + cast<ConstantInt>( + cast<ConstantAsMetadata>(MDN->getOperand(2))->getValue()) + ->getZExtValue(); + // We need to save this info only for the FP type with TargetWidth. + SPIRV::FPFastMathDefaultInfoVector &FPFastMathDefaultInfoVec = + getOrCreateFPFastMathDefaultInfoVec(M, FPFastMathDefaultInfoMap, F); + int Index = SPIRV::FPFastMathDefaultInfoVector:: + computeFPFastMathDefaultInfoVecIndex(TargetWidth); + assert(Index >= 0 && Index < 3 && + "Expected FPFastMathDefaultInfo for half, float, or double"); + assert(FPFastMathDefaultInfoVec.size() == 3 && + "Expected FPFastMathDefaultInfoVec to have exactly 3 elements"); + FPFastMathDefaultInfoVec[Index].SignedZeroInfNanPreserve = true; + } + } + + std::unordered_map<unsigned, GlobalVariable *> GlobalVars; + for (auto &[Func, FPFastMathDefaultInfoVec] : FPFastMathDefaultInfoMap) { + if (FPFastMathDefaultInfoVec.empty()) + continue; + + for (const SPIRV::FPFastMathDefaultInfo &Info : FPFastMathDefaultInfoVec) { + assert(Info.Ty && "Expected target type for FPFastMathDefaultInfo"); + // Skip if none of the execution modes was used. + unsigned Flags = Info.FastMathFlags; + if (Flags == SPIRV::FPFastMathMode::None && !Info.ContractionOff && + !Info.SignedZeroInfNanPreserve && !Info.FPFastMathDefault) + continue; + + // Check if flags are compatible. 
+ if (Info.ContractionOff && (Flags & SPIRV::FPFastMathMode::AllowContract)) + report_fatal_error("Conflicting FPFastMathFlags: ContractionOff " + "and AllowContract"); + + if (Info.SignedZeroInfNanPreserve && + !(Flags & + (SPIRV::FPFastMathMode::NotNaN | SPIRV::FPFastMathMode::NotInf | + SPIRV::FPFastMathMode::NSZ))) { + if (Info.FPFastMathDefault) + report_fatal_error("Conflicting FPFastMathFlags: " + "SignedZeroInfNanPreserve but at least one of " + "NotNaN/NotInf/NSZ is enabled."); + } + + if ((Flags & SPIRV::FPFastMathMode::AllowTransform) && + !((Flags & SPIRV::FPFastMathMode::AllowReassoc) && + (Flags & SPIRV::FPFastMathMode::AllowContract))) { + report_fatal_error("Conflicting FPFastMathFlags: " + "AllowTransform requires AllowReassoc and " + "AllowContract to be set."); + } + + auto it = GlobalVars.find(Flags); + GlobalVariable *GV = nullptr; + if (it != GlobalVars.end()) { + // Reuse existing global variable. + GV = it->second; + } else { + // Create constant instruction with the bitmask flags. + Constant *InitValue = + ConstantInt::get(Type::getInt32Ty(M.getContext()), Flags); + // TODO: Reuse constant if there is one already with the required + // value. + GV = new GlobalVariable(M, // Module + Type::getInt32Ty(M.getContext()), // Type + true, // isConstant + GlobalValue::InternalLinkage, // Linkage + InitValue // Initializer + ); + GlobalVars[Flags] = GV; + } + } + } +} + void SPIRVEmitIntrinsics::processInstrAfterVisit(Instruction *I, IRBuilder<> &B) { auto *II = dyn_cast<IntrinsicInst>(I); @@ -2569,9 +2763,9 @@ GetElementPtrInst * SPIRVEmitIntrinsics::simplifyZeroLengthArrayGepInst(GetElementPtrInst *GEP) { // getelementptr [0 x T], P, 0 (zero), I -> getelementptr T, P, I. // If type is 0-length array and first index is 0 (zero), drop both the - // 0-length array type and the first index. This is a common pattern in the - // IR, e.g. when using a zero-length array as a placeholder for a flexible - // array such as unbound arrays. + // 0-length array type and the first index. This is a common pattern in + // the IR, e.g. when using a zero-length array as a placeholder for a + // flexible array such as unbound arrays. assert(GEP && "GEP is null"); Type *SrcTy = GEP->getSourceElementType(); SmallVector<Value *, 8> Indices(GEP->indices()); @@ -2633,8 +2827,9 @@ bool SPIRVEmitIntrinsics::runOnFunction(Function &Func) { processParamTypesByFunHeader(CurrF, B); - // StoreInst's operand type can be changed during the next transformations, - // so we need to store it in the set. Also store already transformed types. + // StoreInst's operand type can be changed during the next + // transformations, so we need to store it in the set. Also store already + // transformed types. for (auto &I : instructions(Func)) { StoreInst *SI = dyn_cast<StoreInst>(&I); if (!SI) @@ -2681,8 +2876,8 @@ bool SPIRVEmitIntrinsics::runOnFunction(Function &Func) { for (auto &I : llvm::reverse(instructions(Func))) deduceOperandElementType(&I, &IncompleteRets); - // Pass forward for PHIs only, their operands are not preceed the instruction - // in meaning of `instructions(Func)`. + // Pass forward for PHIs only, their operands are not preceed the + // instruction in meaning of `instructions(Func)`. 
for (BasicBlock &BB : Func) for (PHINode &Phi : BB.phis()) if (isPointerTy(Phi.getType())) @@ -2692,8 +2887,8 @@ bool SPIRVEmitIntrinsics::runOnFunction(Function &Func) { TrackConstants = true; if (!I->getType()->isVoidTy() || isa<StoreInst>(I)) setInsertPointAfterDef(B, I); - // Visitors return either the original/newly created instruction for further - // processing, nullptr otherwise. + // Visitors return either the original/newly created instruction for + // further processing, nullptr otherwise. I = visit(*I); if (!I) continue; @@ -2816,6 +3011,7 @@ bool SPIRVEmitIntrinsics::runOnModule(Module &M) { bool Changed = false; parseFunDeclarations(M); + insertConstantsForFPFastMathDefault(M); TodoType.clear(); for (auto &F : M) diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp index 115766c..6fd1c7e 100644 --- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp @@ -806,7 +806,7 @@ Register SPIRVGlobalRegistry::buildGlobalVariable( // arguments. MDNode *GVarMD = nullptr; if (GVar && (GVarMD = GVar->getMetadata("spirv.Decorations")) != nullptr) - buildOpSpirvDecorations(Reg, MIRBuilder, GVarMD); + buildOpSpirvDecorations(Reg, MIRBuilder, GVarMD, ST); return Reg; } diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp index 45e88fc..ba95ad8 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp @@ -132,7 +132,8 @@ bool SPIRVInstrInfo::isHeaderInstr(const MachineInstr &MI) const { } } -bool SPIRVInstrInfo::canUseFastMathFlags(const MachineInstr &MI) const { +bool SPIRVInstrInfo::canUseFastMathFlags(const MachineInstr &MI, + bool KHRFloatControls2) const { switch (MI.getOpcode()) { case SPIRV::OpFAddS: case SPIRV::OpFSubS: @@ -146,6 +147,24 @@ bool SPIRVInstrInfo::canUseFastMathFlags(const MachineInstr &MI) const { case SPIRV::OpFRemV: case SPIRV::OpFMod: return true; + case SPIRV::OpFNegateV: + case SPIRV::OpFNegate: + case SPIRV::OpOrdered: + case SPIRV::OpUnordered: + case SPIRV::OpFOrdEqual: + case SPIRV::OpFOrdNotEqual: + case SPIRV::OpFOrdLessThan: + case SPIRV::OpFOrdLessThanEqual: + case SPIRV::OpFOrdGreaterThan: + case SPIRV::OpFOrdGreaterThanEqual: + case SPIRV::OpFUnordEqual: + case SPIRV::OpFUnordNotEqual: + case SPIRV::OpFUnordLessThan: + case SPIRV::OpFUnordLessThanEqual: + case SPIRV::OpFUnordGreaterThan: + case SPIRV::OpFUnordGreaterThanEqual: + case SPIRV::OpExtInst: + return KHRFloatControls2 ? 
true : false; default: return false; } diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.h b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.h index 72d2243..4de9d6a 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.h +++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.h @@ -36,7 +36,8 @@ public: bool isTypeDeclInstr(const MachineInstr &MI) const; bool isDecorationInstr(const MachineInstr &MI) const; bool isAliasingInstr(const MachineInstr &MI) const; - bool canUseFastMathFlags(const MachineInstr &MI) const; + bool canUseFastMathFlags(const MachineInstr &MI, + bool KHRFloatControls2) const; bool canUseNSW(const MachineInstr &MI) const; bool canUseNUW(const MachineInstr &MI) const; diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index 1aadd9d..273edf3 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -1073,7 +1073,8 @@ bool SPIRVInstructionSelector::selectExtInst(Register ResVReg, .addDef(ResVReg) .addUse(GR.getSPIRVTypeID(ResType)) .addImm(static_cast<uint32_t>(Set)) - .addImm(Opcode); + .addImm(Opcode) + .setMIFlags(I.getFlags()); const unsigned NumOps = I.getNumOperands(); unsigned Index = 1; if (Index < NumOps && @@ -2629,6 +2630,7 @@ bool SPIRVInstructionSelector::selectCmp(Register ResVReg, .addUse(GR.getSPIRVTypeID(ResType)) .addUse(Cmp0) .addUse(Cmp1) + .setMIFlags(I.getFlags()) .constrainAllUses(TII, TRI, RBI); } diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp index bc159d5..dc717a6 100644 --- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp @@ -248,6 +248,22 @@ static InstrSignature instrToSignature(const MachineInstr &MI, Register DefReg; InstrSignature Signature{MI.getOpcode()}; for (unsigned i = 0; i < MI.getNumOperands(); ++i) { + // The only decorations that can be applied more than once to a given <id> + // or structure member are UserSemantic(5635), CacheControlLoadINTEL (6442), + // and CacheControlStoreINTEL (6443). For all the rest of decorations, we + // will only add to the signature the Opcode, the id to which it applies, + // and the decoration id, disregarding any decoration flags. This will + // ensure that any subsequent decoration with the same id will be deemed as + // a duplicate. Then, at the call site, we will be able to handle duplicates + // in the best way. 
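+ // For example, two OpDecorate instructions that apply FPFastMathMode to
+ // the same <id> with different flag masks now produce identical
+ // signatures, so the second one is recognized as a duplicate and its
+ // flags can be merged in collectOtherInstr().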
+ unsigned Opcode = MI.getOpcode(); + if ((Opcode == SPIRV::OpDecorate) && i >= 2) { + unsigned DecorationID = MI.getOperand(1).getImm(); + if (DecorationID != SPIRV::Decoration::UserSemantic && + DecorationID != SPIRV::Decoration::CacheControlLoadINTEL && + DecorationID != SPIRV::Decoration::CacheControlStoreINTEL) + continue; + } const MachineOperand &MO = MI.getOperand(i); size_t h; if (MO.isReg()) { @@ -559,8 +575,54 @@ static void collectOtherInstr(MachineInstr &MI, SPIRV::ModuleAnalysisInfo &MAI, MAI.setSkipEmission(&MI); InstrSignature MISign = instrToSignature(MI, MAI, true); auto FoundMI = IS.insert(std::move(MISign)); - if (!FoundMI.second) + if (!FoundMI.second) { + if (MI.getOpcode() == SPIRV::OpDecorate) { + assert(MI.getNumOperands() >= 2 && + "Decoration instructions must have at least 2 operands"); + assert(MSType == SPIRV::MB_Annotations && + "Only OpDecorate instructions can be duplicates"); + // For FPFastMathMode decoration, we need to merge the flags of the + // duplicate decoration with the original one, so we need to find the + // original instruction that has the same signature. For the rest of + // instructions, we will simply skip the duplicate. + if (MI.getOperand(1).getImm() != SPIRV::Decoration::FPFastMathMode) + return; // Skip duplicates of other decorations. + + const SPIRV::InstrList &Decorations = MAI.MS[MSType]; + for (const MachineInstr *OrigMI : Decorations) { + if (instrToSignature(*OrigMI, MAI, true) == MISign) { + assert(OrigMI->getNumOperands() == MI.getNumOperands() && + "Original instruction must have the same number of operands"); + assert( + OrigMI->getNumOperands() == 3 && + "FPFastMathMode decoration must have 3 operands for OpDecorate"); + unsigned OrigFlags = OrigMI->getOperand(2).getImm(); + unsigned NewFlags = MI.getOperand(2).getImm(); + if (OrigFlags == NewFlags) + return; // No need to merge, the flags are the same. + + // Emit warning about possible conflict between flags. + unsigned FinalFlags = OrigFlags | NewFlags; + llvm::errs() + << "Warning: Conflicting FPFastMathMode decoration flags " + "in instruction: " + << *OrigMI << "Original flags: " << OrigFlags + << ", new flags: " << NewFlags + << ". They will be merged on a best effort basis, but not " + "validated. Final flags: " + << FinalFlags << "\n"; + MachineInstr *OrigMINonConst = const_cast<MachineInstr *>(OrigMI); + MachineOperand &OrigFlagsOp = OrigMINonConst->getOperand(2); + OrigFlagsOp = + MachineOperand::CreateImm(static_cast<unsigned>(FinalFlags)); + return; // Merge done, so we found a duplicate; don't add it to MAI.MS + } + } + assert(false && "No original instruction found for the duplicate " + "OpDecorate, but we found one in IS."); + } return; // insert failed, so we found a duplicate; don't add it to MAI.MS + } // No duplicates, so add it. if (Append) MAI.MS[MSType].push_back(&MI); @@ -934,6 +996,11 @@ static void addOpDecorateReqs(const MachineInstr &MI, unsigned DecIndex, } else if (Dec == SPIRV::Decoration::FPMaxErrorDecorationINTEL) { Reqs.addRequirements(SPIRV::Capability::FPMaxErrorINTEL); Reqs.addExtension(SPIRV::Extension::SPV_INTEL_fp_max_error); + } else if (Dec == SPIRV::Decoration::FPFastMathMode) { + if (ST.canUseExtension(SPIRV::Extension::SPV_KHR_float_controls2)) { + Reqs.addRequirements(SPIRV::Capability::FloatControls2); + Reqs.addExtension(SPIRV::Extension::SPV_KHR_float_controls2); + } } } @@ -1994,10 +2061,13 @@ static void collectReqs(const Module &M, SPIRV::ModuleAnalysisInfo &MAI, // Collect requirements for OpExecutionMode instructions. 
auto Node = M.getNamedMetadata("spirv.ExecutionMode"); if (Node) { - bool RequireFloatControls = false, RequireFloatControls2 = false, + bool RequireFloatControls = false, RequireIntelFloatControls2 = false, + RequireKHRFloatControls2 = false, VerLower14 = !ST.isAtLeastSPIRVVer(VersionTuple(1, 4)); - bool HasFloatControls2 = + bool HasIntelFloatControls2 = ST.canUseExtension(SPIRV::Extension::SPV_INTEL_float_controls2); + bool HasKHRFloatControls2 = + ST.canUseExtension(SPIRV::Extension::SPV_KHR_float_controls2); for (unsigned i = 0; i < Node->getNumOperands(); i++) { MDNode *MDN = cast<MDNode>(Node->getOperand(i)); const MDOperand &MDOp = MDN->getOperand(1); @@ -2010,7 +2080,6 @@ static void collectReqs(const Module &M, SPIRV::ModuleAnalysisInfo &MAI, switch (EM) { case SPIRV::ExecutionMode::DenormPreserve: case SPIRV::ExecutionMode::DenormFlushToZero: - case SPIRV::ExecutionMode::SignedZeroInfNanPreserve: case SPIRV::ExecutionMode::RoundingModeRTE: case SPIRV::ExecutionMode::RoundingModeRTZ: RequireFloatControls = VerLower14; @@ -2021,8 +2090,28 @@ static void collectReqs(const Module &M, SPIRV::ModuleAnalysisInfo &MAI, case SPIRV::ExecutionMode::RoundingModeRTNINTEL: case SPIRV::ExecutionMode::FloatingPointModeALTINTEL: case SPIRV::ExecutionMode::FloatingPointModeIEEEINTEL: - if (HasFloatControls2) { - RequireFloatControls2 = true; + if (HasIntelFloatControls2) { + RequireIntelFloatControls2 = true; + MAI.Reqs.getAndAddRequirements( + SPIRV::OperandCategory::ExecutionModeOperand, EM, ST); + } + break; + case SPIRV::ExecutionMode::FPFastMathDefault: { + if (HasKHRFloatControls2) { + RequireKHRFloatControls2 = true; + MAI.Reqs.getAndAddRequirements( + SPIRV::OperandCategory::ExecutionModeOperand, EM, ST); + } + break; + } + case SPIRV::ExecutionMode::ContractionOff: + case SPIRV::ExecutionMode::SignedZeroInfNanPreserve: + if (HasKHRFloatControls2) { + RequireKHRFloatControls2 = true; + MAI.Reqs.getAndAddRequirements( + SPIRV::OperandCategory::ExecutionModeOperand, + SPIRV::ExecutionMode::FPFastMathDefault, ST); + } else { MAI.Reqs.getAndAddRequirements( SPIRV::OperandCategory::ExecutionModeOperand, EM, ST); } @@ -2037,8 +2126,10 @@ static void collectReqs(const Module &M, SPIRV::ModuleAnalysisInfo &MAI, if (RequireFloatControls && ST.canUseExtension(SPIRV::Extension::SPV_KHR_float_controls)) MAI.Reqs.addExtension(SPIRV::Extension::SPV_KHR_float_controls); - if (RequireFloatControls2) + if (RequireIntelFloatControls2) MAI.Reqs.addExtension(SPIRV::Extension::SPV_INTEL_float_controls2); + if (RequireKHRFloatControls2) + MAI.Reqs.addExtension(SPIRV::Extension::SPV_KHR_float_controls2); } for (auto FI = M.begin(), E = M.end(); FI != E; ++FI) { const Function &F = *FI; @@ -2078,8 +2169,11 @@ static void collectReqs(const Module &M, SPIRV::ModuleAnalysisInfo &MAI, } } -static unsigned getFastMathFlags(const MachineInstr &I) { +static unsigned getFastMathFlags(const MachineInstr &I, + const SPIRVSubtarget &ST) { unsigned Flags = SPIRV::FPFastMathMode::None; + bool CanUseKHRFloatControls2 = + ST.canUseExtension(SPIRV::Extension::SPV_KHR_float_controls2); if (I.getFlag(MachineInstr::MIFlag::FmNoNans)) Flags |= SPIRV::FPFastMathMode::NotNaN; if (I.getFlag(MachineInstr::MIFlag::FmNoInfs)) @@ -2088,12 +2182,45 @@ static unsigned getFastMathFlags(const MachineInstr &I) { Flags |= SPIRV::FPFastMathMode::NSZ; if (I.getFlag(MachineInstr::MIFlag::FmArcp)) Flags |= SPIRV::FPFastMathMode::AllowRecip; - if (I.getFlag(MachineInstr::MIFlag::FmReassoc)) - Flags |= SPIRV::FPFastMathMode::Fast; + if 
(I.getFlag(MachineInstr::MIFlag::FmContract) && CanUseKHRFloatControls2) + Flags |= SPIRV::FPFastMathMode::AllowContract; + if (I.getFlag(MachineInstr::MIFlag::FmReassoc)) { + if (CanUseKHRFloatControls2) + // LLVM reassoc maps to SPIRV transform, see + // https://github.com/KhronosGroup/SPIRV-Registry/issues/326 for details. + // Because we are enabling AllowTransform, we must enable AllowReassoc and + // AllowContract too, as required by SPIRV spec. Also, we used to map + // MIFlag::FmReassoc to FPFastMathMode::Fast, which now should instead by + // replaced by turning all the other bits instead. Therefore, we're + // enabling every bit here except None and Fast. + Flags |= SPIRV::FPFastMathMode::NotNaN | SPIRV::FPFastMathMode::NotInf | + SPIRV::FPFastMathMode::NSZ | SPIRV::FPFastMathMode::AllowRecip | + SPIRV::FPFastMathMode::AllowTransform | + SPIRV::FPFastMathMode::AllowReassoc | + SPIRV::FPFastMathMode::AllowContract; + else + Flags |= SPIRV::FPFastMathMode::Fast; + } + + if (CanUseKHRFloatControls2) { + // Error out if SPIRV::FPFastMathMode::Fast is enabled. + assert(!(Flags & SPIRV::FPFastMathMode::Fast) && + "SPIRV::FPFastMathMode::Fast is deprecated and should not be used " + "anymore."); + + // Error out if AllowTransform is enabled without AllowReassoc and + // AllowContract. + assert((!(Flags & SPIRV::FPFastMathMode::AllowTransform) || + ((Flags & SPIRV::FPFastMathMode::AllowReassoc && + Flags & SPIRV::FPFastMathMode::AllowContract))) && + "SPIRV::FPFastMathMode::AllowTransform requires AllowReassoc and " + "AllowContract flags to be enabled as well."); + } + return Flags; } -static bool isFastMathMathModeAvailable(const SPIRVSubtarget &ST) { +static bool isFastMathModeAvailable(const SPIRVSubtarget &ST) { if (ST.isKernel()) return true; if (ST.getSPIRVVersion() < VersionTuple(1, 2)) @@ -2101,9 +2228,10 @@ static bool isFastMathMathModeAvailable(const SPIRVSubtarget &ST) { return ST.canUseExtension(SPIRV::Extension::SPV_KHR_float_controls2); } -static void handleMIFlagDecoration(MachineInstr &I, const SPIRVSubtarget &ST, - const SPIRVInstrInfo &TII, - SPIRV::RequirementHandler &Reqs) { +static void handleMIFlagDecoration( + MachineInstr &I, const SPIRVSubtarget &ST, const SPIRVInstrInfo &TII, + SPIRV::RequirementHandler &Reqs, const SPIRVGlobalRegistry *GR, + SPIRV::FPFastMathDefaultInfoVector &FPFastMathDefaultInfoVec) { if (I.getFlag(MachineInstr::MIFlag::NoSWrap) && TII.canUseNSW(I) && getSymbolicOperandRequirements(SPIRV::OperandCategory::DecorationOperand, SPIRV::Decoration::NoSignedWrap, ST, Reqs) @@ -2119,13 +2247,53 @@ static void handleMIFlagDecoration(MachineInstr &I, const SPIRVSubtarget &ST, buildOpDecorate(I.getOperand(0).getReg(), I, TII, SPIRV::Decoration::NoUnsignedWrap, {}); } - if (!TII.canUseFastMathFlags(I)) - return; - unsigned FMFlags = getFastMathFlags(I); - if (FMFlags == SPIRV::FPFastMathMode::None) + if (!TII.canUseFastMathFlags( + I, ST.canUseExtension(SPIRV::Extension::SPV_KHR_float_controls2))) return; - if (isFastMathMathModeAvailable(ST)) { + unsigned FMFlags = getFastMathFlags(I, ST); + if (FMFlags == SPIRV::FPFastMathMode::None) { + // We also need to check if any FPFastMathDefault info was set for the + // types used in this instruction. + if (FPFastMathDefaultInfoVec.empty()) + return; + + // There are three types of instructions that can use fast math flags: + // 1. Arithmetic instructions (FAdd, FMul, FSub, FDiv, FRem, etc.) + // 2. Relational instructions (FCmp, FOrd, FUnord, etc.) + // 3. 
Extended instructions (ExtInst) + // For arithmetic instructions, the floating point type can be in the + // result type or in the operands, but they all must be the same. + // For the relational and logical instructions, the floating point type + // can only be in the operands 1 and 2, not the result type. Also, the + // operands must have the same type. For the extended instructions, the + // floating point type can be in the result type or in the operands. It's + // unclear if the operands and the result type must be the same. Let's + // assume they must be. Therefore, for 1. and 2., we can check the first + // operand type, and for 3. we can check the result type. + assert(I.getNumOperands() >= 3 && "Expected at least 3 operands"); + Register ResReg = I.getOpcode() == SPIRV::OpExtInst + ? I.getOperand(1).getReg() + : I.getOperand(2).getReg(); + SPIRVType *ResType = GR->getSPIRVTypeForVReg(ResReg, I.getMF()); + const Type *Ty = GR->getTypeForSPIRVType(ResType); + Ty = Ty->isVectorTy() ? cast<VectorType>(Ty)->getElementType() : Ty; + + // Match instruction type with the FPFastMathDefaultInfoVec. + bool Emit = false; + for (SPIRV::FPFastMathDefaultInfo &Elem : FPFastMathDefaultInfoVec) { + if (Ty == Elem.Ty) { + FMFlags = Elem.FastMathFlags; + Emit = Elem.ContractionOff || Elem.SignedZeroInfNanPreserve || + Elem.FPFastMathDefault; + break; + } + } + + if (FMFlags == SPIRV::FPFastMathMode::None && !Emit) + return; + } + if (isFastMathModeAvailable(ST)) { Register DstReg = I.getOperand(0).getReg(); buildOpDecorate(DstReg, I, TII, SPIRV::Decoration::FPFastMathMode, {FMFlags}); @@ -2135,14 +2303,17 @@ static void handleMIFlagDecoration(MachineInstr &I, const SPIRVSubtarget &ST, // Walk all functions and add decorations related to MI flags. static void addDecorations(const Module &M, const SPIRVInstrInfo &TII, MachineModuleInfo *MMI, const SPIRVSubtarget &ST, - SPIRV::ModuleAnalysisInfo &MAI) { + SPIRV::ModuleAnalysisInfo &MAI, + const SPIRVGlobalRegistry *GR) { for (auto F = M.begin(), E = M.end(); F != E; ++F) { MachineFunction *MF = MMI->getMachineFunction(*F); if (!MF) continue; + for (auto &MBB : *MF) for (auto &MI : MBB) - handleMIFlagDecoration(MI, ST, TII, MAI.Reqs); + handleMIFlagDecoration(MI, ST, TII, MAI.Reqs, GR, + MAI.FPFastMathDefaultInfoMap[&(*F)]); } } @@ -2188,6 +2359,111 @@ static void patchPhis(const Module &M, SPIRVGlobalRegistry *GR, } } +static SPIRV::FPFastMathDefaultInfoVector &getOrCreateFPFastMathDefaultInfoVec( + const Module &M, SPIRV::ModuleAnalysisInfo &MAI, const Function *F) { + auto it = MAI.FPFastMathDefaultInfoMap.find(F); + if (it != MAI.FPFastMathDefaultInfoMap.end()) + return it->second; + + // If the map does not contain the entry, create a new one. Initialize it to + // contain all 3 elements sorted by bit width of target type: {half, float, + // double}. 
+ SPIRV::FPFastMathDefaultInfoVector FPFastMathDefaultInfoVec; + FPFastMathDefaultInfoVec.emplace_back(Type::getHalfTy(M.getContext()), + SPIRV::FPFastMathMode::None); + FPFastMathDefaultInfoVec.emplace_back(Type::getFloatTy(M.getContext()), + SPIRV::FPFastMathMode::None); + FPFastMathDefaultInfoVec.emplace_back(Type::getDoubleTy(M.getContext()), + SPIRV::FPFastMathMode::None); + return MAI.FPFastMathDefaultInfoMap[F] = std::move(FPFastMathDefaultInfoVec); +} + +static SPIRV::FPFastMathDefaultInfo &getFPFastMathDefaultInfo( + SPIRV::FPFastMathDefaultInfoVector &FPFastMathDefaultInfoVec, + const Type *Ty) { + size_t BitWidth = Ty->getScalarSizeInBits(); + int Index = + SPIRV::FPFastMathDefaultInfoVector::computeFPFastMathDefaultInfoVecIndex( + BitWidth); + assert(Index >= 0 && Index < 3 && + "Expected FPFastMathDefaultInfo for half, float, or double"); + assert(FPFastMathDefaultInfoVec.size() == 3 && + "Expected FPFastMathDefaultInfoVec to have exactly 3 elements"); + return FPFastMathDefaultInfoVec[Index]; +} + +static void collectFPFastMathDefaults(const Module &M, + SPIRV::ModuleAnalysisInfo &MAI, + const SPIRVSubtarget &ST) { + if (!ST.canUseExtension(SPIRV::Extension::SPV_KHR_float_controls2)) + return; + + // Store the FPFastMathDefaultInfo in the FPFastMathDefaultInfoMap. + // We need the entry point (function) as the key, and the target + // type and flags as the value. + // We also need to check ContractionOff and SignedZeroInfNanPreserve + // execution modes, as they are now deprecated and must be replaced + // with FPFastMathDefaultInfo. + auto Node = M.getNamedMetadata("spirv.ExecutionMode"); + if (!Node) + return; + + for (unsigned i = 0; i < Node->getNumOperands(); i++) { + MDNode *MDN = cast<MDNode>(Node->getOperand(i)); + assert(MDN->getNumOperands() >= 2 && "Expected at least 2 operands"); + const Function *F = cast<Function>( + cast<ConstantAsMetadata>(MDN->getOperand(0))->getValue()); + const auto EM = + cast<ConstantInt>( + cast<ConstantAsMetadata>(MDN->getOperand(1))->getValue()) + ->getZExtValue(); + if (EM == SPIRV::ExecutionMode::FPFastMathDefault) { + assert(MDN->getNumOperands() == 4 && + "Expected 4 operands for FPFastMathDefault"); + + const Type *T = cast<ValueAsMetadata>(MDN->getOperand(2))->getType(); + unsigned Flags = + cast<ConstantInt>( + cast<ConstantAsMetadata>(MDN->getOperand(3))->getValue()) + ->getZExtValue(); + SPIRV::FPFastMathDefaultInfoVector &FPFastMathDefaultInfoVec = + getOrCreateFPFastMathDefaultInfoVec(M, MAI, F); + SPIRV::FPFastMathDefaultInfo &Info = + getFPFastMathDefaultInfo(FPFastMathDefaultInfoVec, T); + Info.FastMathFlags = Flags; + Info.FPFastMathDefault = true; + } else if (EM == SPIRV::ExecutionMode::ContractionOff) { + assert(MDN->getNumOperands() == 2 && + "Expected no operands for ContractionOff"); + + // We need to save this info for every possible FP type, i.e. {half, + // float, double, fp128}. + SPIRV::FPFastMathDefaultInfoVector &FPFastMathDefaultInfoVec = + getOrCreateFPFastMathDefaultInfoVec(M, MAI, F); + for (SPIRV::FPFastMathDefaultInfo &Info : FPFastMathDefaultInfoVec) { + Info.ContractionOff = true; + } + } else if (EM == SPIRV::ExecutionMode::SignedZeroInfNanPreserve) { + assert(MDN->getNumOperands() == 3 && + "Expected 1 operand for SignedZeroInfNanPreserve"); + unsigned TargetWidth = + cast<ConstantInt>( + cast<ConstantAsMetadata>(MDN->getOperand(2))->getValue()) + ->getZExtValue(); + // We need to save this info only for the FP type with TargetWidth. 
+ SPIRV::FPFastMathDefaultInfoVector &FPFastMathDefaultInfoVec = + getOrCreateFPFastMathDefaultInfoVec(M, MAI, F); + int Index = SPIRV::FPFastMathDefaultInfoVector:: + computeFPFastMathDefaultInfoVecIndex(TargetWidth); + assert(Index >= 0 && Index < 3 && + "Expected FPFastMathDefaultInfo for half, float, or double"); + assert(FPFastMathDefaultInfoVec.size() == 3 && + "Expected FPFastMathDefaultInfoVec to have exactly 3 elements"); + FPFastMathDefaultInfoVec[Index].SignedZeroInfNanPreserve = true; + } + } +} + struct SPIRV::ModuleAnalysisInfo SPIRVModuleAnalysis::MAI; void SPIRVModuleAnalysis::getAnalysisUsage(AnalysisUsage &AU) const { @@ -2209,7 +2485,8 @@ bool SPIRVModuleAnalysis::runOnModule(Module &M) { patchPhis(M, GR, *TII, MMI); addMBBNames(M, *TII, MMI, *ST, MAI); - addDecorations(M, *TII, MMI, *ST, MAI); + collectFPFastMathDefaults(M, MAI, *ST); + addDecorations(M, *TII, MMI, *ST, MAI, GR); collectReqs(M, MAI, MMI, *ST); diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h index 41c792a..d8376cd 100644 --- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h +++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h @@ -159,6 +159,13 @@ struct ModuleAnalysisInfo { InstrList MS[NUM_MODULE_SECTIONS]; // The table maps MBB number to SPIR-V unique ID register. DenseMap<std::pair<const MachineFunction *, int>, MCRegister> BBNumToRegMap; + // The table maps function pointers to their default FP fast math info. It can + // be assumed that the SmallVector is sorted by the bit width of the type. The + // first element is the smallest bit width, and the last element is the + // largest bit width, therefore, we will have {half, float, double} in + // the order of their bit widths. + DenseMap<const Function *, SPIRV::FPFastMathDefaultInfoVector> + FPFastMathDefaultInfoMap; MCRegister getFuncReg(const Function *F) { assert(F && "Function is null"); diff --git a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp index 1a08c6a..db6f2d6 100644 --- a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp @@ -839,6 +839,7 @@ static uint32_t convertFloatToSPIRVWord(float F) { static void insertSpirvDecorations(MachineFunction &MF, SPIRVGlobalRegistry *GR, MachineIRBuilder MIB) { + const SPIRVSubtarget &ST = cast<SPIRVSubtarget>(MIB.getMF().getSubtarget()); SmallVector<MachineInstr *, 10> ToErase; for (MachineBasicBlock &MBB : MF) { for (MachineInstr &MI : MBB) { @@ -849,7 +850,7 @@ static void insertSpirvDecorations(MachineFunction &MF, SPIRVGlobalRegistry *GR, MIB.setInsertPt(*MI.getParent(), MI.getNextNode()); if (isSpvIntrinsic(MI, Intrinsic::spv_assign_decoration)) { buildOpSpirvDecorations(MI.getOperand(1).getReg(), MIB, - MI.getOperand(2).getMetadata()); + MI.getOperand(2).getMetadata(), ST); } else if (isSpvIntrinsic(MI, Intrinsic::spv_assign_fpmaxerror_decoration)) { ConstantFP *OpV = mdconst::dyn_extract<ConstantFP>( diff --git a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td index 66ce5a2..6a32dba 100644 --- a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td +++ b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td @@ -802,6 +802,7 @@ defm RoundingModeRTPINTEL : ExecutionModeOperand<5620, [RoundToInfinityINTEL]>; defm RoundingModeRTNINTEL : ExecutionModeOperand<5621, [RoundToInfinityINTEL]>; defm FloatingPointModeALTINTEL : ExecutionModeOperand<5622, [FloatingPointModeINTEL]>; defm FloatingPointModeIEEEINTEL : ExecutionModeOperand<5623, 
[FloatingPointModeINTEL]>; +defm FPFastMathDefault : ExecutionModeOperand<6028, [FloatControls2]>; //===----------------------------------------------------------------------===// // Multiclass used to define StorageClass enum values and at the same time @@ -1153,6 +1154,9 @@ defm NotInf : FPFastMathModeOperand<0x2, [Kernel]>; defm NSZ : FPFastMathModeOperand<0x4, [Kernel]>; defm AllowRecip : FPFastMathModeOperand<0x8, [Kernel]>; defm Fast : FPFastMathModeOperand<0x10, [Kernel]>; +defm AllowContract : FPFastMathModeOperand<0x10000, [FloatControls2]>; +defm AllowReassoc : FPFastMathModeOperand<0x20000, [FloatControls2]>; +defm AllowTransform : FPFastMathModeOperand<0x40000, [FloatControls2]>; //===----------------------------------------------------------------------===// // Multiclass used to define FPRoundingMode enum values and at the same time diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp index 820e56b..327c011 100644 --- a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp @@ -181,7 +181,7 @@ void buildOpMemberDecorate(Register Reg, MachineInstr &I, } void buildOpSpirvDecorations(Register Reg, MachineIRBuilder &MIRBuilder, - const MDNode *GVarMD) { + const MDNode *GVarMD, const SPIRVSubtarget &ST) { for (unsigned I = 0, E = GVarMD->getNumOperands(); I != E; ++I) { auto *OpMD = dyn_cast<MDNode>(GVarMD->getOperand(I)); if (!OpMD) @@ -193,6 +193,20 @@ void buildOpSpirvDecorations(Register Reg, MachineIRBuilder &MIRBuilder, if (!DecorationId) report_fatal_error("Expect SPIR-V <Decoration> operand to be the first " "element of the decoration"); + + // The goal of `spirv.Decorations` metadata is to provide a way to + // represent SPIR-V entities that do not map to LLVM in an obvious way. + // FP flags do have obvious matches between LLVM IR and SPIR-V. + // Additionally, we have no guarantee at this point that the flags passed + // through the decoration are not violated already in the optimizer passes. + // Therefore, we simply ignore FP flags, including NoContraction, and + // FPFastMathMode. + if (DecorationId->getZExtValue() == + static_cast<uint32_t>(SPIRV::Decoration::NoContraction) || + DecorationId->getZExtValue() == + static_cast<uint32_t>(SPIRV::Decoration::FPFastMathMode)) { + continue; // Ignored. + } auto MIB = MIRBuilder.buildInstr(SPIRV::OpDecorate) .addUse(Reg) .addImm(static_cast<uint32_t>(DecorationId->getZExtValue())); diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.h b/llvm/lib/Target/SPIRV/SPIRVUtils.h index 45c520a..409a0fd 100644 --- a/llvm/lib/Target/SPIRV/SPIRVUtils.h +++ b/llvm/lib/Target/SPIRV/SPIRVUtils.h @@ -113,6 +113,54 @@ public: std::function<bool(BasicBlock *)> Op); }; +namespace SPIRV { +struct FPFastMathDefaultInfo { + const Type *Ty = nullptr; + unsigned FastMathFlags = 0; + // When SPV_KHR_float_controls2 ContractionOff and SignzeroInfNanPreserve are + // deprecated, and we replace them with FPFastMathDefault appropriate flags + // instead. However, we have no guarantee about the order in which we will + // process execution modes. Therefore it could happen that we first process + // ContractionOff, setting AllowContraction bit to 0, and then we process + // FPFastMathDefault enabling AllowContraction bit, effectively invalidating + // ContractionOff. Because of that, it's best to keep separate bits for the + // different execution modes, and we will try and combine them later when we + // emit OpExecutionMode instructions. 
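+ // The three bits below are therefore kept separate here and only
+ // reconciled when the execution modes are emitted (see
+ // SPIRVAsmPrinter::outputFPFastMathDefaultInfo()).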
+ bool ContractionOff = false; + bool SignedZeroInfNanPreserve = false; + bool FPFastMathDefault = false; + + FPFastMathDefaultInfo() = default; + FPFastMathDefaultInfo(const Type *Ty, unsigned FastMathFlags) + : Ty(Ty), FastMathFlags(FastMathFlags) {} + bool operator==(const FPFastMathDefaultInfo &Other) const { + return Ty == Other.Ty && FastMathFlags == Other.FastMathFlags && + ContractionOff == Other.ContractionOff && + SignedZeroInfNanPreserve == Other.SignedZeroInfNanPreserve && + FPFastMathDefault == Other.FPFastMathDefault; + } +}; + +struct FPFastMathDefaultInfoVector + : public SmallVector<SPIRV::FPFastMathDefaultInfo, 3> { + static size_t computeFPFastMathDefaultInfoVecIndex(size_t BitWidth) { + switch (BitWidth) { + case 16: // half + return 0; + case 32: // float + return 1; + case 64: // double + return 2; + default: + report_fatal_error("Expected BitWidth to be 16, 32, 64", false); + } + llvm_unreachable( + "Unreachable code in computeFPFastMathDefaultInfoVecIndex"); + } +}; + +} // namespace SPIRV + // Add the given string as a series of integer operand, inserting null // terminators and padding to make sure the operands all have 32-bit // little-endian words. @@ -161,7 +209,7 @@ void buildOpMemberDecorate(Register Reg, MachineInstr &I, // Add an OpDecorate instruction by "spirv.Decorations" metadata node. void buildOpSpirvDecorations(Register Reg, MachineIRBuilder &MIRBuilder, - const MDNode *GVarMD); + const MDNode *GVarMD, const SPIRVSubtarget &ST); // Return a valid position for the OpVariable instruction inside a function, // i.e., at the beginning of the first block of the function. @@ -508,6 +556,5 @@ unsigned getArrayComponentCount(const MachineRegisterInfo *MRI, const MachineInstr *ResType); MachineBasicBlock::iterator getFirstValidInstructionInsertPoint(MachineBasicBlock &BB); - } // namespace llvm #endif // LLVM_LIB_TARGET_SPIRV_SPIRVUTILS_H diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 64b9dc3..163bf9b 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -186,7 +186,6 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering( // SIMD-specific configuration if (Subtarget->hasSIMD128()) { - // Combine partial.reduce.add before legalization gets confused. setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); // Combine wide-vector muls, with extend inputs, to extmul_half. @@ -317,6 +316,12 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering( setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, T, Custom); setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, T, Custom); } + + // Partial MLA reductions. 
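+ // Marking these combinations legal lets llvm.vector.partial.reduce.add of
+ // v16i8/v8i16 inputs into a v4i32 accumulator be selected through the
+ // tablegen patterns in WebAssemblyInstrSIMD.td (dot, extmul and
+ // extadd_pairwise).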
+ for (auto Op : {ISD::PARTIAL_REDUCE_SMLA, ISD::PARTIAL_REDUCE_UMLA}) { + setPartialReduceMLAAction(Op, MVT::v4i32, MVT::v16i8, Legal); + setPartialReduceMLAAction(Op, MVT::v4i32, MVT::v8i16, Legal); + } } // As a special case, these operators use the type to mean the type to @@ -416,41 +421,6 @@ MVT WebAssemblyTargetLowering::getPointerMemTy(const DataLayout &DL, return TargetLowering::getPointerMemTy(DL, AS); } -bool WebAssemblyTargetLowering::shouldExpandPartialReductionIntrinsic( - const IntrinsicInst *I) const { - if (I->getIntrinsicID() != Intrinsic::vector_partial_reduce_add) - return true; - - EVT VT = EVT::getEVT(I->getType()); - if (VT.getSizeInBits() > 128) - return true; - - auto Op1 = I->getOperand(1); - - if (auto *InputInst = dyn_cast<Instruction>(Op1)) { - unsigned Opcode = InstructionOpcodeToISD(InputInst->getOpcode()); - if (Opcode == ISD::MUL) { - if (isa<Instruction>(InputInst->getOperand(0)) && - isa<Instruction>(InputInst->getOperand(1))) { - // dot only supports signed inputs but also support lowering unsigned. - if (cast<Instruction>(InputInst->getOperand(0))->getOpcode() != - cast<Instruction>(InputInst->getOperand(1))->getOpcode()) - return true; - - EVT Op1VT = EVT::getEVT(Op1->getType()); - if (Op1VT.getVectorElementType() == VT.getVectorElementType() && - ((VT.getVectorElementCount() * 2 == - Op1VT.getVectorElementCount()) || - (VT.getVectorElementCount() * 4 == Op1VT.getVectorElementCount()))) - return false; - } - } else if (ISD::isExtOpcode(Opcode)) { - return false; - } - } - return true; -} - TargetLowering::AtomicExpansionKind WebAssemblyTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { // We have wasm instructions for these @@ -2113,106 +2083,6 @@ SDValue WebAssemblyTargetLowering::LowerVASTART(SDValue Op, MachinePointerInfo(SV)); } -// Try to lower partial.reduce.add to a dot or fallback to a sequence with -// extmul and adds. -SDValue performLowerPartialReduction(SDNode *N, SelectionDAG &DAG) { - assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN); - if (N->getConstantOperandVal(0) != Intrinsic::vector_partial_reduce_add) - return SDValue(); - - assert(N->getValueType(0) == MVT::v4i32 && "can only support v4i32"); - SDLoc DL(N); - - SDValue Input = N->getOperand(2); - if (Input->getOpcode() == ISD::MUL) { - SDValue ExtendLHS = Input->getOperand(0); - SDValue ExtendRHS = Input->getOperand(1); - assert((ISD::isExtOpcode(ExtendLHS.getOpcode()) && - ISD::isExtOpcode(ExtendRHS.getOpcode())) && - "expected widening mul or add"); - assert(ExtendLHS.getOpcode() == ExtendRHS.getOpcode() && - "expected binop to use the same extend for both operands"); - - SDValue ExtendInLHS = ExtendLHS->getOperand(0); - SDValue ExtendInRHS = ExtendRHS->getOperand(0); - bool IsSigned = ExtendLHS->getOpcode() == ISD::SIGN_EXTEND; - unsigned LowOpc = - IsSigned ? WebAssemblyISD::EXTEND_LOW_S : WebAssemblyISD::EXTEND_LOW_U; - unsigned HighOpc = IsSigned ? 
WebAssemblyISD::EXTEND_HIGH_S - : WebAssemblyISD::EXTEND_HIGH_U; - SDValue LowLHS; - SDValue LowRHS; - SDValue HighLHS; - SDValue HighRHS; - - auto AssignInputs = [&](MVT VT) { - LowLHS = DAG.getNode(LowOpc, DL, VT, ExtendInLHS); - LowRHS = DAG.getNode(LowOpc, DL, VT, ExtendInRHS); - HighLHS = DAG.getNode(HighOpc, DL, VT, ExtendInLHS); - HighRHS = DAG.getNode(HighOpc, DL, VT, ExtendInRHS); - }; - - if (ExtendInLHS->getValueType(0) == MVT::v8i16) { - if (IsSigned) { - // i32x4.dot_i16x8_s - SDValue Dot = DAG.getNode(WebAssemblyISD::DOT, DL, MVT::v4i32, - ExtendInLHS, ExtendInRHS); - return DAG.getNode(ISD::ADD, DL, MVT::v4i32, N->getOperand(1), Dot); - } - - // (add (add (extmul_low_sx lhs, rhs), (extmul_high_sx lhs, rhs))) - MVT VT = MVT::v4i32; - AssignInputs(VT); - SDValue MulLow = DAG.getNode(ISD::MUL, DL, VT, LowLHS, LowRHS); - SDValue MulHigh = DAG.getNode(ISD::MUL, DL, VT, HighLHS, HighRHS); - SDValue Add = DAG.getNode(ISD::ADD, DL, VT, MulLow, MulHigh); - return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(1), Add); - } else { - assert(ExtendInLHS->getValueType(0) == MVT::v16i8 && - "expected v16i8 input types"); - AssignInputs(MVT::v8i16); - // Lower to a wider tree, using twice the operations compared to above. - if (IsSigned) { - // Use two dots - SDValue DotLHS = - DAG.getNode(WebAssemblyISD::DOT, DL, MVT::v4i32, LowLHS, LowRHS); - SDValue DotRHS = - DAG.getNode(WebAssemblyISD::DOT, DL, MVT::v4i32, HighLHS, HighRHS); - SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::v4i32, DotLHS, DotRHS); - return DAG.getNode(ISD::ADD, DL, MVT::v4i32, N->getOperand(1), Add); - } - - SDValue MulLow = DAG.getNode(ISD::MUL, DL, MVT::v8i16, LowLHS, LowRHS); - SDValue MulHigh = DAG.getNode(ISD::MUL, DL, MVT::v8i16, HighLHS, HighRHS); - - SDValue AddLow = DAG.getNode(WebAssemblyISD::EXT_ADD_PAIRWISE_U, DL, - MVT::v4i32, MulLow); - SDValue AddHigh = DAG.getNode(WebAssemblyISD::EXT_ADD_PAIRWISE_U, DL, - MVT::v4i32, MulHigh); - SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::v4i32, AddLow, AddHigh); - return DAG.getNode(ISD::ADD, DL, MVT::v4i32, N->getOperand(1), Add); - } - } else { - // Accumulate the input using extadd_pairwise. - assert(ISD::isExtOpcode(Input.getOpcode()) && "expected extend"); - bool IsSigned = Input->getOpcode() == ISD::SIGN_EXTEND; - unsigned PairwiseOpc = IsSigned ? 
WebAssemblyISD::EXT_ADD_PAIRWISE_S - : WebAssemblyISD::EXT_ADD_PAIRWISE_U; - SDValue ExtendIn = Input->getOperand(0); - if (ExtendIn->getValueType(0) == MVT::v8i16) { - SDValue Add = DAG.getNode(PairwiseOpc, DL, MVT::v4i32, ExtendIn); - return DAG.getNode(ISD::ADD, DL, MVT::v4i32, N->getOperand(1), Add); - } - - assert(ExtendIn->getValueType(0) == MVT::v16i8 && - "expected v16i8 input types"); - SDValue Add = - DAG.getNode(PairwiseOpc, DL, MVT::v4i32, - DAG.getNode(PairwiseOpc, DL, MVT::v8i16, ExtendIn)); - return DAG.getNode(ISD::ADD, DL, MVT::v4i32, N->getOperand(1), Add); - } -} - SDValue WebAssemblyTargetLowering::LowerIntrinsic(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); @@ -3683,11 +3553,8 @@ WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N, return performVectorTruncZeroCombine(N, DCI); case ISD::TRUNCATE: return performTruncateCombine(N, DCI); - case ISD::INTRINSIC_WO_CHAIN: { - if (auto AnyAllCombine = performAnyAllCombine(N, DCI.DAG)) - return AnyAllCombine; - return performLowerPartialReduction(N, DCI.DAG); - } + case ISD::INTRINSIC_WO_CHAIN: + return performAnyAllCombine(N, DCI.DAG); case ISD::MUL: return performMulCombine(N, DCI); } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h index 72401a7..b33a853 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h @@ -45,8 +45,6 @@ private: /// right decision when generating code for different targets. const WebAssemblySubtarget *Subtarget; - bool - shouldExpandPartialReductionIntrinsic(const IntrinsicInst *I) const override; AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override; bool shouldScalarizeBinop(SDValue VecOp) const override; FastISel *createFastISel(FunctionLoweringInfo &FuncInfo, @@ -89,8 +87,7 @@ private: bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, - LLVMContext &Context, - const Type *RetTy) const override; + LLVMContext &Context, const Type *RetTy) const override; SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, const SmallVectorImpl<SDValue> &OutVals, const SDLoc &dl, diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td index d8948ad..1306026 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td @@ -1505,6 +1505,51 @@ defm Q15MULR_SAT_S : SIMDBinary<I16x8, int_wasm_q15mulr_sat_signed, "q15mulr_sat_s", 0x82>; //===----------------------------------------------------------------------===// +// Partial reductions, using: dot, extmul and extadd_pairwise +//===----------------------------------------------------------------------===// +// MLA: v8i16 -> v4i32 +def : Pat<(v4i32 (partial_reduce_smla (v4i32 V128:$acc), (v8i16 V128:$lhs), + (v8i16 V128:$rhs))), + (ADD_I32x4 (DOT $lhs, $rhs), $acc)>; +def : Pat<(v4i32 (partial_reduce_umla (v4i32 V128:$acc), (v8i16 V128:$lhs), + (v8i16 V128:$rhs))), + (ADD_I32x4 (ADD_I32x4 (EXTMUL_LOW_U_I32x4 $lhs, $rhs), + (EXTMUL_HIGH_U_I32x4 $lhs, $rhs)), + $acc)>; +// MLA: v16i8 -> v4i32 +def : Pat<(v4i32 (partial_reduce_smla (v4i32 V128:$acc), (v16i8 V128:$lhs), + (v16i8 V128:$rhs))), + (ADD_I32x4 (ADD_I32x4 (DOT (extend_low_s_I16x8 $lhs), + (extend_low_s_I16x8 $rhs)), + (DOT 
(extend_high_s_I16x8 $lhs), + (extend_high_s_I16x8 $rhs))), + $acc)>; +def : Pat<(v4i32 (partial_reduce_umla (v4i32 V128:$acc), (v16i8 V128:$lhs), + (v16i8 V128:$rhs))), + (ADD_I32x4 (ADD_I32x4 (extadd_pairwise_u_I32x4 (EXTMUL_LOW_U_I16x8 $lhs, $rhs)), + (extadd_pairwise_u_I32x4 (EXTMUL_HIGH_U_I16x8 $lhs, $rhs))), + $acc)>; + +// Accumulate: v8i16 -> v4i32 +def : Pat<(v4i32 (partial_reduce_smla (v4i32 V128:$acc), (v8i16 V128:$in), + (I16x8.splat (i32 1)))), + (ADD_I32x4 (extadd_pairwise_s_I32x4 $in), $acc)>; + +def : Pat<(v4i32 (partial_reduce_umla (v4i32 V128:$acc), (v8i16 V128:$in), + (I16x8.splat (i32 1)))), + (ADD_I32x4 (extadd_pairwise_u_I32x4 $in), $acc)>; + +// Accumulate: v16i8 -> v4i32 +def : Pat<(v4i32 (partial_reduce_smla (v4i32 V128:$acc), (v16i8 V128:$in), + (I8x16.splat (i32 1)))), + (ADD_I32x4 (extadd_pairwise_s_I32x4 (extadd_pairwise_s_I16x8 $in)), + $acc)>; +def : Pat<(v4i32 (partial_reduce_umla (v4i32 V128:$acc), (v16i8 V128:$in), + (I8x16.splat (i32 1)))), + (ADD_I32x4 (extadd_pairwise_u_I32x4 (extadd_pairwise_u_I16x8 $in)), + $acc)>; + +//===----------------------------------------------------------------------===// // Relaxed swizzle //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp index ee1fec0..805bdb4 100644 --- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp +++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp @@ -1350,6 +1350,10 @@ static bool foldMemChr(CallInst *Call, DomTreeUpdater *DTU, BB->getTerminator()->eraseFromParent(); SwitchInst *SI = IRB.CreateSwitch( IRB.CreateTrunc(Call->getArgOperand(1), ByteTy), BBNext, N); + // We can't know the precise weights here, as they would depend on the value + // distribution of Call->getArgOperand(1). So we just mark it as "unknown". 
+ setExplicitlyUnknownBranchWeightsIfProfiled(*SI, *Call->getFunction(), + DEBUG_TYPE); Type *IndexTy = DL.getIndexType(Call->getType()); SmallVector<DominatorTree::UpdateType, 8> Updates; diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index b988957..cf076b9a 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -5810,10 +5810,22 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { case Intrinsic::x86_avx512_vpdpbusds_512: case Intrinsic::x86_avx2_vpdpbssd_128: case Intrinsic::x86_avx2_vpdpbssd_256: + case Intrinsic::x86_avx10_vpdpbssd_512: case Intrinsic::x86_avx2_vpdpbssds_128: case Intrinsic::x86_avx2_vpdpbssds_256: - case Intrinsic::x86_avx10_vpdpbssd_512: case Intrinsic::x86_avx10_vpdpbssds_512: + case Intrinsic::x86_avx2_vpdpbsud_128: + case Intrinsic::x86_avx2_vpdpbsud_256: + case Intrinsic::x86_avx10_vpdpbsud_512: + case Intrinsic::x86_avx2_vpdpbsuds_128: + case Intrinsic::x86_avx2_vpdpbsuds_256: + case Intrinsic::x86_avx10_vpdpbsuds_512: + case Intrinsic::x86_avx2_vpdpbuud_128: + case Intrinsic::x86_avx2_vpdpbuud_256: + case Intrinsic::x86_avx10_vpdpbuud_512: + case Intrinsic::x86_avx2_vpdpbuuds_128: + case Intrinsic::x86_avx2_vpdpbuuds_256: + case Intrinsic::x86_avx10_vpdpbuuds_512: handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/4, /*EltSize=*/8); break; diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index ab5c9c9..5a08e4d 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1762,9 +1762,10 @@ public: GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT, LoopInfo *LI, TargetTransformInfo *TTI, const DataLayout &DL, TTI::TargetCostKind CostKind) - : DT(DT), LI(LI), TTI(TTI), SCEVExp(*PSE.getSE(), DL, "scev.check"), - MemCheckExp(*PSE.getSE(), DL, "scev.check"), PSE(PSE), - CostKind(CostKind) {} + : DT(DT), LI(LI), TTI(TTI), + SCEVExp(*PSE.getSE(), DL, "scev.check", /*PreserveLCSSA=*/false), + MemCheckExp(*PSE.getSE(), DL, "scev.check", /*PreserveLCSSA=*/false), + PSE(PSE), CostKind(CostKind) {} /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can /// accurately estimate the cost of the runtime checks. The blocks are @@ -7486,12 +7487,13 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands, VPSingleDefRecipe *VectorPtr; if (Reverse) { // When folding the tail, we may compute an address that we don't in the - // original scalar loop and it may not be inbounds. Drop Inbounds in that - // case. + // original scalar loop: drop the GEP no-wrap flags in this case. + // Otherwise preserve existing flags without no-unsigned-wrap, as we will + // emit negative indices. GEPNoWrapFlags Flags = - (CM.foldTailByMasking() || !GEP || !GEP->isInBounds()) + CM.foldTailByMasking() || !GEP ? 
GEPNoWrapFlags::none() - : GEPNoWrapFlags::inBounds(); + : GEP->getNoWrapFlags().withoutNoUnsignedWrap(); VectorPtr = new VPVectorEndPointerRecipe(Ptr, &Plan.getVF(), getLoadStoreType(I), /*Stride*/ -1, Flags, I->getDebugLoc()); diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index c547662..f77d587 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -2105,6 +2105,7 @@ public: UserIgnoreList = nullptr; PostponedGathers.clear(); ValueToGatherNodes.clear(); + TreeEntryToStridedPtrInfoMap.clear(); } unsigned getTreeSize() const { return VectorizableTree.size(); } @@ -8948,6 +8949,8 @@ BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const { void BoUpSLP::buildTree(ArrayRef<Value *> Roots, const SmallDenseSet<Value *> &UserIgnoreLst) { deleteTree(); + assert(TreeEntryToStridedPtrInfoMap.empty() && + "TreeEntryToStridedPtrInfoMap is not cleared"); UserIgnoreList = &UserIgnoreLst; if (!allSameType(Roots)) return; @@ -8956,6 +8959,8 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots, void BoUpSLP::buildTree(ArrayRef<Value *> Roots) { deleteTree(); + assert(TreeEntryToStridedPtrInfoMap.empty() && + "TreeEntryToStridedPtrInfoMap is not cleared"); if (!allSameType(Roots)) return; buildTreeRec(Roots, 0, EdgeInfo()); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 0822511..10d704d 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -2997,6 +2997,10 @@ class VPExpressionRecipe : public VPSingleDefRecipe { /// vector operands, performing a reduction.add on the result, and adding /// the scalar result to a chain. MulAccReduction, + /// Represent an inloop multiply-accumulate reduction, multiplying the + /// extended vector operands, negating the multiplication, performing a + /// reduction.add on the result, and adding the scalar result to a chain. + ExtNegatedMulAccReduction, }; /// Type of the expression. 
@@ -3020,6 +3024,19 @@ public: VPWidenRecipe *Mul, VPReductionRecipe *Red) : VPExpressionRecipe(ExpressionTypes::ExtMulAccReduction, {Ext0, Ext1, Mul, Red}) {} + VPExpressionRecipe(VPWidenCastRecipe *Ext0, VPWidenCastRecipe *Ext1, + VPWidenRecipe *Mul, VPWidenRecipe *Sub, + VPReductionRecipe *Red) + : VPExpressionRecipe(ExpressionTypes::ExtNegatedMulAccReduction, + {Ext0, Ext1, Mul, Sub, Red}) { + assert(Mul->getOpcode() == Instruction::Mul && "Expected a mul"); + assert(Red->getRecurrenceKind() == RecurKind::Add && + "Expected an add reduction"); + assert(getNumOperands() >= 3 && "Expected at least three operands"); + [[maybe_unused]] auto *SubConst = dyn_cast<ConstantInt>(getOperand(2)->getLiveInIRValue()); + assert(SubConst && SubConst->getValue() == 0 && + Sub->getOpcode() == Instruction::Sub && "Expected a negating sub"); + } ~VPExpressionRecipe() override { for (auto *R : reverse(ExpressionRecipes)) diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index b5e30cb..ee03729 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -2839,12 +2839,17 @@ InstructionCost VPExpressionRecipe::computeCost(ElementCount VF, return Ctx.TTI.getMulAccReductionCost(false, Opcode, RedTy, SrcVecTy, Ctx.CostKind); - case ExpressionTypes::ExtMulAccReduction: + case ExpressionTypes::ExtNegatedMulAccReduction: + assert(Opcode == Instruction::Add && "Unexpected opcode"); + Opcode = Instruction::Sub; + LLVM_FALLTHROUGH; + case ExpressionTypes::ExtMulAccReduction: { return Ctx.TTI.getMulAccReductionCost( cast<VPWidenCastRecipe>(ExpressionRecipes.front())->getOpcode() == Instruction::ZExt, Opcode, RedTy, SrcVecTy, Ctx.CostKind); } + } llvm_unreachable("Unknown VPExpressionRecipe::ExpressionTypes enum"); } @@ -2890,6 +2895,30 @@ void VPExpressionRecipe::print(raw_ostream &O, const Twine &Indent, O << ")"; break; } + case ExpressionTypes::ExtNegatedMulAccReduction: { + getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker); + O << " + reduce." + << Instruction::getOpcodeName( + RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind())) + << " (sub (0, mul"; + auto *Mul = cast<VPWidenRecipe>(ExpressionRecipes[2]); + Mul->printFlags(O); + O << "("; + getOperand(0)->printAsOperand(O, SlotTracker); + auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]); + O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to " + << *Ext0->getResultType() << "), ("; + getOperand(1)->printAsOperand(O, SlotTracker); + auto *Ext1 = cast<VPWidenCastRecipe>(ExpressionRecipes[1]); + O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to " + << *Ext1->getResultType() << ")"; + if (Red->isConditional()) { + O << ", "; + Red->getCondOp()->printAsOperand(O, SlotTracker); + } + O << "))"; + break; + } case ExpressionTypes::MulAccReduction: case ExpressionTypes::ExtMulAccReduction: { getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 5252e1f..969dce4 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -3543,7 +3543,15 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, }; VPValue *VecOp = Red->getVecOp(); + VPRecipeBase *Sub = nullptr; VPValue *A, *B; + VPValue *Tmp = nullptr; + // Sub reductions could have a sub between the add reduction and vec op. 
+ if (match(VecOp, + m_Binary<Instruction::Sub>(m_SpecificInt(0), m_VPValue(Tmp)))) { + Sub = VecOp->getDefiningRecipe(); + VecOp = Tmp; + } // Try to match reduce.add(mul(...)). if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) { auto *RecipeA = @@ -3560,12 +3568,21 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, IsMulAccValidAndClampRange(RecipeA->getOpcode() == Instruction::CastOps::ZExt, Mul, RecipeA, RecipeB, nullptr)) { + if (Sub) + return new VPExpressionRecipe(RecipeA, RecipeB, Mul, + cast<VPWidenRecipe>(Sub), Red); return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red); } // Match reduce.add(mul). - if (IsMulAccValidAndClampRange(true, Mul, nullptr, nullptr, nullptr)) + // TODO: Add an expression type for this variant with a negated mul + if (!Sub && + IsMulAccValidAndClampRange(true, Mul, nullptr, nullptr, nullptr)) return new VPExpressionRecipe(Mul, Red); } + // TODO: Add an expression type for negated versions of other expression + // variants. + if (Sub) + return nullptr; // Match reduce.add(ext(mul(ext(A), ext(B)))). // All extend recipes must have same opcode or A == B // which can be transform to reduce.add(zext(mul(sext(A), sext(B)))). diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 32704bd..d6eb00d 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -1031,6 +1031,16 @@ bool VectorCombine::foldBitOpOfCastConstant(Instruction &I) { // Create the cast operation directly to ensure we get a new instruction Instruction *NewCast = CastInst::Create(CastOpcode, NewOp, I.getType()); + // Preserve cast instruction flags + if (RHSFlags.NNeg) + NewCast->setNonNeg(); + if (RHSFlags.NUW) + NewCast->setHasNoUnsignedWrap(); + if (RHSFlags.NSW) + NewCast->setHasNoSignedWrap(); + + NewCast->andIRFlags(LHSCast); + // Insert the new instruction Value *Result = Builder.Insert(NewCast); diff --git a/llvm/test/CodeGen/AArch64/get-active-lane-mask-extract.ll b/llvm/test/CodeGen/AArch64/get-active-lane-mask-extract.ll index 5e01612..b89f551 100644 --- a/llvm/test/CodeGen/AArch64/get-active-lane-mask-extract.ll +++ b/llvm/test/CodeGen/AArch64/get-active-lane-mask-extract.ll @@ -310,6 +310,187 @@ define void @test_2x32bit_mask_with_32bit_index_and_trip_count(i32 %i, i32 %n) # ret void } +; Extra use of the get_active_lane_mask from an extractelement, which is replaced with ptest_first. 
+ +define void @test_2x8bit_mask_with_extracts_and_ptest(i64 %i, i64 %n) { +; CHECK-SVE-LABEL: test_2x8bit_mask_with_extracts_and_ptest: +; CHECK-SVE: // %bb.0: // %entry +; CHECK-SVE-NEXT: whilelo p1.b, x0, x1 +; CHECK-SVE-NEXT: b.pl .LBB11_2 +; CHECK-SVE-NEXT: // %bb.1: // %if.then +; CHECK-SVE-NEXT: punpklo p0.h, p1.b +; CHECK-SVE-NEXT: punpkhi p1.h, p1.b +; CHECK-SVE-NEXT: b use +; CHECK-SVE-NEXT: .LBB11_2: // %if.end +; CHECK-SVE-NEXT: ret +; +; CHECK-SVE2p1-SME2-LABEL: test_2x8bit_mask_with_extracts_and_ptest: +; CHECK-SVE2p1-SME2: // %bb.0: // %entry +; CHECK-SVE2p1-SME2-NEXT: whilelo { p0.h, p1.h }, x0, x1 +; CHECK-SVE2p1-SME2-NEXT: ptrue p2.b +; CHECK-SVE2p1-SME2-NEXT: uzp1 p3.b, p0.b, p1.b +; CHECK-SVE2p1-SME2-NEXT: ptest p2, p3.b +; CHECK-SVE2p1-SME2-NEXT: b.pl .LBB11_2 +; CHECK-SVE2p1-SME2-NEXT: // %bb.1: // %if.then +; CHECK-SVE2p1-SME2-NEXT: b use +; CHECK-SVE2p1-SME2-NEXT: .LBB11_2: // %if.end +; CHECK-SVE2p1-SME2-NEXT: ret +entry: + %r = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i64 %i, i64 %n) + %v0 = call <vscale x 8 x i1> @llvm.vector.extract.nxv8i1.nxv16i1.i64(<vscale x 16 x i1> %r, i64 0) + %v1 = call <vscale x 8 x i1> @llvm.vector.extract.nxv8i1.nxv16i1.i64(<vscale x 16 x i1> %r, i64 8) + %elt0 = extractelement <vscale x 16 x i1> %r, i32 0 + br i1 %elt0, label %if.then, label %if.end + +if.then: + tail call void @use(<vscale x 8 x i1> %v0, <vscale x 8 x i1> %v1) + br label %if.end + +if.end: + ret void +} + +; Extra use of the get_active_lane_mask from an extractelement, which is +; replaced with ptest_first and reinterpret_casts because the extract is not nxv16i1. + +define void @test_2x8bit_mask_with_extracts_and_reinterpret_casts(i64 %i, i64 %n) { +; CHECK-SVE-LABEL: test_2x8bit_mask_with_extracts_and_reinterpret_casts: +; CHECK-SVE: // %bb.0: // %entry +; CHECK-SVE-NEXT: whilelo p1.h, x0, x1 +; CHECK-SVE-NEXT: b.pl .LBB12_2 +; CHECK-SVE-NEXT: // %bb.1: // %if.then +; CHECK-SVE-NEXT: punpklo p0.h, p1.b +; CHECK-SVE-NEXT: punpkhi p1.h, p1.b +; CHECK-SVE-NEXT: b use +; CHECK-SVE-NEXT: .LBB12_2: // %if.end +; CHECK-SVE-NEXT: ret +; +; CHECK-SVE2p1-SME2-LABEL: test_2x8bit_mask_with_extracts_and_reinterpret_casts: +; CHECK-SVE2p1-SME2: // %bb.0: // %entry +; CHECK-SVE2p1-SME2-NEXT: whilelo { p0.s, p1.s }, x0, x1 +; CHECK-SVE2p1-SME2-NEXT: ptrue p2.h +; CHECK-SVE2p1-SME2-NEXT: uzp1 p3.h, p0.h, p1.h +; CHECK-SVE2p1-SME2-NEXT: ptest p2, p3.b +; CHECK-SVE2p1-SME2-NEXT: b.pl .LBB12_2 +; CHECK-SVE2p1-SME2-NEXT: // %bb.1: // %if.then +; CHECK-SVE2p1-SME2-NEXT: b use +; CHECK-SVE2p1-SME2-NEXT: .LBB12_2: // %if.end +; CHECK-SVE2p1-SME2-NEXT: ret +entry: + %r = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 %i, i64 %n) + %v0 = tail call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv8i1(<vscale x 8 x i1> %r, i64 0) + %v1 = tail call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv8i1(<vscale x 8 x i1> %r, i64 4) + %elt0 = extractelement <vscale x 8 x i1> %r, i64 0 + br i1 %elt0, label %if.then, label %if.end + +if.then: + tail call void @use(<vscale x 4 x i1> %v0, <vscale x 4 x i1> %v1) + br label %if.end + +if.end: + ret void +} + +define void @test_4x4bit_mask_with_extracts_and_ptest(i64 %i, i64 %n) { +; CHECK-SVE-LABEL: test_4x4bit_mask_with_extracts_and_ptest: +; CHECK-SVE: // %bb.0: // %entry +; CHECK-SVE-NEXT: whilelo p0.b, x0, x1 +; CHECK-SVE-NEXT: b.pl .LBB13_2 +; CHECK-SVE-NEXT: // %bb.1: // %if.then +; CHECK-SVE-NEXT: punpklo p1.h, p0.b +; CHECK-SVE-NEXT: punpkhi p3.h, p0.b +; CHECK-SVE-NEXT: punpklo p0.h, p1.b +; 
CHECK-SVE-NEXT: punpkhi p1.h, p1.b +; CHECK-SVE-NEXT: punpklo p2.h, p3.b +; CHECK-SVE-NEXT: punpkhi p3.h, p3.b +; CHECK-SVE-NEXT: b use +; CHECK-SVE-NEXT: .LBB13_2: // %if.end +; CHECK-SVE-NEXT: ret +; +; CHECK-SVE2p1-SME2-LABEL: test_4x4bit_mask_with_extracts_and_ptest: +; CHECK-SVE2p1-SME2: // %bb.0: // %entry +; CHECK-SVE2p1-SME2-NEXT: cnth x8 +; CHECK-SVE2p1-SME2-NEXT: adds x8, x0, x8 +; CHECK-SVE2p1-SME2-NEXT: csinv x8, x8, xzr, lo +; CHECK-SVE2p1-SME2-NEXT: whilelo { p0.s, p1.s }, x0, x1 +; CHECK-SVE2p1-SME2-NEXT: whilelo { p2.s, p3.s }, x8, x1 +; CHECK-SVE2p1-SME2-NEXT: uzp1 p4.h, p0.h, p1.h +; CHECK-SVE2p1-SME2-NEXT: uzp1 p5.h, p2.h, p3.h +; CHECK-SVE2p1-SME2-NEXT: uzp1 p4.b, p4.b, p5.b +; CHECK-SVE2p1-SME2-NEXT: ptrue p5.b +; CHECK-SVE2p1-SME2-NEXT: ptest p5, p4.b +; CHECK-SVE2p1-SME2-NEXT: b.pl .LBB13_2 +; CHECK-SVE2p1-SME2-NEXT: // %bb.1: // %if.then +; CHECK-SVE2p1-SME2-NEXT: b use +; CHECK-SVE2p1-SME2-NEXT: .LBB13_2: // %if.end +; CHECK-SVE2p1-SME2-NEXT: ret +entry: + %r = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i64 %i, i64 %n) + %v0 = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1.i64(<vscale x 16 x i1> %r, i64 0) + %v1 = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1.i64(<vscale x 16 x i1> %r, i64 4) + %v2 = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1.i64(<vscale x 16 x i1> %r, i64 8) + %v3 = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1.i64(<vscale x 16 x i1> %r, i64 12) + %elt0 = extractelement <vscale x 16 x i1> %r, i32 0 + br i1 %elt0, label %if.then, label %if.end + +if.then: + tail call void @use(<vscale x 4 x i1> %v0, <vscale x 4 x i1> %v1, <vscale x 4 x i1> %v2, <vscale x 4 x i1> %v3) + br label %if.end + +if.end: + ret void +} + +define void @test_4x2bit_mask_with_extracts_and_reinterpret_casts(i64 %i, i64 %n) { +; CHECK-SVE-LABEL: test_4x2bit_mask_with_extracts_and_reinterpret_casts: +; CHECK-SVE: // %bb.0: // %entry +; CHECK-SVE-NEXT: whilelo p0.h, x0, x1 +; CHECK-SVE-NEXT: b.pl .LBB14_2 +; CHECK-SVE-NEXT: // %bb.1: // %if.then +; CHECK-SVE-NEXT: punpklo p1.h, p0.b +; CHECK-SVE-NEXT: punpkhi p3.h, p0.b +; CHECK-SVE-NEXT: punpklo p0.h, p1.b +; CHECK-SVE-NEXT: punpkhi p1.h, p1.b +; CHECK-SVE-NEXT: punpklo p2.h, p3.b +; CHECK-SVE-NEXT: punpkhi p3.h, p3.b +; CHECK-SVE-NEXT: b use +; CHECK-SVE-NEXT: .LBB14_2: // %if.end +; CHECK-SVE-NEXT: ret +; +; CHECK-SVE2p1-SME2-LABEL: test_4x2bit_mask_with_extracts_and_reinterpret_casts: +; CHECK-SVE2p1-SME2: // %bb.0: // %entry +; CHECK-SVE2p1-SME2-NEXT: cntw x8 +; CHECK-SVE2p1-SME2-NEXT: adds x8, x0, x8 +; CHECK-SVE2p1-SME2-NEXT: csinv x8, x8, xzr, lo +; CHECK-SVE2p1-SME2-NEXT: whilelo { p0.d, p1.d }, x0, x1 +; CHECK-SVE2p1-SME2-NEXT: whilelo { p2.d, p3.d }, x8, x1 +; CHECK-SVE2p1-SME2-NEXT: uzp1 p4.s, p0.s, p1.s +; CHECK-SVE2p1-SME2-NEXT: uzp1 p5.s, p2.s, p3.s +; CHECK-SVE2p1-SME2-NEXT: uzp1 p4.h, p4.h, p5.h +; CHECK-SVE2p1-SME2-NEXT: ptrue p5.h +; CHECK-SVE2p1-SME2-NEXT: ptest p5, p4.b +; CHECK-SVE2p1-SME2-NEXT: b.pl .LBB14_2 +; CHECK-SVE2p1-SME2-NEXT: // %bb.1: // %if.then +; CHECK-SVE2p1-SME2-NEXT: b use +; CHECK-SVE2p1-SME2-NEXT: .LBB14_2: // %if.end +; CHECK-SVE2p1-SME2-NEXT: ret +entry: + %r = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i32(i64 %i, i64 %n) + %v0 = call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv8i1.i64(<vscale x 8 x i1> %r, i64 0) + %v1 = call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv8i1.i64(<vscale x 8 x i1> %r, i64 2) + %v2 = call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv8i1.i64(<vscale x 8 x 
i1> %r, i64 4) + %v3 = call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv8i1.i64(<vscale x 8 x i1> %r, i64 6) + %elt0 = extractelement <vscale x 8 x i1> %r, i32 0 + br i1 %elt0, label %if.then, label %if.end + +if.then: + tail call void @use(<vscale x 2 x i1> %v0, <vscale x 2 x i1> %v1, <vscale x 2 x i1> %v2, <vscale x 2 x i1> %v3) + br label %if.end + +if.end: + ret void +} + declare void @use(...) attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AArch64/isinf.ll b/llvm/test/CodeGen/AArch64/isinf.ll index e68539b..e8bbaf9 100644 --- a/llvm/test/CodeGen/AArch64/isinf.ll +++ b/llvm/test/CodeGen/AArch64/isinf.ll @@ -27,9 +27,8 @@ define i32 @replace_isinf_call_f32(float %x) { ; CHECK-LABEL: replace_isinf_call_f32: ; CHECK: // %bb.0: ; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: mov w8, #2139095040 // =0x7f800000 -; CHECK-NEXT: and w9, w9, #0x7fffffff -; CHECK-NEXT: cmp w9, w8 +; CHECK-NEXT: mov w8, #-16777216 // =0xff000000 +; CHECK-NEXT: cmp w8, w9, lsl #1 ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %abs = tail call float @llvm.fabs.f32(float %x) @@ -43,9 +42,8 @@ define i32 @replace_isinf_call_f64(double %x) { ; CHECK-LABEL: replace_isinf_call_f64: ; CHECK: // %bb.0: ; CHECK-NEXT: fmov x9, d0 -; CHECK-NEXT: mov x8, #9218868437227405312 // =0x7ff0000000000000 -; CHECK-NEXT: and x9, x9, #0x7fffffffffffffff -; CHECK-NEXT: cmp x9, x8 +; CHECK-NEXT: mov x8, #-9007199254740992 // =0xffe0000000000000 +; CHECK-NEXT: cmp x8, x9, lsl #1 ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %abs = tail call double @llvm.fabs.f64(double %x) diff --git a/llvm/test/CodeGen/AArch64/masked-integer-compare.ll b/llvm/test/CodeGen/AArch64/masked-integer-compare.ll new file mode 100644 index 0000000..363cd10 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/masked-integer-compare.ll @@ -0,0 +1,178 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64-none-linux-gnu < %s -o -| FileCheck %s + +; Test code generation support for SUBS (shifted register) from masked integer +; compare sequences. These sequences appear in isinf tests, for example. 
+ +define i1 @combine_masked_i32(i32 %x) { +; CHECK-LABEL: combine_masked_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #-16777216 // =0xff000000 +; CHECK-NEXT: cmp w8, w0, lsl #1 +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %and = and i32 %x, u0x7fffffff + %sub = sub i32 %and, u0x7f800000 + %cmp = icmp eq i32 %sub, 0 + ret i1 %cmp +} + +define i1 @combine_masked_i64(i64 %x) { +; CHECK-LABEL: combine_masked_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #-9007199254740992 // =0xffe0000000000000 +; CHECK-NEXT: cmp x8, x0, lsl #1 +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %and = and i64 %x, u0x7fffffffffffffff + %sub = sub i64 %and, u0x7ff0000000000000 + %cmp = icmp eq i64 %sub, 0 + ret i1 %cmp +} + +define i1 @combine_masked_ne(i32 %x) { +; CHECK-LABEL: combine_masked_ne: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #-16777216 // =0xff000000 +; CHECK-NEXT: cmp w8, w0, lsl #1 +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret + %and = and i32 %x, u0x7fffffff + %cmp = icmp ne i32 %and, u0x7f800000 + ret i1 %cmp +} + +define i1 @combine_masked_lsl4(i32 %x) { +; CHECK-LABEL: combine_masked_lsl4: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #-134217728 // =0xf8000000 +; CHECK-NEXT: cmp w8, w0, lsl #4 +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %and = and i32 %x, u0x0fffffff + %cmp = icmp eq i32 %and, u0x0f800000 + ret i1 %cmp +} + +define i1 @dont_combine_not_mask(i32 %x) { +; CHECK-LABEL: dont_combine_not_mask: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #2139095040 // =0x7f800000 +; CHECK-NEXT: and w9, w0, #0x7ffffffe +; CHECK-NEXT: cmp w9, w8 +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %and = and i32 %x, u0x7ffffffe + %cmp = icmp eq i32 %and, u0x7f800000 + ret i1 %cmp +} + +define i1 @dont_combine_cmp_not_masked(i32 %x) { +; CHECK-LABEL: dont_combine_cmp_not_masked: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #2139095040 // =0x7f800000 +; CHECK-NEXT: and w9, w0, #0x3fffffff +; CHECK-NEXT: cmp w9, w8 +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %and = and i32 %x, u0x3fffffff + %cmp = icmp eq i32 %and, u0x7f800000 + ret i1 %cmp +} + +define i1 @dont_combine_not_constant_mask(i32 %x, i32 %m) { +; CHECK-LABEL: dont_combine_not_constant_mask: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #2139095040 // =0x7f800000 +; CHECK-NEXT: and w9, w0, w1 +; CHECK-NEXT: cmp w9, w8 +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %and = and i32 %x, %m + %cmp = icmp eq i32 %and, u0x7f800000 + ret i1 %cmp +} + +define i1 @dont_combine_not_constant_cmp(i32 %x, i32 %c) { +; CHECK-LABEL: dont_combine_not_constant_cmp: +; CHECK: // %bb.0: +; CHECK-NEXT: and w8, w0, #0xfffffff +; CHECK-NEXT: cmp w8, w1 +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %and = and i32 %x, u0x0fffffff + %cmp = icmp eq i32 %and, %c + ret i1 %cmp +} + +define i1 @dont_combine_subs_imm(i32 %x) { +; CHECK-LABEL: dont_combine_subs_imm: +; CHECK: // %bb.0: +; CHECK-NEXT: and w8, w0, #0x7fffffff +; CHECK-NEXT: cmp w8, #291 +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %and = and i32 %x, u0x7fffffff + %cmp = icmp eq i32 %and, u0x123 + ret i1 %cmp +} + +define i1 @dont_combine_subs_imm_lsl12(i32 %x) { +; CHECK-LABEL: dont_combine_subs_imm_lsl12: +; CHECK: // %bb.0: +; CHECK-NEXT: and w8, w0, #0x7fffffff +; CHECK-NEXT: cmp w8, #291, lsl #12 // =1191936 +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %and = and i32 %x, u0x7fffffff + %cmp = icmp eq i32 %and, u0x123000 + ret i1 %cmp +} + +define { i1, i1 } @dont_combine_multi_use_cmp(i32 %x) { +; CHECK-LABEL: dont_combine_multi_use_cmp: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #2139095040 
// =0x7f800000 +; CHECK-NEXT: and w9, w0, #0x7fffffff +; CHECK-NEXT: cmp w9, w8 +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: cset w1, lt +; CHECK-NEXT: ret + %and = and i32 %x, u0x7fffffff + %eq = icmp eq i32 %and, u0x7f800000 + %lt = icmp slt i32 %and, u0x7f800000 + %r1 = insertvalue { i1, i1 } poison, i1 %eq, 0 + %r2 = insertvalue { i1, i1 } %r1, i1 %lt, 1 + ret { i1, i1 } %r2 +} + +define { i32, i1 } @dont_combine_multi_use_sub(i32 %x) { +; CHECK-LABEL: dont_combine_multi_use_sub: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #-2139095040 // =0x80800000 +; CHECK-NEXT: and w9, w0, #0x7fffffff +; CHECK-NEXT: adds w0, w9, w8 +; CHECK-NEXT: cset w1, eq +; CHECK-NEXT: ret + %and = and i32 %x, u0x7fffffff + %sub = sub i32 %and, u0x7f800000 + %cmp = icmp eq i32 %sub, 0 + %r1 = insertvalue { i32, i1 } poison, i32 %sub, 0 + %r2 = insertvalue { i32, i1 } %r1, i1 %cmp, 1 + ret { i32, i1 } %r2 +} + +define { i32, i1 } @dont_combine_multi_use_and(i32 %x) { +; CHECK-LABEL: dont_combine_multi_use_and: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #2139095040 // =0x7f800000 +; CHECK-NEXT: and w0, w0, #0x7fffffff +; CHECK-NEXT: cmp w0, w8 +; CHECK-NEXT: cset w1, eq +; CHECK-NEXT: ret + %and = and i32 %x, u0x7fffffff + %cmp = icmp eq i32 %and, u0x7f800000 + %r1 = insertvalue { i32, i1 } poison, i32 %and, 0 + %r2 = insertvalue { i32, i1 } %r1, i1 %cmp, 1 + ret { i32, i1 } %r2 +} diff --git a/llvm/test/CodeGen/AArch64/seh-minimal-prologue-epilogue.ll b/llvm/test/CodeGen/AArch64/seh-minimal-prologue-epilogue.ll new file mode 100644 index 0000000..cc71b8b --- /dev/null +++ b/llvm/test/CodeGen/AArch64/seh-minimal-prologue-epilogue.ll @@ -0,0 +1,53 @@ +; RUN: llc -mtriple=aarch64-windows %s -o - | FileCheck %s + +; This test verifies that functions requiring Windows CFI that have minimal +; or no prologue instructions still emit proper SEH directives, specifically +; ensuring .seh_endprologue is emitted before .seh_startepilogue. 
+; +; This reproduces the issue where Swift async functions with swifttailcc +; calling convention would fail with: +; "error: starting epilogue (.seh_startepilogue) before prologue has ended (.seh_endprologue)" + +; Test 1: Swift-style tail call function with minimal prologue +define swifttailcc void @test_swifttailcc_minimal(ptr %async_ctx, ptr %arg1, ptr %arg2) { +; CHECK-LABEL: test_swifttailcc_minimal: +; CHECK-NOT: .seh_proc test_swifttailcc_minimal +; CHECK-NOT: .seh_endprologue +; CHECK-NOT: .seh_startepilogue +; CHECK-NOT: .seh_endepilogue +; CHECK-NOT: .seh_endproc +entry: + %ptr1 = getelementptr inbounds i8, ptr %async_ctx, i64 16 + %ptr2 = getelementptr inbounds i8, ptr %async_ctx, i64 24 + store ptr %arg1, ptr %ptr1, align 8 + store ptr %arg2, ptr %ptr2, align 8 + musttail call swifttailcc void @external_swift_function(ptr %async_ctx, ptr %arg1) + ret void +} + +; Test 2: Function similar to the original failing case +define linkonce_odr hidden swifttailcc void @test_linkonce_swifttailcc(ptr swiftasync %async_ctx, ptr %arg1, ptr noalias dereferenceable(40) %arg2, ptr %arg3, i64 %value, ptr %arg4, ptr %arg5, ptr %arg6, i1 %flag, ptr %arg7, ptr noalias dereferenceable(40) %arg8) { +; CHECK-LABEL: test_linkonce_swifttailcc: +; CHECK-NEXT: .seh_proc +; CHECK: .seh_endprologue +; CHECK: .seh_startepilogue +; CHECK: .seh_endepilogue +; CHECK: .seh_endproc +entry: + %frame_ptr = getelementptr inbounds nuw i8, ptr %async_ctx, i64 16 + %ctx1 = getelementptr inbounds nuw i8, ptr %async_ctx, i64 400 + %ctx2 = getelementptr inbounds nuw i8, ptr %async_ctx, i64 1168 + %spill1 = getelementptr inbounds nuw i8, ptr %async_ctx, i64 2392 + store ptr %arg8, ptr %spill1, align 8 + %spill2 = getelementptr inbounds nuw i8, ptr %async_ctx, i64 2384 + store ptr %arg7, ptr %spill2, align 8 + %spill3 = getelementptr inbounds nuw i8, ptr %async_ctx, i64 2225 + store i1 %flag, ptr %spill3, align 1 + %spill4 = getelementptr inbounds nuw i8, ptr %async_ctx, i64 2376 + store ptr %arg6, ptr %spill4, align 8 + musttail call swifttailcc void @external_swift_continuation(ptr swiftasync %async_ctx, i64 0, i64 0) + ret void +} + +declare swifttailcc void @external_swift_function(ptr, ptr) +declare swifttailcc void @external_swift_continuation(ptr swiftasync, i64, i64) diff --git a/llvm/test/CodeGen/AArch64/wincfi-seh-only-in-epilogue.ll b/llvm/test/CodeGen/AArch64/wincfi-minimal-seh-prologue.ll index 7daceae..8308108 100644 --- a/llvm/test/CodeGen/AArch64/wincfi-seh-only-in-epilogue.ll +++ b/llvm/test/CodeGen/AArch64/wincfi-minimal-seh-prologue.ll @@ -5,8 +5,9 @@ entry: ret void } -; Check that there is no .seh_endprologue but there is seh_startepilogue/seh_endepilogue. -; CHECK-NOT: .seh_endprologue +; Check that there is a minimal SEH prologue with seh_startepilogue/seh_endepilogue. 
+; CHECK: .seh_proc test +; CHECK: .seh_endprologue ; CHECK: .seh_startepilogue ; CHECK: add sp, sp, #48 ; CHECK: .seh_stackalloc 48 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.ll index feaf7ce..434f763 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -o - %s | FileCheck -check-prefixes=GFX8 %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -o - %s | FileCheck -check-prefixes=GFX12 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -o - %s | FileCheck -check-prefixes=GFX8 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -o - %s | FileCheck -check-prefixes=GFX12 %s ; Natural mapping define amdgpu_ps float @raw_buffer_load_format_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.format.ll index 636ba9b..41d4553 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.format.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.format.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -o - %s | FileCheck %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -o - %s | FileCheck %s ; Natural mapping define amdgpu_ps float @raw_ptr_buffer_load_format_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.load.ll index 4d7d3ec..8ad5f50 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.load.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -stop-after=instruction-select -o - %s | FileCheck %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -stop-after=instruction-select -o - %s | FileCheck %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -stop-after=instruction-select -o - %s | FileCheck %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -stop-after=instruction-select -o - %s | FileCheck %s define amdgpu_ps float @raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.ll index 0ae2833..b7e2074 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX10_GFX11 %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX10_GFX11 %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX12 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX10_GFX11 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX10_GFX11 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX12 %s define amdgpu_ps float @raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; GFX10_GFX11-LABEL: name: raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.tbuffer.load.ll index d644ef9..23858b9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.tbuffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.tbuffer.load.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -stop-after=instruction-select -o - %s | FileCheck --check-prefixes=CHECK %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -mattr=+wavefrontsize64 -stop-after=instruction-select -o - %s | FileCheck --check-prefixes=CHECK %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+wavefrontsize64 -stop-after=instruction-select -o - %s | FileCheck --check-prefixes=CHECK %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -stop-after=instruction-select -o - %s | FileCheck --check-prefixes=CHECK %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -mattr=+wavefrontsize64 -stop-after=instruction-select -o - %s | FileCheck --check-prefixes=CHECK %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+wavefrontsize64 -stop-after=instruction-select -o - %s | FileCheck --check-prefixes=CHECK %s define amdgpu_ps float @struct_tbuffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: struct_tbuffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.ll index 7c811f4..dec015d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.ll +++ 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -stop-after=instruction-select -o - %s | FileCheck --check-prefixes=CHECK %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -mattr=+wavefrontsize64 -stop-after=instruction-select -o - %s | FileCheck --check-prefixes=CHECK %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+wavefrontsize64 -stop-after=instruction-select -o - %s | FileCheck --check-prefixes=CHECK %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=+wavefrontsize64 -stop-after=instruction-select -o - %s | FileCheck --check-prefixes=CHECK-GFX12 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -stop-after=instruction-select -o - %s | FileCheck --check-prefixes=CHECK %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -mattr=+wavefrontsize64 -stop-after=instruction-select -o - %s | FileCheck --check-prefixes=CHECK %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+wavefrontsize64 -stop-after=instruction-select -o - %s | FileCheck --check-prefixes=CHECK %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=+wavefrontsize64 -stop-after=instruction-select -o - %s | FileCheck --check-prefixes=CHECK-GFX12 %s define amdgpu_ps float @struct_tbuffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: struct_tbuffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset diff --git a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll index ed48999..bd28f72 100644 --- a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll @@ -1,734 +1,759 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-SAFE %s -; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn-- -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-NNAN %s +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s -; RUN: llc -mtriple=amdgcn-- -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefix=VI-SAFE %s -; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn-- -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefixes=VI-NNAN %s +; RUN: llc -mtriple=amdgcn-- -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefix=VI %s -; RUN: llc -mtriple=amdgcn-- < %s | FileCheck -enable-var-scope -check-prefix=SI-SAFE %s -; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn-- < %s | FileCheck -enable-var-scope -check-prefixes=SI-NNAN %s +; RUN: llc -mtriple=amdgcn-- < %s | FileCheck -enable-var-scope -check-prefix=SI %s -; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-SAFE-TRUE16 %s -; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-SAFE-FAKE16 %s -; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math 
-mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-NNAN,GFX11-NNAN-TRUE16 %s -; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-NNAN,GFX11-NNAN-FAKE16 %s +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16 %s define half @test_fmax_legacy_ugt_f16(half %a, half %b) #0 { -; GFX9-SAFE-LABEL: test_fmax_legacy_ugt_f16: -; GFX9-SAFE: ; %bb.0: -; GFX9-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v0, v1 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX9-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-NNAN-LABEL: test_fmax_legacy_ugt_f16: -; GFX9-NNAN: ; %bb.0: -; GFX9-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NNAN-NEXT: v_max_f16_e32 v0, v0, v1 -; GFX9-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; VI-SAFE-LABEL: test_fmax_legacy_ugt_f16: -; VI-SAFE: ; %bb.0: -; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v0, v1 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; VI-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; VI-NNAN-LABEL: test_fmax_legacy_ugt_f16: -; VI-NNAN: ; %bb.0: -; VI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NNAN-NEXT: v_max_f16_e32 v0, v0, v1 -; VI-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; SI-SAFE-LABEL: test_fmax_legacy_ugt_f16: -; SI-SAFE: ; %bb.0: -; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SAFE-NEXT: v_max_legacy_f32_e32 v0, v1, v0 -; SI-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; SI-NNAN-LABEL: test_fmax_legacy_ugt_f16: -; SI-NNAN: ; %bb.0: -; SI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NNAN-NEXT: v_max_f32_e32 v0, v0, v1 -; SI-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SAFE-TRUE16-LABEL: test_fmax_legacy_ugt_f16: -; GFX11-SAFE-TRUE16: ; %bb.0: -; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0.l, v1.l -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v0.l, vcc_lo -; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SAFE-FAKE16-LABEL: test_fmax_legacy_ugt_f16: -; GFX11-SAFE-FAKE16: ; %bb.0: -; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v1 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-NNAN-TRUE16-LABEL: test_fmax_legacy_ugt_f16: -; GFX11-NNAN-TRUE16: ; %bb.0: -; GFX11-NNAN-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NNAN-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v1.l -; GFX11-NNAN-TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-NNAN-FAKE16-LABEL: test_fmax_legacy_ugt_f16: -; GFX11-NNAN-FAKE16: ; %bb.0: -; GFX11-NNAN-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NNAN-FAKE16-NEXT: v_max_f16_e32 v0, v0, v1 -; GFX11-NNAN-FAKE16-NEXT: s_setpc_b64 s[30:31] 
+; GFX9-LABEL: test_fmax_legacy_ugt_f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_nle_f16_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: test_fmax_legacy_ugt_f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_nle_f16_e32 vcc, v0, v1 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: test_fmax_legacy_ugt_f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_max_legacy_f32_e32 v0, v1, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: test_fmax_legacy_ugt_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0.l, v1.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v0.l, vcc_lo +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: test_fmax_legacy_ugt_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ugt half %a, %b %val = select i1 %cmp, half %a, half %b ret half %val } +define half @test_fmax_legacy_ugt_f16_fast(half %a, half %b) #0 { +; GFX9-LABEL: test_fmax_legacy_ugt_f16_fast: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: test_fmax_legacy_ugt_f16_fast: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_max_f16_e32 v0, v0, v1 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: test_fmax_legacy_ugt_f16_fast: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_max_f32_e32 v0, v0, v1 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: test_fmax_legacy_ugt_f16_fast: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: test_fmax_legacy_ugt_f16_fast: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp ugt half %a, %b + %val = select nnan nsz i1 %cmp, half %a, half %b + ret half %val +} + define <2 x half> @test_fmax_legacy_ugt_v2f16(<2 x half> %a, <2 x half> %b) #0 { -; GFX9-SAFE-LABEL: test_fmax_legacy_ugt_v2f16: -; GFX9-SAFE: ; %bb.0: -; GFX9-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v3, v2 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v0, v1 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX9-SAFE-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-SAFE-NEXT: v_perm_b32 v0, v2, v0, s4 -; GFX9-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-NNAN-LABEL: test_fmax_legacy_ugt_v2f16: -; GFX9-NNAN: ; 
%bb.0: -; GFX9-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NNAN-NEXT: v_pk_max_f16 v0, v0, v1 -; GFX9-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; VI-SAFE-LABEL: test_fmax_legacy_ugt_v2f16: -; VI-SAFE: ; %bb.0: -; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v3, v2 -; VI-SAFE-NEXT: v_cndmask_b32_sdwa v2, v2, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v0, v1 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; VI-SAFE-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; VI-NNAN-LABEL: test_fmax_legacy_ugt_v2f16: -; VI-NNAN: ; %bb.0: -; VI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NNAN-NEXT: v_max_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NNAN-NEXT: v_max_f16_e32 v0, v0, v1 -; VI-NNAN-NEXT: v_or_b32_e32 v0, v0, v2 -; VI-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; SI-SAFE-LABEL: test_fmax_legacy_ugt_v2f16: -; SI-SAFE: ; %bb.0: -; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-SAFE-NEXT: v_max_legacy_f32_e32 v0, v2, v0 -; SI-SAFE-NEXT: v_max_legacy_f32_e32 v1, v3, v1 -; SI-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; SI-NNAN-LABEL: test_fmax_legacy_ugt_v2f16: -; SI-NNAN: ; %bb.0: -; SI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NNAN-NEXT: v_max_f32_e32 v0, v0, v2 -; SI-NNAN-NEXT: v_max_f32_e32 v1, v1, v3 -; SI-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SAFE-TRUE16-LABEL: test_fmax_legacy_ugt_v2f16: -; GFX11-SAFE-TRUE16: ; %bb.0: -; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0.h, v1.h -; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e64 s0, v0.l, v1.l -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.h, v0.h, vcc_lo -; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v0.l, s0 -; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SAFE-FAKE16-LABEL: test_fmax_legacy_ugt_v2f16: -; GFX11-SAFE-FAKE16: ; %bb.0: -; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v3, v2 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v1 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 -; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 
s[30:31] -; -; GFX11-NNAN-LABEL: test_fmax_legacy_ugt_v2f16: -; GFX11-NNAN: ; %bb.0: -; GFX11-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NNAN-NEXT: v_pk_max_f16 v0, v0, v1 -; GFX11-NNAN-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: test_fmax_legacy_ugt_v2f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX9-NEXT: v_cmp_nle_f16_e32 vcc, v3, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX9-NEXT: v_cmp_nle_f16_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: test_fmax_legacy_ugt_v2f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; VI-NEXT: v_cmp_nle_f16_e32 vcc, v3, v2 +; VI-NEXT: v_cndmask_b32_sdwa v2, v2, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_cmp_nle_f16_e32 vcc, v0, v1 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: test_fmax_legacy_ugt_v2f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_max_legacy_f32_e32 v0, v2, v0 +; SI-NEXT: v_max_legacy_f32_e32 v1, v3, v1 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: test_fmax_legacy_ugt_v2f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0.h, v1.h +; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e64 s0, v0.l, v1.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.h, v0.h, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v0.l, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: test_fmax_legacy_ugt_v2f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v3, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ugt <2 x half> %a, %b %val = select <2 x i1> %cmp, <2 x half> %a, <2 x half> %b ret <2 x half> %val } +define <2 x half> @test_fmax_legacy_ugt_v2f16_fast(<2 x half> %a, <2 x half> %b) #0 { +; GFX9-LABEL: test_fmax_legacy_ugt_v2f16_fast: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_max_f16 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: test_fmax_legacy_ugt_v2f16_fast: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_max_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_e32 v0, v0, v1 +; VI-NEXT: v_or_b32_e32 v0, v0, v2 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: test_fmax_legacy_ugt_v2f16_fast: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_max_f32_e32 v0, v0, v2 +; SI-NEXT: v_max_f32_e32 v1, v1, v3 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_fmax_legacy_ugt_v2f16_fast: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_max_f16 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp ugt <2 x half> %a, %b + %val = select nnan nsz <2 x i1> %cmp, <2 x half> %a, <2 x half> %b + ret <2 x half> %val +} + define <3 x half> @test_fmax_legacy_ugt_v3f16(<3 x half> %a, <3 x half> %b) #0 { -; GFX9-SAFE-LABEL: test_fmax_legacy_ugt_v3f16: -; GFX9-SAFE: ; %bb.0: -; GFX9-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v5, v4 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v1, v3 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v0, v2 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GFX9-SAFE-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-SAFE-NEXT: v_perm_b32 v0, v4, v0, s4 -; GFX9-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-NNAN-LABEL: test_fmax_legacy_ugt_v3f16: -; GFX9-NNAN: ; %bb.0: -; GFX9-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NNAN-NEXT: v_pk_max_f16 v1, v1, v3 -; GFX9-NNAN-NEXT: v_pk_max_f16 v0, v0, v2 -; GFX9-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; VI-SAFE-LABEL: test_fmax_legacy_ugt_v3f16: -; VI-SAFE: ; %bb.0: -; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v5, v4 -; VI-SAFE-NEXT: v_cndmask_b32_sdwa v4, v4, v5, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v1, v3 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v0, v2 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; VI-SAFE-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; VI-NNAN-LABEL: test_fmax_legacy_ugt_v3f16: -; VI-NNAN: ; %bb.0: -; VI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NNAN-NEXT: v_max_f16_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NNAN-NEXT: v_max_f16_e32 v0, v0, v2 -; VI-NNAN-NEXT: v_max_f16_e32 v1, v1, v3 -; VI-NNAN-NEXT: v_or_b32_e32 v0, v0, v4 -; VI-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; SI-SAFE-LABEL: test_fmax_legacy_ugt_v3f16: -; SI-SAFE: ; %bb.0: -; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 -; 
SI-SAFE-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-SAFE-NEXT: v_max_legacy_f32_e32 v0, v3, v0 -; SI-SAFE-NEXT: v_max_legacy_f32_e32 v1, v4, v1 -; SI-SAFE-NEXT: v_max_legacy_f32_e32 v2, v5, v2 -; SI-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; SI-NNAN-LABEL: test_fmax_legacy_ugt_v3f16: -; SI-NNAN: ; %bb.0: -; SI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NNAN-NEXT: v_max_f32_e32 v0, v0, v3 -; SI-NNAN-NEXT: v_max_f32_e32 v1, v1, v4 -; SI-NNAN-NEXT: v_max_f32_e32 v2, v2, v5 -; SI-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SAFE-TRUE16-LABEL: test_fmax_legacy_ugt_v3f16: -; GFX11-SAFE-TRUE16: ; %bb.0: -; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0.h, v2.h -; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e64 s0, v0.l, v2.l -; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e64 s1, v1.l, v3.l -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.h, v0.h, vcc_lo -; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, s0 -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.l, v1.l, s1 -; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SAFE-FAKE16-LABEL: test_fmax_legacy_ugt_v3f16: -; GFX11-SAFE-FAKE16: ; %bb.0: -; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v2 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v5, v4 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v1, v3 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 -; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-NNAN-LABEL: test_fmax_legacy_ugt_v3f16: -; GFX11-NNAN: ; %bb.0: -; GFX11-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NNAN-NEXT: v_pk_max_f16 v0, v0, v2 -; GFX11-NNAN-NEXT: v_pk_max_f16 v1, v1, v3 -; GFX11-NNAN-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: test_fmax_legacy_ugt_v3f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX9-NEXT: v_cmp_nle_f16_e32 vcc, v5, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; GFX9-NEXT: v_cmp_nle_f16_e32 vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX9-NEXT: v_cmp_nle_f16_e32 vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v4, v0, s4 +; GFX9-NEXT: 
s_setpc_b64 s[30:31] +; +; VI-LABEL: test_fmax_legacy_ugt_v3f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; VI-NEXT: v_cmp_nle_f16_e32 vcc, v5, v4 +; VI-NEXT: v_cndmask_b32_sdwa v4, v4, v5, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_cmp_nle_f16_e32 vcc, v1, v3 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; VI-NEXT: v_cmp_nle_f16_e32 vcc, v0, v2 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; VI-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: test_fmax_legacy_ugt_v3f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_max_legacy_f32_e32 v0, v3, v0 +; SI-NEXT: v_max_legacy_f32_e32 v1, v4, v1 +; SI-NEXT: v_max_legacy_f32_e32 v2, v5, v2 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: test_fmax_legacy_ugt_v3f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0.h, v2.h +; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e64 s0, v0.l, v2.l +; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e64 s1, v1.l, v3.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.h, v0.h, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.l, v1.l, s1 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: test_fmax_legacy_ugt_v3f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX11-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v5, v4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v1, v3 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ugt <3 x half> %a, %b %val = select <3 x i1> %cmp, <3 x half> %a, <3 x half> %b ret <3 x half> %val } +define <3 x half> @test_fmax_legacy_ugt_v3f16_fast(<3 x half> %a, <3 x half> %b) #0 { +; GFX9-LABEL: test_fmax_legacy_ugt_v3f16_fast: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX9-NEXT: v_pk_max_f16 v1, v1, v3 +; GFX9-NEXT: v_pk_max_f16 v0, v0, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: test_fmax_legacy_ugt_v3f16_fast: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_max_f16_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: 
v_max_f16_e32 v0, v0, v2 +; VI-NEXT: v_max_f16_e32 v1, v1, v3 +; VI-NEXT: v_or_b32_e32 v0, v0, v4 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: test_fmax_legacy_ugt_v3f16_fast: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_max_f32_e32 v0, v0, v3 +; SI-NEXT: v_max_f32_e32 v1, v1, v4 +; SI-NEXT: v_max_f32_e32 v2, v2, v5 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_fmax_legacy_ugt_v3f16_fast: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX11-NEXT: v_pk_max_f16 v0, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_pk_max_f16 v1, v1, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp ugt <3 x half> %a, %b + %val = select nnan nsz <3 x i1> %cmp, <3 x half> %a, <3 x half> %b + ret <3 x half> %val +} + define <4 x half> @test_fmax_legacy_ugt_v4f16(<4 x half> %a, <4 x half> %b) #0 { -; GFX9-SAFE-LABEL: test_fmax_legacy_ugt_v4f16: -; GFX9-SAFE: ; %bb.0: -; GFX9-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v7, v6 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc -; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v5, v4 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v1, v3 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v0, v2 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GFX9-SAFE-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-SAFE-NEXT: v_perm_b32 v0, v4, v0, s4 -; GFX9-SAFE-NEXT: v_perm_b32 v1, v6, v1, s4 -; GFX9-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-NNAN-LABEL: test_fmax_legacy_ugt_v4f16: -; GFX9-NNAN: ; %bb.0: -; GFX9-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NNAN-NEXT: v_pk_max_f16 v0, v0, v2 -; GFX9-NNAN-NEXT: v_pk_max_f16 v1, v1, v3 -; GFX9-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; VI-SAFE-LABEL: test_fmax_legacy_ugt_v4f16: -; VI-SAFE: ; %bb.0: -; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v7, v6 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc -; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v5, v4 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v1, v3 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v0, v2 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; VI-SAFE-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; VI-SAFE-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-SAFE-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; VI-SAFE-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; VI-NNAN-LABEL: test_fmax_legacy_ugt_v4f16: -; VI-NNAN: ; %bb.0: -; VI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NNAN-NEXT: v_max_f16_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NNAN-NEXT: v_max_f16_sdwa v5, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NNAN-NEXT: v_max_f16_e32 v1, v1, v3 -; VI-NNAN-NEXT: v_max_f16_e32 v0, v0, v2 -; VI-NNAN-NEXT: v_or_b32_e32 v0, v0, v5 -; VI-NNAN-NEXT: v_or_b32_e32 v1, v1, v4 -; VI-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; SI-SAFE-LABEL: test_fmax_legacy_ugt_v4f16: -; SI-SAFE: ; %bb.0: -; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-SAFE-NEXT: v_max_legacy_f32_e32 v0, v4, v0 -; SI-SAFE-NEXT: v_max_legacy_f32_e32 v1, v5, v1 -; SI-SAFE-NEXT: v_max_legacy_f32_e32 v2, v6, v2 -; SI-SAFE-NEXT: v_max_legacy_f32_e32 v3, v7, v3 -; SI-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; SI-NNAN-LABEL: test_fmax_legacy_ugt_v4f16: -; SI-NNAN: ; %bb.0: -; SI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NNAN-NEXT: v_max_f32_e32 v0, v0, v4 -; SI-NNAN-NEXT: v_max_f32_e32 v1, v1, v5 -; SI-NNAN-NEXT: v_max_f32_e32 v2, v2, v6 -; SI-NNAN-NEXT: v_max_f32_e32 v3, v3, v7 -; SI-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SAFE-TRUE16-LABEL: test_fmax_legacy_ugt_v4f16: -; GFX11-SAFE-TRUE16: ; %bb.0: -; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v1.h, v3.h -; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e64 s0, v0.h, v2.h -; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e64 s1, v0.l, v2.l -; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e64 s2, v1.l, v3.l -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v1.h, v3.h, v1.h, vcc_lo -; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.h, v0.h, s0 -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, s1 -; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.l, v1.l, s2 -; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; 
GFX11-SAFE-FAKE16-LABEL: test_fmax_legacy_ugt_v4f16: -; GFX11-SAFE-FAKE16: ; %bb.0: -; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v5, v4 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v7, v6 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v2 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v1, v3 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 -; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v1, v4, v1, 0x5040100 -; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-NNAN-LABEL: test_fmax_legacy_ugt_v4f16: -; GFX11-NNAN: ; %bb.0: -; GFX11-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NNAN-NEXT: v_pk_max_f16 v0, v0, v2 -; GFX11-NNAN-NEXT: v_pk_max_f16 v1, v1, v3 -; GFX11-NNAN-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: test_fmax_legacy_ugt_v4f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX9-NEXT: v_cmp_nle_f16_e32 vcc, v7, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc +; GFX9-NEXT: v_cmp_nle_f16_e32 vcc, v5, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; GFX9-NEXT: v_cmp_nle_f16_e32 vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX9-NEXT: v_cmp_nle_f16_e32 vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v4, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v6, v1, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: test_fmax_legacy_ugt_v4f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; VI-NEXT: v_cmp_nle_f16_e32 vcc, v7, v6 +; VI-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc +; VI-NEXT: v_cmp_nle_f16_e32 vcc, v5, v4 +; VI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; VI-NEXT: v_cmp_nle_f16_e32 vcc, v1, v3 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; VI-NEXT: v_cmp_nle_f16_e32 vcc, v0, v2 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: test_fmax_legacy_ugt_v4f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v1, 
v1 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_max_legacy_f32_e32 v0, v4, v0 +; SI-NEXT: v_max_legacy_f32_e32 v1, v5, v1 +; SI-NEXT: v_max_legacy_f32_e32 v2, v6, v2 +; SI-NEXT: v_max_legacy_f32_e32 v3, v7, v3 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: test_fmax_legacy_ugt_v4f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v1.h, v3.h +; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e64 s0, v0.h, v2.h +; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e64 s1, v0.l, v2.l +; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e64 s2, v1.l, v3.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v3.h, v1.h, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.h, v0.h, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, s1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.l, v1.l, s2 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: test_fmax_legacy_ugt_v4f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v5, v4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v7, v6 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v1, v3 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v4, v1, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ugt <4 x half> %a, %b %val = select <4 x i1> %cmp, <4 x half> %a, <4 x half> %b ret <4 x half> %val } +define <4 x half> @test_fmax_legacy_ugt_v4f16_fast(<4 x half> %a, <4 x half> %b) #0 { +; GFX9-LABEL: test_fmax_legacy_ugt_v4f16_fast: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_max_f16 v0, v0, v2 +; GFX9-NEXT: v_pk_max_f16 v1, v1, v3 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: test_fmax_legacy_ugt_v4f16_fast: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_max_f16_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_sdwa v5, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_e32 v1, v1, v3 +; VI-NEXT: v_max_f16_e32 v0, v0, v2 +; VI-NEXT: v_or_b32_e32 v0, v0, v5 +; VI-NEXT: v_or_b32_e32 v1, v1, v4 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: test_fmax_legacy_ugt_v4f16_fast: +; SI: ; 
%bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_max_f32_e32 v0, v0, v4 +; SI-NEXT: v_max_f32_e32 v1, v1, v5 +; SI-NEXT: v_max_f32_e32 v2, v2, v6 +; SI-NEXT: v_max_f32_e32 v3, v3, v7 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_fmax_legacy_ugt_v4f16_fast: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_max_f16 v0, v0, v2 +; GFX11-NEXT: v_pk_max_f16 v1, v1, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp ugt <4 x half> %a, %b + %val = select nnan nsz <4 x i1> %cmp, <4 x half> %a, <4 x half> %b + ret <4 x half> %val +} + define <8 x half> @test_fmax_legacy_ugt_v8f16(<8 x half> %a, <8 x half> %b) #0 { -; GFX9-SAFE-LABEL: test_fmax_legacy_ugt_v8f16: -; GFX9-SAFE: ; %bb.0: -; GFX9-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v14, 16, v7 -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v12, 16, v6 -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v13, 16, v2 -; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v15, v14 -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v10, 16, v5 -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v14, v14, v15, vcc -; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v13, v12 -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v9, 16, v0 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v12, v12, v13, vcc -; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v11, v10 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc -; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v9, v8 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc -; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v3, v7 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v2, v6 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v1, v5 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v0, v4 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX9-SAFE-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-SAFE-NEXT: v_perm_b32 v0, v8, v0, s4 -; GFX9-SAFE-NEXT: v_perm_b32 v1, v10, v1, s4 -; GFX9-SAFE-NEXT: v_perm_b32 v2, v12, v2, s4 -; GFX9-SAFE-NEXT: v_perm_b32 v3, v14, v3, s4 -; GFX9-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-NNAN-LABEL: test_fmax_legacy_ugt_v8f16: -; GFX9-NNAN: ; %bb.0: -; GFX9-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NNAN-NEXT: v_pk_max_f16 v0, v0, v4 -; GFX9-NNAN-NEXT: v_pk_max_f16 v1, v1, v5 -; GFX9-NNAN-NEXT: v_pk_max_f16 v2, v2, v6 -; GFX9-NNAN-NEXT: v_pk_max_f16 v3, v3, v7 -; GFX9-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; VI-SAFE-LABEL: test_fmax_legacy_ugt_v8f16: -; VI-SAFE: ; %bb.0: -; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v14, 16, v7 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v12, 16, v6 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v13, 
16, v2 -; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v15, v14 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v10, 16, v5 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v14, v14, v15, vcc -; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v13, v12 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v9, 16, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v12, v12, v13, vcc -; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v11, v10 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc -; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v9, v8 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc -; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v3, v7 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v2, v6 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v1, v5 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v0, v4 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; VI-SAFE-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; VI-SAFE-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-SAFE-NEXT: v_lshlrev_b32_e32 v4, 16, v10 -; VI-SAFE-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-SAFE-NEXT: v_lshlrev_b32_e32 v4, 16, v12 -; VI-SAFE-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-SAFE-NEXT: v_lshlrev_b32_e32 v4, 16, v14 -; VI-SAFE-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; VI-NNAN-LABEL: test_fmax_legacy_ugt_v8f16: -; VI-NNAN: ; %bb.0: -; VI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NNAN-NEXT: v_max_f16_sdwa v8, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NNAN-NEXT: v_max_f16_sdwa v9, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NNAN-NEXT: v_max_f16_sdwa v10, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NNAN-NEXT: v_max_f16_sdwa v11, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NNAN-NEXT: v_max_f16_e32 v3, v3, v7 -; VI-NNAN-NEXT: v_max_f16_e32 v2, v2, v6 -; VI-NNAN-NEXT: v_max_f16_e32 v1, v1, v5 -; VI-NNAN-NEXT: v_max_f16_e32 v0, v0, v4 -; VI-NNAN-NEXT: v_or_b32_e32 v0, v0, v11 -; VI-NNAN-NEXT: v_or_b32_e32 v1, v1, v10 -; VI-NNAN-NEXT: v_or_b32_e32 v2, v2, v9 -; VI-NNAN-NEXT: v_or_b32_e32 v3, v3, v8 -; VI-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; SI-SAFE-LABEL: test_fmax_legacy_ugt_v8f16: -; SI-SAFE: ; %bb.0: -; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v6, v6 -; 
SI-SAFE-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-SAFE-NEXT: v_max_legacy_f32_e32 v0, v8, v0 -; SI-SAFE-NEXT: v_max_legacy_f32_e32 v1, v9, v1 -; SI-SAFE-NEXT: v_max_legacy_f32_e32 v2, v10, v2 -; SI-SAFE-NEXT: v_max_legacy_f32_e32 v3, v11, v3 -; SI-SAFE-NEXT: v_max_legacy_f32_e32 v4, v12, v4 -; SI-SAFE-NEXT: v_max_legacy_f32_e32 v5, v13, v5 -; SI-SAFE-NEXT: v_max_legacy_f32_e32 v6, v14, v6 -; SI-SAFE-NEXT: v_max_legacy_f32_e32 v7, v15, v7 -; SI-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; SI-NNAN-LABEL: test_fmax_legacy_ugt_v8f16: -; SI-NNAN: ; %bb.0: -; SI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NNAN-NEXT: v_max_f32_e32 v0, v0, v8 -; SI-NNAN-NEXT: v_max_f32_e32 v1, v1, v9 -; SI-NNAN-NEXT: v_max_f32_e32 v2, v2, v10 -; SI-NNAN-NEXT: v_max_f32_e32 v3, v3, v11 -; SI-NNAN-NEXT: v_max_f32_e32 v4, v4, v12 -; SI-NNAN-NEXT: v_max_f32_e32 v5, v5, v13 -; SI-NNAN-NEXT: v_max_f32_e32 v6, v6, v14 -; SI-NNAN-NEXT: v_max_f32_e32 v7, v7, v15 -; SI-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SAFE-TRUE16-LABEL: test_fmax_legacy_ugt_v8f16: -; GFX11-SAFE-TRUE16: ; %bb.0: -; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0.h, v4.h -; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e64 s0, v1.h, v5.h -; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e64 s1, v2.h, v6.h -; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e64 s2, v3.h, v7.h -; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e64 s3, v0.l, v4.l -; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e64 s4, v1.l, v5.l -; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e64 s5, v2.l, v6.l -; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e64 s6, v3.l, v7.l -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v3.h, 
v7.h, v3.h, s2 -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v2.h, v6.h, v2.h, s1 -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v1.h, v5.h, v1.h, s0 -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.h, v0.h, vcc_lo -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v4.l, v0.l, s3 -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v1.l, s4 -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v2.l, v6.l, v2.l, s5 -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v3.l, v7.l, v3.l, s6 -; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SAFE-FAKE16-LABEL: test_fmax_legacy_ugt_v8f16: -; GFX11-SAFE-FAKE16: ; %bb.0: -; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v7 -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v6 -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v2 -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v5 -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v1 -; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v11, v10 -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v0 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v13, v12 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v15, v14 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v12, v14, v15, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v9, v8 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v2, v6 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v4 -; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v2, v11, v2, 0x5040100 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v1, v5 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v3, v7 -; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v1, v12, v1, 0x5040100 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v8, v0, 0x5040100 -; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v3, v10, v3, 0x5040100 -; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-NNAN-LABEL: test_fmax_legacy_ugt_v8f16: -; GFX11-NNAN: ; %bb.0: -; GFX11-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NNAN-NEXT: v_pk_max_f16 v0, v0, v4 -; GFX11-NNAN-NEXT: v_pk_max_f16 v1, v1, v5 -; GFX11-NNAN-NEXT: v_pk_max_f16 v2, v2, v6 -; GFX11-NNAN-NEXT: v_pk_max_f16 v3, v3, v7 -; GFX11-NNAN-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: test_fmax_legacy_ugt_v8f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v2 +; GFX9-NEXT: v_cmp_nle_f16_e32 vcc, v15, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v14, v15, vcc +; GFX9-NEXT: v_cmp_nle_f16_e32 vcc, v13, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; GFX9-NEXT: 
v_cndmask_b32_e32 v12, v12, v13, vcc +; GFX9-NEXT: v_cmp_nle_f16_e32 vcc, v11, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc +; GFX9-NEXT: v_cmp_nle_f16_e32 vcc, v9, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc +; GFX9-NEXT: v_cmp_nle_f16_e32 vcc, v3, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX9-NEXT: v_cmp_nle_f16_e32 vcc, v2, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX9-NEXT: v_cmp_nle_f16_e32 vcc, v1, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9-NEXT: v_cmp_nle_f16_e32 vcc, v0, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v8, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v10, v1, s4 +; GFX9-NEXT: v_perm_b32 v2, v12, v2, s4 +; GFX9-NEXT: v_perm_b32 v3, v14, v3, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: test_fmax_legacy_ugt_v8f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v2 +; VI-NEXT: v_cmp_nle_f16_e32 vcc, v15, v14 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; VI-NEXT: v_cndmask_b32_e32 v14, v14, v15, vcc +; VI-NEXT: v_cmp_nle_f16_e32 vcc, v13, v12 +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; VI-NEXT: v_cndmask_b32_e32 v12, v12, v13, vcc +; VI-NEXT: v_cmp_nle_f16_e32 vcc, v11, v10 +; VI-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc +; VI-NEXT: v_cmp_nle_f16_e32 vcc, v9, v8 +; VI-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc +; VI-NEXT: v_cmp_nle_f16_e32 vcc, v3, v7 +; VI-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; VI-NEXT: v_cmp_nle_f16_e32 vcc, v2, v6 +; VI-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; VI-NEXT: v_cmp_nle_f16_e32 vcc, v1, v5 +; VI-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; VI-NEXT: v_cmp_nle_f16_e32 vcc, v0, v4 +; VI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; VI-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v10 +; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v12 +; VI-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v14 +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: test_fmax_legacy_ugt_v8f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 
+; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_max_legacy_f32_e32 v0, v8, v0 +; SI-NEXT: v_max_legacy_f32_e32 v1, v9, v1 +; SI-NEXT: v_max_legacy_f32_e32 v2, v10, v2 +; SI-NEXT: v_max_legacy_f32_e32 v3, v11, v3 +; SI-NEXT: v_max_legacy_f32_e32 v4, v12, v4 +; SI-NEXT: v_max_legacy_f32_e32 v5, v13, v5 +; SI-NEXT: v_max_legacy_f32_e32 v6, v14, v6 +; SI-NEXT: v_max_legacy_f32_e32 v7, v15, v7 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: test_fmax_legacy_ugt_v8f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0.h, v4.h +; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e64 s0, v1.h, v5.h +; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e64 s1, v2.h, v6.h +; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e64 s2, v3.h, v7.h +; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e64 s3, v0.l, v4.l +; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e64 s4, v1.l, v5.l +; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e64 s5, v2.l, v6.l +; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e64 s6, v3.l, v7.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v7.h, v3.h, s2 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v6.h, v2.h, s1 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v5.h, v1.h, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.h, v0.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v4.l, v0.l, s3 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v1.l, s4 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v6.l, v2.l, s5 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v7.l, v3.l, s6 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: test_fmax_legacy_ugt_v8f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; GFX11-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v11, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v13, v12 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v15, v14 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v14, v15, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v9, v8 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v2, v6 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v11, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v1, v5 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v3, v7 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v12, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo +; 
GFX11-FAKE16-NEXT: v_perm_b32 v0, v8, v0, 0x5040100 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v10, v3, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ugt <8 x half> %a, %b %val = select <8 x i1> %cmp, <8 x half> %a, <8 x half> %b ret <8 x half> %val } +define <8 x half> @test_fmax_legacy_ugt_v8f16_fast(<8 x half> %a, <8 x half> %b) #0 { +; GFX9-LABEL: test_fmax_legacy_ugt_v8f16_fast: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_max_f16 v0, v0, v4 +; GFX9-NEXT: v_pk_max_f16 v1, v1, v5 +; GFX9-NEXT: v_pk_max_f16 v2, v2, v6 +; GFX9-NEXT: v_pk_max_f16 v3, v3, v7 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: test_fmax_legacy_ugt_v8f16_fast: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_max_f16_sdwa v8, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_sdwa v9, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_sdwa v10, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_sdwa v11, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_e32 v3, v3, v7 +; VI-NEXT: v_max_f16_e32 v2, v2, v6 +; VI-NEXT: v_max_f16_e32 v1, v1, v5 +; VI-NEXT: v_max_f16_e32 v0, v0, v4 +; VI-NEXT: v_or_b32_e32 v0, v0, v11 +; VI-NEXT: v_or_b32_e32 v1, v1, v10 +; VI-NEXT: v_or_b32_e32 v2, v2, v9 +; VI-NEXT: v_or_b32_e32 v3, v3, v8 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: test_fmax_legacy_ugt_v8f16_fast: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_max_f32_e32 v0, v0, v8 +; SI-NEXT: v_max_f32_e32 v1, v1, v9 +; SI-NEXT: v_max_f32_e32 v2, v2, v10 +; SI-NEXT: v_max_f32_e32 v3, v3, v11 +; SI-NEXT: v_max_f32_e32 v4, v4, v12 +; SI-NEXT: v_max_f32_e32 v5, v5, v13 +; SI-NEXT: v_max_f32_e32 v6, v6, v14 +; SI-NEXT: v_max_f32_e32 v7, v7, v15 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_fmax_legacy_ugt_v8f16_fast: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_max_f16 v0, v0, v4 +; GFX11-NEXT: v_pk_max_f16 v1, v1, v5 +; GFX11-NEXT: v_pk_max_f16 v2, v2, v6 +; GFX11-NEXT: v_pk_max_f16 v3, v3, v7 +; GFX11-NEXT: 
s_setpc_b64 s[30:31] + %cmp = fcmp ugt <8 x half> %a, %b + %val = select nnan nsz <8 x i1> %cmp, <8 x half> %a, <8 x half> %b + ret <8 x half> %val +} + attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/fmax_legacy.ll b/llvm/test/CodeGen/AMDGPU/fmax_legacy.ll index eee2bd1..f3a84e6 100644 --- a/llvm/test/CodeGen/AMDGPU/fmax_legacy.ll +++ b/llvm/test/CodeGen/AMDGPU/fmax_legacy.ll @@ -1,8 +1,6 @@ -; RUN: llc -mtriple=amdgcn < %s | FileCheck -enable-var-scope -check-prefixes=SI-SAFE,GCN,FUNC %s -; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn < %s | FileCheck -enable-var-scope -check-prefixes=GCN-NONAN,GCN,FUNC %s +; RUN: llc -mtriple=amdgcn < %s | FileCheck -enable-var-scope -check-prefixes=SI,GCN,FUNC %s -; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefixes=VI-SAFE,GCN,FUNC %s -; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefixes=GCN-NONAN,GCN,FUNC %s +; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefixes=VI,GCN,FUNC %s ; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck -enable-var-scope --check-prefixes=EG,FUNC %s @@ -12,12 +10,10 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] -; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] +; SI: v_max_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] -; VI-SAFE: v_cmp_nlt_f32_e32 vcc, [[A]], [[B]] -; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] - -; GCN-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]] +; VI: v_cmp_nlt_f32_e32 vcc, [[A]], [[B]] +; VI: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] ; EG: MAX define amdgpu_kernel void @test_fmax_legacy_uge_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { @@ -34,18 +30,38 @@ define amdgpu_kernel void @test_fmax_legacy_uge_f32(ptr addrspace(1) %out, ptr a ret void } +; FUNC-LABEL: {{^}}test_fmax_legacy_uge_f32_fast: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] + +; GCN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]] + +; EG: MAX +define amdgpu_kernel void @test_fmax_legacy_uge_f32_fast(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid + %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1 + + %a = load volatile float, ptr addrspace(1) %gep.0, align 4 + %b = load volatile float, ptr addrspace(1) %gep.1, align 4 + + %cmp = fcmp uge float %a, %b + %val = select nnan nsz i1 %cmp, float %a, float %b + store float %val, ptr addrspace(1) %out, align 4 + ret void +} + ; FUNC-LABEL: {{^}}test_fmax_legacy_uge_f32_nnan_src: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] ; GCN-DAG: v_add_f32_e32 [[ADD_A:v[0-9]+]], 1.0, [[A]] ; GCN-DAG: v_add_f32_e32 [[ADD_B:v[0-9]+]], 2.0, [[B]] -; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[ADD_B]], [[ADD_A]] +; SI: v_max_legacy_f32_e32 {{v[0-9]+}}, [[ADD_B]], [[ADD_A]] -; VI-SAFE: v_cmp_nlt_f32_e32 vcc, [[ADD_A]], [[ADD_B]] -; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[ADD_B]], [[ADD_A]] +; VI: v_cmp_nlt_f32_e32 vcc, [[ADD_A]], [[ADD_B]] +; VI: v_cndmask_b32_e32 v{{[0-9]+}}, [[ADD_B]], [[ADD_A]] -; GCN-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[ADD_A]], [[ADD_B]] ; EG: MAX define amdgpu_kernel void @test_fmax_legacy_uge_f32_nnan_src(ptr addrspace(1) %out, ptr 
addrspace(1) %in) #0 { @@ -64,16 +80,40 @@ define amdgpu_kernel void @test_fmax_legacy_uge_f32_nnan_src(ptr addrspace(1) %o ret void } +; FUNC-LABEL: {{^}}test_fmax_legacy_uge_f32_nnan_src_fast: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN-DAG: v_add_f32_e32 [[ADD_A:v[0-9]+]], 1.0, [[A]] +; GCN-DAG: v_add_f32_e32 [[ADD_B:v[0-9]+]], 2.0, [[B]] + +; GCN: v_max_f32_e32 {{v[0-9]+}}, [[ADD_A]], [[ADD_B]] + +; EG: MAX +define amdgpu_kernel void @test_fmax_legacy_uge_f32_nnan_src_fast(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid + %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1 + + %a = load volatile float, ptr addrspace(1) %gep.0, align 4 + %b = load volatile float, ptr addrspace(1) %gep.1, align 4 + %a.nnan = fadd nnan float %a, 1.0 + %b.nnan = fadd nnan float %b, 2.0 + + %cmp = fcmp uge float %a.nnan, %b.nnan + %val = select nnan nsz i1 %cmp, float %a.nnan, float %b.nnan + store float %val, ptr addrspace(1) %out, align 4 + ret void +} + ; FUNC-LABEL: {{^}}test_fmax_legacy_oge_f32: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] -; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] +; SI: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] -; VI-SAFE: v_cmp_ge_f32_e32 vcc, [[A]], [[B]] -; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] +; VI: v_cmp_ge_f32_e32 vcc, [[A]], [[B]] +; VI: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] -; GCN-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]] ; EG: MAX define amdgpu_kernel void @test_fmax_legacy_oge_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -89,17 +129,35 @@ define amdgpu_kernel void @test_fmax_legacy_oge_f32(ptr addrspace(1) %out, ptr a ret void } -; FUNC-LABEL: {{^}}test_fmax_legacy_ugt_f32: +; FUNC-LABEL: {{^}}test_fmax_legacy_oge_f32_fast: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] -; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] +; GCN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]] +; EG: MAX +define amdgpu_kernel void @test_fmax_legacy_oge_f32_fast(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid + %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1 + + %a = load volatile float, ptr addrspace(1) %gep.0, align 4 + %b = load volatile float, ptr addrspace(1) %gep.1, align 4 -; VI-SAFE: v_cmp_nle_f32_e32 vcc, [[A]], [[B]] -; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] + %cmp = fcmp oge float %a, %b + %val = select nnan nsz i1 %cmp, float %a, float %b + store float %val, ptr addrspace(1) %out, align 4 + ret void +} +; FUNC-LABEL: {{^}}test_fmax_legacy_ugt_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] + +; SI: v_max_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] + +; VI: v_cmp_nle_f32_e32 vcc, [[A]], [[B]] +; VI: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] -; GCN-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]] ; EG: MAX define amdgpu_kernel void @test_fmax_legacy_ugt_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -115,16 +173,35 @@ define amdgpu_kernel void @test_fmax_legacy_ugt_f32(ptr addrspace(1) %out, ptr a ret void } +; FUNC-LABEL: 
{{^}}test_fmax_legacy_ugt_f32_fast: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] + +; GCN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]] +; EG: MAX +define amdgpu_kernel void @test_fmax_legacy_ugt_f32_fast(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid + %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1 + + %a = load volatile float, ptr addrspace(1) %gep.0, align 4 + %b = load volatile float, ptr addrspace(1) %gep.1, align 4 + + %cmp = fcmp ugt float %a, %b + %val = select nnan nsz i1 %cmp, float %a, float %b + store float %val, ptr addrspace(1) %out, align 4 + ret void +} + ; FUNC-LABEL: {{^}}test_fmax_legacy_ogt_f32: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] -; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] +; SI: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] -; VI-SAFE: v_cmp_gt_f32_e32 vcc, [[A]], [[B]] -; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] +; VI: v_cmp_gt_f32_e32 vcc, [[A]], [[B]] +; VI: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] -; GCN-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]] ; EG: MAX define amdgpu_kernel void @test_fmax_legacy_ogt_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -140,17 +217,35 @@ define amdgpu_kernel void @test_fmax_legacy_ogt_f32(ptr addrspace(1) %out, ptr a ret void } -; FUNC-LABEL: {{^}}test_fmax_legacy_ogt_v1f32: +; FUNC-LABEL: {{^}}test_fmax_legacy_ogt_f32_fast: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] -; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] +; GCN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]] +; EG: MAX +define amdgpu_kernel void @test_fmax_legacy_ogt_f32_fast(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid + %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1 + + %a = load volatile float, ptr addrspace(1) %gep.0, align 4 + %b = load volatile float, ptr addrspace(1) %gep.1, align 4 -; VI-SAFE: v_cmp_gt_f32_e32 vcc, [[A]], [[B]] -; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] + %cmp = fcmp ogt float %a, %b + %val = select nnan nsz i1 %cmp, float %a, float %b + store float %val, ptr addrspace(1) %out, align 4 + ret void +} +; FUNC-LABEL: {{^}}test_fmax_legacy_ogt_v1f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] + +; SI: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] + +; VI: v_cmp_gt_f32_e32 vcc, [[A]], [[B]] +; VI: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] -; GCN-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]] ; EG: MAX define amdgpu_kernel void @test_fmax_legacy_ogt_v1f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -166,23 +261,39 @@ define amdgpu_kernel void @test_fmax_legacy_ogt_v1f32(ptr addrspace(1) %out, ptr ret void } +; FUNC-LABEL: {{^}}test_fmax_legacy_ogt_v1f32_fast: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] + +; GCN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]] +; EG: MAX +define amdgpu_kernel void @test_fmax_legacy_ogt_v1f32_fast(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %gep.0 = getelementptr <1 x float>, ptr 
addrspace(1) %in, i32 %tid + %gep.1 = getelementptr <1 x float>, ptr addrspace(1) %gep.0, i32 1 + + %a = load volatile <1 x float>, ptr addrspace(1) %gep.0 + %b = load volatile <1 x float>, ptr addrspace(1) %gep.1 + + %cmp = fcmp ogt <1 x float> %a, %b + %val = select nnan nsz <1 x i1> %cmp, <1 x float> %a, <1 x float> %b + store <1 x float> %val, ptr addrspace(1) %out + ret void +} + ; FUNC-LABEL: {{^}}test_fmax_legacy_ogt_v3f32: -; SI-SAFE: v_max_legacy_f32_e32 -; SI-SAFE: v_max_legacy_f32_e32 -; SI-SAFE: v_max_legacy_f32_e32 - -; VI-SAFE: v_cmp_gt_f32_e32 -; VI-SAFE: v_cndmask_b32_e32 -; VI-SAFE: v_cmp_gt_f32_e32 -; VI-SAFE: v_cndmask_b32_e32 -; VI-SAFE: v_cmp_gt_f32_e32 -; VI-SAFE: v_cndmask_b32_e32 -; VI-SAFE-NOT: v_cmp -; VI-SAFE-NOT: v_cndmask - -; GCN-NONAN: v_max_f32_e32 -; GCN-NONAN: v_max_f32_e32 -; GCN-NONAN: v_max_f32_e32 +; SI: v_max_legacy_f32_e32 +; SI: v_max_legacy_f32_e32 +; SI: v_max_legacy_f32_e32 + +; VI: v_cmp_gt_f32_e32 +; VI: v_cndmask_b32_e32 +; VI: v_cmp_gt_f32_e32 +; VI: v_cndmask_b32_e32 +; VI: v_cmp_gt_f32_e32 +; VI: v_cndmask_b32_e32 +; VI-NOT: v_cmp +; VI-NOT: v_cndmask ; GCN-NOT: v_max define amdgpu_kernel void @test_fmax_legacy_ogt_v3f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { @@ -199,6 +310,27 @@ define amdgpu_kernel void @test_fmax_legacy_ogt_v3f32(ptr addrspace(1) %out, ptr ret void } +; FUNC-LABEL: {{^}}test_fmax_legacy_ogt_v3f32_fast: + +; GCN: v_max_f32_e32 +; GCN: v_max_f32_e32 +; GCN: v_max_f32_e32 + +; GCN-NOT: v_max +define amdgpu_kernel void @test_fmax_legacy_ogt_v3f32_fast(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %gep.0 = getelementptr <3 x float>, ptr addrspace(1) %in, i32 %tid + %gep.1 = getelementptr <3 x float>, ptr addrspace(1) %gep.0, i32 1 + + %a = load <3 x float>, ptr addrspace(1) %gep.0 + %b = load <3 x float>, ptr addrspace(1) %gep.1 + + %cmp = fcmp ogt <3 x float> %a, %b + %val = select nnan nsz <3 x i1> %cmp, <3 x float> %a, <3 x float> %b + store <3 x float> %val, ptr addrspace(1) %out + ret void +} + ; FUNC-LABEL: {{^}}test_fmax_legacy_ogt_f32_multi_use: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] diff --git a/llvm/test/CodeGen/AMDGPU/fmin_fmax_legacy.amdgcn.ll b/llvm/test/CodeGen/AMDGPU/fmin_fmax_legacy.amdgcn.ll index 2ac5891..37f077d5 100644 --- a/llvm/test/CodeGen/AMDGPU/fmin_fmax_legacy.amdgcn.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin_fmax_legacy.amdgcn.ll @@ -1,16 +1,12 @@ -; RUN: llc -mtriple=amdgcn < %s | FileCheck -enable-var-scope -check-prefixes=SI-SAFE,GCN %s -; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn < %s | FileCheck -enable-var-scope --check-prefixes=GCN %s +; RUN: llc -mtriple=amdgcn < %s | FileCheck -enable-var-scope -check-prefixes=SI,GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefixes=VI-SAFE,GCN %s -; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope --check-prefixes=GCN,VI-NNAN %s +; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefixes=VI,GCN %s ; GCN-LABEL: {{^}}min_fneg_select_regression_0: ; GCN-NOT: v_mul -; SI: v_max_legacy_f32_e64 [[MIN:v[0-9]+]], -1.0, -v0 - -; VI-SAFE: v_cmp_nle_f32_e32 vcc, 1.0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, -1.0, -v0, vcc +; VI: v_cmp_nle_f32_e32 vcc, 1.0, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, -1.0, -v0, vcc define amdgpu_ps float 
@min_fneg_select_regression_0(float %a, float %b) #0 { %fneg.a = fsub float -0.0, %a %cmp.a = fcmp ult float %a, 1.0 @@ -18,15 +14,23 @@ define amdgpu_ps float @min_fneg_select_regression_0(float %a, float %b) #0 { ret float %min.a } +; GCN-LABEL: {{^}}min_fneg_select_regression_0_fast: +; GCN-NOT: v_mul + +define amdgpu_ps float @min_fneg_select_regression_0_fast(float %a, float %b) #0 { + %fneg.a = fsub float -0.0, %a + %cmp.a = fcmp ult float %a, 1.0 + %min.a = select nnan nsz i1 %cmp.a, float %fneg.a, float -1.0 + ret float %min.a +} + ; GCN-LABEL: {{^}}min_fneg_select_regression_posk_0: ; GCN-NOT: v_mul ; SI: v_max_legacy_f32_e64 [[MIN:v[0-9]+]], 1.0, -v0 -; VI-SAFE: v_cmp_nle_f32_e32 vcc, -1.0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, 1.0, -v0, vcc - -; VI-NNAN: v_max_f32_e64 v{{[0-9]+}}, -v0, 1.0 +; VI: v_cmp_nle_f32_e32 vcc, -1.0, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, 1.0, -v0, vcc define amdgpu_ps float @min_fneg_select_regression_posk_0(float %a, float %b) #0 { %fneg.a = fsub float -0.0, %a %cmp.a = fcmp ult float %a, -1.0 @@ -34,15 +38,24 @@ define amdgpu_ps float @min_fneg_select_regression_posk_0(float %a, float %b) #0 ret float %min.a } -; GCN-LABEL: {{^}}max_fneg_select_regression_0: +; GCN-LABEL: {{^}}min_fneg_select_regression_posk_0_fast: ; GCN-NOT: v_mul -; SI-SAFE: v_min_legacy_f32_e64 [[MIN:v[0-9]+]], -1.0, -v0 +; VI: v_max_f32_e64 v{{[0-9]+}}, -v0, 1.0 +define amdgpu_ps float @min_fneg_select_regression_posk_0_fast(float %a, float %b) #0 { + %fneg.a = fsub float -0.0, %a + %cmp.a = fcmp ult float %a, -1.0 + %min.a = select nnan nsz i1 %cmp.a, float %fneg.a, float 1.0 + ret float %min.a +} + +; GCN-LABEL: {{^}}max_fneg_select_regression_0: +; GCN-NOT: v_mul -; VI-SAFE: v_cmp_nge_f32_e32 vcc, 1.0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, -1.0, -v0, vcc +; SI: v_min_legacy_f32_e64 [[MIN:v[0-9]+]], -1.0, -v0 -; GCN-NONAN: v_min_f32_e64 [[MIN:v[0-9]+]], -v0, -1.0 +; VI: v_cmp_nge_f32_e32 vcc, 1.0, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, -1.0, -v0, vcc define amdgpu_ps float @max_fneg_select_regression_0(float %a) #0 { %fneg.a = fsub float -0.0, %a %cmp.a = fcmp ugt float %a, 1.0 @@ -50,15 +63,24 @@ define amdgpu_ps float @max_fneg_select_regression_0(float %a) #0 { ret float %min.a } -; GCN-LABEL: {{^}}max_fneg_select_regression_posk_0: +; GCN-LABEL: {{^}}max_fneg_select_regression_0_fast: ; GCN-NOT: v_mul -; SI-SAFE: v_min_legacy_f32_e64 [[MIN:v[0-9]+]], 1.0, -v0 +; GCN: v_min_f32_e64 [[MIN:v[0-9]+]], -v0, -1.0 +define amdgpu_ps float @max_fneg_select_regression_0_fast(float %a) #0 { + %fneg.a = fsub float -0.0, %a + %cmp.a = fcmp ugt float %a, 1.0 + %min.a = select nnan nsz i1 %cmp.a, float %fneg.a, float -1.0 + ret float %min.a +} + +; GCN-LABEL: {{^}}max_fneg_select_regression_posk_0: +; GCN-NOT: v_mul -; VI-SAFE: v_cmp_nge_f32_e32 vcc, -1.0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, 1.0, -v0, vcc +; SI: v_min_legacy_f32_e64 [[MIN:v[0-9]+]], 1.0, -v0 -; GCN-NONAN: v_min_f32_e64 [[MIN:v[0-9]+]], -v0, 1.0 +; VI: v_cmp_nge_f32_e32 vcc, -1.0, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, 1.0, -v0, vcc define amdgpu_ps float @max_fneg_select_regression_posk_0(float %a) #0 { %fneg.a = fsub float -0.0, %a %cmp.a = fcmp ugt float %a, -1.0 @@ -66,13 +88,22 @@ define amdgpu_ps float @max_fneg_select_regression_posk_0(float %a) #0 { ret float %min.a } +; GCN-LABEL: {{^}}max_fneg_select_regression_posk_0_fast: +; GCN-NOT: v_mul + +; GCN: v_min_f32_e64 [[MIN:v[0-9]+]], -v0, 1.0 +define amdgpu_ps float @max_fneg_select_regression_posk_0_fast(float %a) #0 { + %fneg.a = fsub float -0.0, %a + 
%cmp.a = fcmp ugt float %a, -1.0 + %min.a = select nnan nsz i1 %cmp.a, float %fneg.a, float 1.0 + ret float %min.a +} + ; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_ugt_a_neg1: ; SI: v_min_legacy_f32_e64 v0, 1.0, -v0 -; VI-SAFE: v_cmp_nge_f32_e32 vcc, -1.0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, 1.0, -v0, vcc - -; VI-NNAN: v_min_f32_e64 v0, -v0, 1.0 +; VI: v_cmp_nge_f32_e32 vcc, -1.0, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, 1.0, -v0, vcc define amdgpu_ps float @select_fneg_a_or_q_cmp_ugt_a_neg1(float %a, float %b) #0 { %fneg.a = fneg float %a %cmp.a = fcmp ugt float %a, -1.0 @@ -80,13 +111,21 @@ define amdgpu_ps float @select_fneg_a_or_q_cmp_ugt_a_neg1(float %a, float %b) #0 ret float %min.a } +; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_ugt_a_neg1_fast: + +; VI: v_min_f32_e64 v0, -v0, 1.0 +define amdgpu_ps float @select_fneg_a_or_q_cmp_ugt_a_neg1_fast(float %a, float %b) #0 { + %fneg.a = fneg float %a + %cmp.a = fcmp ugt float %a, -1.0 + %min.a = select nnan nsz i1 %cmp.a, float %fneg.a, float 1.0 + ret float %min.a +} + ; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_ult_a_neg1: ; SI: v_max_legacy_f32_e64 v0, 1.0, -v0 -; VI-SAFE: v_cmp_nle_f32_e32 vcc, -1.0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, 1.0, -v0, vcc - -; VI-NNAN: v_max_f32_e64 v0, -v0, 1.0 +; VI: v_cmp_nle_f32_e32 vcc, -1.0, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, 1.0, -v0, vcc define amdgpu_ps float @select_fneg_a_or_q_cmp_ult_a_neg1(float %a, float %b) #0 { %fneg.a = fneg float %a %cmp.a = fcmp ult float %a, -1.0 @@ -94,13 +133,21 @@ define amdgpu_ps float @select_fneg_a_or_q_cmp_ult_a_neg1(float %a, float %b) #0 ret float %min.a } +; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_ult_a_neg1_fast: + +; VI: v_max_f32_e64 v0, -v0, 1.0 +define amdgpu_ps float @select_fneg_a_or_q_cmp_ult_a_neg1_fast(float %a, float %b) #0 { + %fneg.a = fneg float %a + %cmp.a = fcmp ult float %a, -1.0 + %min.a = select nnan nsz i1 %cmp.a, float %fneg.a, float 1.0 + ret float %min.a +} + ; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_ogt_a_neg1: ; SI: v_min_legacy_f32_e64 v0, -v0, 1.0 -; VI-SAFE: v_cmp_lt_f32_e32 vcc, -1.0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, 1.0, -v0, vcc - -; VI-NNAN: v_min_f32_e64 v0, -v0, 1.0 +; VI: v_cmp_lt_f32_e32 vcc, -1.0, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, 1.0, -v0, vcc define amdgpu_ps float @select_fneg_a_or_q_cmp_ogt_a_neg1(float %a, float %b) #0 { %fneg.a = fneg float %a %cmp.a = fcmp ogt float %a, -1.0 @@ -108,13 +155,21 @@ define amdgpu_ps float @select_fneg_a_or_q_cmp_ogt_a_neg1(float %a, float %b) #0 ret float %min.a } +; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_ogt_a_neg1_fast: + +; VI: v_min_f32_e64 v0, -v0, 1.0 +define amdgpu_ps float @select_fneg_a_or_q_cmp_ogt_a_neg1_fast(float %a, float %b) #0 { + %fneg.a = fneg float %a + %cmp.a = fcmp ogt float %a, -1.0 + %min.a = select nnan nsz i1 %cmp.a, float %fneg.a, float 1.0 + ret float %min.a +} + ; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_olt_a_neg1: ; SI: v_max_legacy_f32_e64 v0, -v0, 1.0 -; VI-SAFE: v_cmp_gt_f32_e32 vcc, -1.0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, 1.0, -v0, vcc - -; VI-NANN: v_max_f32_e64 v0, -v0, 1.0 +; VI: v_cmp_gt_f32_e32 vcc, -1.0, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, 1.0, -v0, vcc define amdgpu_ps float @select_fneg_a_or_q_cmp_olt_a_neg1(float %a, float %b) #0 { %fneg.a = fneg float %a %cmp.a = fcmp olt float %a, -1.0 @@ -122,17 +177,24 @@ define amdgpu_ps float @select_fneg_a_or_q_cmp_olt_a_neg1(float %a, float %b) #0 ret float %min.a } +; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_olt_a_neg1_fast: + +; VI-NANN: v_max_f32_e64 v0, -v0, 1.0 +define amdgpu_ps float 
@select_fneg_a_or_q_cmp_olt_a_neg1_fast(float %a, float %b) #0 { + %fneg.a = fneg float %a + %cmp.a = fcmp olt float %a, -1.0 + %min.a = select nnan nsz i1 %cmp.a, float %fneg.a, float 1.0 + ret float %min.a +} + ; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_ugt_a_neg8: ; SI: s_mov_b32 [[K:s[0-9]+]], 0x41000000 ; SI-NEXT: v_min_legacy_f32_e64 v0, [[K]], -v0 -; VI-SAFE-DAG: s_mov_b32 [[K0:s[0-9]+]], 0xc1000000 -; VI-SAFE-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x41000000 -; VI-SAFE: v_cmp_nge_f32_e32 vcc, [[K0]], v0 -; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, [[K1]], -v0, vcc - -; VI-NNAN: s_mov_b32 [[K:s[0-9]+]], 0x41000000 -; VI-NNAN-NEXT: v_min_f32_e64 v0, -v0, [[K]] +; VI-DAG: s_mov_b32 [[K0:s[0-9]+]], 0xc1000000 +; VI-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x41000000 +; VI: v_cmp_nge_f32_e32 vcc, [[K0]], v0 +; VI-NEXT: v_cndmask_b32_e64 v0, [[K1]], -v0, vcc define amdgpu_ps float @select_fneg_a_or_q_cmp_ugt_a_neg8(float %a, float %b) #0 { %fneg.a = fneg float %a %cmp.a = fcmp ugt float %a, -8.0 @@ -140,17 +202,25 @@ define amdgpu_ps float @select_fneg_a_or_q_cmp_ugt_a_neg8(float %a, float %b) #0 ret float %min.a } +; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_ugt_a_neg8_fast: + +; VI: s_mov_b32 [[K:s[0-9]+]], 0x41000000 +; VI-NEXT: v_min_f32_e64 v0, -v0, [[K]] +define amdgpu_ps float @select_fneg_a_or_q_cmp_ugt_a_neg8_fast(float %a, float %b) #0 { + %fneg.a = fneg float %a + %cmp.a = fcmp ugt float %a, -8.0 + %min.a = select nnan nsz i1 %cmp.a, float %fneg.a, float 8.0 + ret float %min.a +} + ; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_ult_a_neg8: ; SI: s_mov_b32 [[K:s[0-9]+]], 0x41000000 ; SI-NEXT: v_max_legacy_f32_e64 v0, [[K]], -v0 -; VI-SAFE-DAG: s_mov_b32 [[K0:s[0-9]+]], 0xc1000000 -; VI-SAFE-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x41000000 -; VI-SAFE: v_cmp_nle_f32_e32 vcc, [[K0]], v0 -; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, [[K1]], -v0, vcc - -; VI-NNAN: s_mov_b32 [[K:s[0-9]+]], 0x41000000 -; VI-NNAN-NEXT: v_max_f32_e64 v0, -v0, [[K]] +; VI-DAG: s_mov_b32 [[K0:s[0-9]+]], 0xc1000000 +; VI-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x41000000 +; VI: v_cmp_nle_f32_e32 vcc, [[K0]], v0 +; VI-NEXT: v_cndmask_b32_e64 v0, [[K1]], -v0, vcc define amdgpu_ps float @select_fneg_a_or_q_cmp_ult_a_neg8(float %a, float %b) #0 { %fneg.a = fneg float %a %cmp.a = fcmp ult float %a, -8.0 @@ -158,17 +228,25 @@ define amdgpu_ps float @select_fneg_a_or_q_cmp_ult_a_neg8(float %a, float %b) #0 ret float %min.a } +; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_ult_a_neg8_fast: + +; VI: s_mov_b32 [[K:s[0-9]+]], 0x41000000 +; VI-NEXT: v_max_f32_e64 v0, -v0, [[K]] +define amdgpu_ps float @select_fneg_a_or_q_cmp_ult_a_neg8_fast(float %a, float %b) #0 { + %fneg.a = fneg float %a + %cmp.a = fcmp ult float %a, -8.0 + %min.a = select nnan nsz i1 %cmp.a, float %fneg.a, float 8.0 + ret float %min.a +} + ; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_ogt_a_neg8: ; SI: s_mov_b32 [[K:s[0-9]+]], 0x41000000 ; SI-NEXT: v_min_legacy_f32_e64 v0, -v0, [[K]] -; VI-SAFE-DAG: s_mov_b32 [[K0:s[0-9]+]], 0xc1000000 -; VI-SAFE-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x41000000 -; VI-SAFE: v_cmp_lt_f32_e32 vcc, [[K0]], v0 -; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, [[K1]], -v0, vcc - -; VI-NNAN: s_mov_b32 [[K:s[0-9]+]], 0x41000000 -; VI-NNAN-NEXT: v_min_f32_e64 v0, -v0, [[K]] +; VI-DAG: s_mov_b32 [[K0:s[0-9]+]], 0xc1000000 +; VI-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x41000000 +; VI: v_cmp_lt_f32_e32 vcc, [[K0]], v0 +; VI-NEXT: v_cndmask_b32_e64 v0, [[K1]], -v0, vcc define amdgpu_ps float @select_fneg_a_or_q_cmp_ogt_a_neg8(float %a, float %b) #0 { %fneg.a = fneg float %a %cmp.a = fcmp ogt 
float %a, -8.0 @@ -176,18 +254,26 @@ define amdgpu_ps float @select_fneg_a_or_q_cmp_ogt_a_neg8(float %a, float %b) #0 ret float %min.a } +; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_ogt_a_neg8_fast: + +; VI: s_mov_b32 [[K:s[0-9]+]], 0x41000000 +; VI-NEXT: v_min_f32_e64 v0, -v0, [[K]] +define amdgpu_ps float @select_fneg_a_or_q_cmp_ogt_a_neg8_fast(float %a, float %b) #0 { + %fneg.a = fneg float %a + %cmp.a = fcmp ogt float %a, -8.0 + %min.a = select nnan nsz i1 %cmp.a, float %fneg.a, float 8.0 + ret float %min.a +} + ; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_olt_a_neg8: ; SI: s_mov_b32 [[K:s[0-9]+]], 0x41000000 ; SI-NEXT: v_max_legacy_f32_e64 v0, -v0, [[K]] -; VI-SAFE-DAG: s_mov_b32 [[K0:s[0-9]+]], 0xc1000000 -; VI-SAFE-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x41000000 -; VI-SAFE: v_cmp_gt_f32_e32 vcc, [[K0]], v0 -; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, [[K1]], -v0, vcc - -; VI-NNAN: s_mov_b32 [[K:s[0-9]+]], 0x41000000 -; VI-NNAN-NEXT: v_max_f32_e64 v0, -v0, [[K]] +; VI-DAG: s_mov_b32 [[K0:s[0-9]+]], 0xc1000000 +; VI-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x41000000 +; VI: v_cmp_gt_f32_e32 vcc, [[K0]], v0 +; VI-NEXT: v_cndmask_b32_e64 v0, [[K1]], -v0, vcc define amdgpu_ps float @select_fneg_a_or_q_cmp_olt_a_neg8(float %a, float %b) #0 { %fneg.a = fneg float %a %cmp.a = fcmp olt float %a, -8.0 @@ -195,13 +281,22 @@ define amdgpu_ps float @select_fneg_a_or_q_cmp_olt_a_neg8(float %a, float %b) #0 ret float %min.a } +; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_olt_a_neg8_fast: + +; VI: s_mov_b32 [[K:s[0-9]+]], 0x41000000 +; VI-NEXT: v_max_f32_e64 v0, -v0, [[K]] +define amdgpu_ps float @select_fneg_a_or_q_cmp_olt_a_neg8_fast(float %a, float %b) #0 { + %fneg.a = fneg float %a + %cmp.a = fcmp olt float %a, -8.0 + %min.a = select nnan nsz i1 %cmp.a, float %fneg.a, float 8.0 + ret float %min.a +} + ; GCN-LABEL: {{^}}select_fneg_a_or_neg1_cmp_olt_a_1: ; SI: v_max_legacy_f32_e64 v0, -v0, -1.0 -; VI-SAFE: v_cmp_gt_f32_e32 vcc, 1.0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, -1.0, -v0, vcc - -; VI-NNAN: v_max_f32_e64 v0, -v0, -1.0 +; VI: v_cmp_gt_f32_e32 vcc, 1.0, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, -1.0, -v0, vcc define amdgpu_ps float @select_fneg_a_or_neg1_cmp_olt_a_1(float %a, float %b) #0 { %fneg.a = fneg float %a %cmp.a = fcmp olt float %a, 1.0 @@ -209,15 +304,22 @@ define amdgpu_ps float @select_fneg_a_or_neg1_cmp_olt_a_1(float %a, float %b) #0 ret float %min.a } +; GCN-LABEL: {{^}}select_fneg_a_or_neg1_cmp_olt_a_1_fast: + +; VI: v_max_f32_e64 v0, -v0, -1.0 +define amdgpu_ps float @select_fneg_a_or_neg1_cmp_olt_a_1_fast(float %a, float %b) #0 { + %fneg.a = fneg float %a + %cmp.a = fcmp olt float %a, 1.0 + %min.a = select nnan nsz i1 %cmp.a, float %fneg.a, float -1.0 + ret float %min.a +} + ; GCN-LABEL: {{^}}ult_a_select_fneg_a_b: ; SI: v_cmp_nge_f32_e32 vcc, v0, v1 ; SI-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc -; VI-SAFE: v_cmp_nge_f32_e32 vcc, v0, v1 -; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc - -; VI-NNAN: v_cmp_lt_f32_e32 vcc, v0, v1 -; VI-NNAN-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc +; VI: v_cmp_nge_f32_e32 vcc, v0, v1 +; VI-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc define amdgpu_ps float @ult_a_select_fneg_a_b(float %a, float %b) #0 { %fneg.a = fneg float %a %cmp.a = fcmp ult float %a, %b @@ -225,15 +327,23 @@ define amdgpu_ps float @ult_a_select_fneg_a_b(float %a, float %b) #0 { ret float %min.a } +; GCN-LABEL: {{^}}ult_a_select_fneg_a_b_fast: + +; VI: v_cmp_lt_f32_e32 vcc, v0, v1 +; VI-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc +define amdgpu_ps float @ult_a_select_fneg_a_b_fast(float %a, float %b) 
#0 { + %fneg.a = fneg float %a + %cmp.a = fcmp nnan nsz ult float %a, %b + %min.a = select nnan nsz i1 %cmp.a, float %fneg.a, float %b + ret float %min.a +} + ; GCN-LABEL: {{^}}ugt_a_select_fneg_a_b: ; SI: v_cmp_nle_f32_e32 vcc, v0, v1 ; SI-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc -; VI-SAFE: v_cmp_nle_f32_e32 vcc, v0, v1 -; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc - -; VI-NNAN: v_cmp_gt_f32_e32 vcc, v0, v1 -; VI-NNAN-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc +; VI: v_cmp_nle_f32_e32 vcc, v0, v1 +; VI-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc define amdgpu_ps float @ugt_a_select_fneg_a_b(float %a, float %b) #0 { %fneg.a = fneg float %a %cmp.a = fcmp ugt float %a, %b @@ -241,5 +351,16 @@ define amdgpu_ps float @ugt_a_select_fneg_a_b(float %a, float %b) #0 { ret float %min.a } +; GCN-LABEL: {{^}}ugt_a_select_fneg_a_b_fast: + +; VI: v_cmp_gt_f32_e32 vcc, v0, v1 +; VI-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc +define amdgpu_ps float @ugt_a_select_fneg_a_b_fast(float %a, float %b) #0 { + %fneg.a = fneg float %a + %cmp.a = fcmp nnan nsz ugt float %a, %b + %min.a = select nnan nsz i1 %cmp.a, float %fneg.a, float %b + ret float %min.a +} + attributes #0 = { nounwind } attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll index 34cb0b1..40c2ec0 100644 --- a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll @@ -1,735 +1,760 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-SAFE %s -; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn-- -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-NNAN %s +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s -; RUN: llc -mtriple=amdgcn-- -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefix=VI-SAFE %s -; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn-- -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefixes=VI-NNAN %s +; RUN: llc -mtriple=amdgcn-- -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefix=VI %s -; RUN: llc -mtriple=amdgcn-- < %s | FileCheck -enable-var-scope -check-prefix=SI-SAFE %s -; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn-- < %s | FileCheck -enable-var-scope -check-prefixes=SI-NNAN %s +; RUN: llc -mtriple=amdgcn-- < %s | FileCheck -enable-var-scope -check-prefix=SI %s -; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-SAFE-TRUE16 %s -; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-SAFE-FAKE16 %s -; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-NNAN,GFX11-NNAN-TRUE16 %s -; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-NNAN,GFX11-NNAN-FAKE16 %s +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16 
%s define half @test_fmin_legacy_ule_f16(half %a, half %b) #0 { -; GFX9-SAFE-LABEL: test_fmin_legacy_ule_f16: -; GFX9-SAFE: ; %bb.0: -; GFX9-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v1 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX9-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-NNAN-LABEL: test_fmin_legacy_ule_f16: -; GFX9-NNAN: ; %bb.0: -; GFX9-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NNAN-NEXT: v_min_f16_e32 v0, v0, v1 -; GFX9-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; VI-SAFE-LABEL: test_fmin_legacy_ule_f16: -; VI-SAFE: ; %bb.0: -; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v1 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; VI-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; VI-NNAN-LABEL: test_fmin_legacy_ule_f16: -; VI-NNAN: ; %bb.0: -; VI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NNAN-NEXT: v_min_f16_e32 v0, v0, v1 -; VI-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; SI-SAFE-LABEL: test_fmin_legacy_ule_f16: -; SI-SAFE: ; %bb.0: -; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SAFE-NEXT: v_min_legacy_f32_e32 v0, v1, v0 -; SI-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; SI-NNAN-LABEL: test_fmin_legacy_ule_f16: -; SI-NNAN: ; %bb.0: -; SI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NNAN-NEXT: v_min_f32_e32 v0, v0, v1 -; SI-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SAFE-TRUE16-LABEL: test_fmin_legacy_ule_f16: -; GFX11-SAFE-TRUE16: ; %bb.0: -; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0.l, v1.l -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v0.l, vcc_lo -; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SAFE-FAKE16-LABEL: test_fmin_legacy_ule_f16: -; GFX11-SAFE-FAKE16: ; %bb.0: -; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-NNAN-TRUE16-LABEL: test_fmin_legacy_ule_f16: -; GFX11-NNAN-TRUE16: ; %bb.0: -; GFX11-NNAN-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NNAN-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v1.l -; GFX11-NNAN-TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-NNAN-FAKE16-LABEL: test_fmin_legacy_ule_f16: -; GFX11-NNAN-FAKE16: ; %bb.0: -; GFX11-NNAN-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NNAN-FAKE16-NEXT: v_min_f16_e32 v0, v0, v1 -; GFX11-NNAN-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: test_fmin_legacy_ule_f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: test_fmin_legacy_ule_f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v1 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: test_fmin_legacy_ule_f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 
v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_min_legacy_f32_e32 v0, v1, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: test_fmin_legacy_ule_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0.l, v1.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v0.l, vcc_lo +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: test_fmin_legacy_ule_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ule half %a, %b %val = select i1 %cmp, half %a, half %b ret half %val } +define half @test_fmin_legacy_ule_f16_fast(half %a, half %b) #0 { +; GFX9-LABEL: test_fmin_legacy_ule_f16_fast: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: test_fmin_legacy_ule_f16_fast: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_min_f16_e32 v0, v0, v1 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: test_fmin_legacy_ule_f16_fast: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_min_f32_e32 v0, v0, v1 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: test_fmin_legacy_ule_f16_fast: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: test_fmin_legacy_ule_f16_fast: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp ule half %a, %b + %val = select nnan nsz i1 %cmp, half %a, half %b + ret half %val +} + define <2 x half> @test_fmin_legacy_ule_v2f16(<2 x half> %a, <2 x half> %b) #0 { -; GFX9-SAFE-LABEL: test_fmin_legacy_ule_v2f16: -; GFX9-SAFE: ; %bb.0: -; GFX9-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v3, v2 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v1 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX9-SAFE-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-SAFE-NEXT: v_perm_b32 v0, v2, v0, s4 -; GFX9-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-NNAN-LABEL: test_fmin_legacy_ule_v2f16: -; GFX9-NNAN: ; %bb.0: -; GFX9-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NNAN-NEXT: v_pk_min_f16 v0, v0, v1 -; GFX9-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; VI-SAFE-LABEL: test_fmin_legacy_ule_v2f16: -; VI-SAFE: ; %bb.0: -; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v3, v2 -; VI-SAFE-NEXT: v_cndmask_b32_sdwa v2, v2, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v1 -; VI-SAFE-NEXT: 
v_cndmask_b32_e32 v0, v1, v0, vcc -; VI-SAFE-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; VI-NNAN-LABEL: test_fmin_legacy_ule_v2f16: -; VI-NNAN: ; %bb.0: -; VI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NNAN-NEXT: v_min_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NNAN-NEXT: v_min_f16_e32 v0, v0, v1 -; VI-NNAN-NEXT: v_or_b32_e32 v0, v0, v2 -; VI-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; SI-SAFE-LABEL: test_fmin_legacy_ule_v2f16: -; SI-SAFE: ; %bb.0: -; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-SAFE-NEXT: v_min_legacy_f32_e32 v0, v2, v0 -; SI-SAFE-NEXT: v_min_legacy_f32_e32 v1, v3, v1 -; SI-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; SI-NNAN-LABEL: test_fmin_legacy_ule_v2f16: -; SI-NNAN: ; %bb.0: -; SI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NNAN-NEXT: v_min_f32_e32 v0, v0, v2 -; SI-NNAN-NEXT: v_min_f32_e32 v1, v1, v3 -; SI-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SAFE-TRUE16-LABEL: test_fmin_legacy_ule_v2f16: -; GFX11-SAFE-TRUE16: ; %bb.0: -; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0.h, v1.h -; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e64 s0, v0.l, v1.l -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.h, v0.h, vcc_lo -; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v0.l, s0 -; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SAFE-FAKE16-LABEL: test_fmin_legacy_ule_v2f16: -; GFX11-SAFE-FAKE16: ; %bb.0: -; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v3, v2 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 -; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-NNAN-LABEL: test_fmin_legacy_ule_v2f16: -; GFX11-NNAN: ; %bb.0: -; GFX11-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NNAN-NEXT: v_pk_min_f16 v0, v0, v1 -; GFX11-NNAN-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: test_fmin_legacy_ule_v2f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v3, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v1 +; GFX9-NEXT: 
v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: test_fmin_legacy_ule_v2f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v3, v2 +; VI-NEXT: v_cndmask_b32_sdwa v2, v2, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v1 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: test_fmin_legacy_ule_v2f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_min_legacy_f32_e32 v0, v2, v0 +; SI-NEXT: v_min_legacy_f32_e32 v1, v3, v1 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: test_fmin_legacy_ule_v2f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0.h, v1.h +; GFX11-TRUE16-NEXT: v_cmp_ngt_f16_e64 s0, v0.l, v1.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.h, v0.h, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v0.l, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: test_fmin_legacy_ule_v2f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v3, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ule <2 x half> %a, %b %val = select <2 x i1> %cmp, <2 x half> %a, <2 x half> %b ret <2 x half> %val } +define <2 x half> @test_fmin_legacy_ule_v2f16_fast(<2 x half> %a, <2 x half> %b) #0 { +; GFX9-LABEL: test_fmin_legacy_ule_v2f16_fast: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_min_f16 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: test_fmin_legacy_ule_v2f16_fast: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_min_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_min_f16_e32 v0, v0, v1 +; VI-NEXT: v_or_b32_e32 v0, v0, v2 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: test_fmin_legacy_ule_v2f16_fast: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_min_f32_e32 v0, 
v0, v2 +; SI-NEXT: v_min_f32_e32 v1, v1, v3 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_fmin_legacy_ule_v2f16_fast: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_min_f16 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp ule <2 x half> %a, %b + %val = select nnan nsz <2 x i1> %cmp, <2 x half> %a, <2 x half> %b + ret <2 x half> %val +} + define <3 x half> @test_fmin_legacy_ule_v3f16(<3 x half> %a, <3 x half> %b) #0 { -; GFX9-SAFE-LABEL: test_fmin_legacy_ule_v3f16: -; GFX9-SAFE: ; %bb.0: -; GFX9-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v5, v4 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v1, v3 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v2 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GFX9-SAFE-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-SAFE-NEXT: v_perm_b32 v0, v4, v0, s4 -; GFX9-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-NNAN-LABEL: test_fmin_legacy_ule_v3f16: -; GFX9-NNAN: ; %bb.0: -; GFX9-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NNAN-NEXT: v_pk_min_f16 v1, v1, v3 -; GFX9-NNAN-NEXT: v_pk_min_f16 v0, v0, v2 -; GFX9-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; VI-SAFE-LABEL: test_fmin_legacy_ule_v3f16: -; VI-SAFE: ; %bb.0: -; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v5, v4 -; VI-SAFE-NEXT: v_cndmask_b32_sdwa v4, v4, v5, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v1, v3 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v2 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; VI-SAFE-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; VI-NNAN-LABEL: test_fmin_legacy_ule_v3f16: -; VI-NNAN: ; %bb.0: -; VI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NNAN-NEXT: v_min_f16_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NNAN-NEXT: v_min_f16_e32 v0, v0, v2 -; VI-NNAN-NEXT: v_min_f16_e32 v1, v1, v3 -; VI-NNAN-NEXT: v_or_b32_e32 v0, v0, v4 -; VI-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; SI-SAFE-LABEL: test_fmin_legacy_ule_v3f16: -; SI-SAFE: ; %bb.0: -; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-SAFE-NEXT: v_min_legacy_f32_e32 v0, v3, v0 -; SI-SAFE-NEXT: v_min_legacy_f32_e32 v1, v4, v1 -; SI-SAFE-NEXT: v_min_legacy_f32_e32 v2, v5, v2 -; SI-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; SI-NNAN-LABEL: test_fmin_legacy_ule_v3f16: -; SI-NNAN: ; %bb.0: -; SI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NNAN-NEXT: 
v_cvt_f16_f32_e32 v2, v2 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NNAN-NEXT: v_min_f32_e32 v0, v0, v3 -; SI-NNAN-NEXT: v_min_f32_e32 v1, v1, v4 -; SI-NNAN-NEXT: v_min_f32_e32 v2, v2, v5 -; SI-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SAFE-TRUE16-LABEL: test_fmin_legacy_ule_v3f16: -; GFX11-SAFE-TRUE16: ; %bb.0: -; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0.h, v2.h -; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e64 s0, v0.l, v2.l -; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e64 s1, v1.l, v3.l -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.h, v0.h, vcc_lo -; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, s0 -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.l, v1.l, s1 -; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SAFE-FAKE16-LABEL: test_fmin_legacy_ule_v3f16: -; GFX11-SAFE-FAKE16: ; %bb.0: -; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v2 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v5, v4 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1, v3 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 -; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-NNAN-LABEL: test_fmin_legacy_ule_v3f16: -; GFX11-NNAN: ; %bb.0: -; GFX11-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NNAN-NEXT: v_pk_min_f16 v0, v0, v2 -; GFX11-NNAN-NEXT: v_pk_min_f16 v1, v1, v3 -; GFX11-NNAN-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: test_fmin_legacy_ule_v3f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v5, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v4, v0, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: test_fmin_legacy_ule_v3f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v5, v4 +; VI-NEXT: v_cndmask_b32_sdwa v4, v4, v5, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v1, v3 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v2 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; VI-NEXT: v_or_b32_sdwa v0, v0, v4 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: test_fmin_legacy_ule_v3f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_min_legacy_f32_e32 v0, v3, v0 +; SI-NEXT: v_min_legacy_f32_e32 v1, v4, v1 +; SI-NEXT: v_min_legacy_f32_e32 v2, v5, v2 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: test_fmin_legacy_ule_v3f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0.h, v2.h +; GFX11-TRUE16-NEXT: v_cmp_ngt_f16_e64 s0, v0.l, v2.l +; GFX11-TRUE16-NEXT: v_cmp_ngt_f16_e64 s1, v1.l, v3.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.h, v0.h, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.l, v1.l, s1 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: test_fmin_legacy_ule_v3f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX11-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v5, v4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1, v3 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ule <3 x half> %a, %b %val = select <3 x i1> %cmp, <3 x half> %a, <3 x half> %b ret <3 x half> %val } +define <3 x half> @test_fmin_legacy_ule_v3f16_fast(<3 x half> %a, <3 x half> %b) #0 { +; GFX9-LABEL: test_fmin_legacy_ule_v3f16_fast: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX9-NEXT: v_pk_min_f16 v1, v1, v3 +; GFX9-NEXT: v_pk_min_f16 v0, v0, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: test_fmin_legacy_ule_v3f16_fast: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_min_f16_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_min_f16_e32 v0, v0, v2 +; VI-NEXT: v_min_f16_e32 v1, v1, v3 +; VI-NEXT: v_or_b32_e32 v0, v0, v4 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: test_fmin_legacy_ule_v3f16_fast: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; 
SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_min_f32_e32 v0, v0, v3 +; SI-NEXT: v_min_f32_e32 v1, v1, v4 +; SI-NEXT: v_min_f32_e32 v2, v2, v5 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_fmin_legacy_ule_v3f16_fast: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX11-NEXT: v_pk_min_f16 v0, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_pk_min_f16 v1, v1, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp ule <3 x half> %a, %b + %val = select nnan nsz <3 x i1> %cmp, <3 x half> %a, <3 x half> %b + ret <3 x half> %val +} + define <4 x half> @test_fmin_legacy_ule_v4f16(<4 x half> %a, <4 x half> %b) #0 { -; GFX9-SAFE-LABEL: test_fmin_legacy_ule_v4f16: -; GFX9-SAFE: ; %bb.0: -; GFX9-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v7, v6 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc -; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v5, v4 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v1, v3 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v2 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GFX9-SAFE-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-SAFE-NEXT: v_perm_b32 v0, v4, v0, s4 -; GFX9-SAFE-NEXT: v_perm_b32 v1, v6, v1, s4 -; GFX9-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-NNAN-LABEL: test_fmin_legacy_ule_v4f16: -; GFX9-NNAN: ; %bb.0: -; GFX9-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NNAN-NEXT: v_pk_min_f16 v0, v0, v2 -; GFX9-NNAN-NEXT: v_pk_min_f16 v1, v1, v3 -; GFX9-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; VI-SAFE-LABEL: test_fmin_legacy_ule_v4f16: -; VI-SAFE: ; %bb.0: -; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v7, v6 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc -; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v5, v4 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v1, v3 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v2 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; VI-SAFE-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; VI-SAFE-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-SAFE-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; VI-SAFE-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; VI-NNAN-LABEL: test_fmin_legacy_ule_v4f16: -; VI-NNAN: ; %bb.0: -; VI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NNAN-NEXT: v_min_f16_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NNAN-NEXT: v_min_f16_sdwa v5, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NNAN-NEXT: v_min_f16_e32 v1, v1, v3 -; VI-NNAN-NEXT: v_min_f16_e32 v0, v0, v2 -; VI-NNAN-NEXT: v_or_b32_e32 v0, v0, v5 -; VI-NNAN-NEXT: v_or_b32_e32 v1, 
v1, v4 -; VI-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; SI-SAFE-LABEL: test_fmin_legacy_ule_v4f16: -; SI-SAFE: ; %bb.0: -; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-SAFE-NEXT: v_min_legacy_f32_e32 v0, v4, v0 -; SI-SAFE-NEXT: v_min_legacy_f32_e32 v1, v5, v1 -; SI-SAFE-NEXT: v_min_legacy_f32_e32 v2, v6, v2 -; SI-SAFE-NEXT: v_min_legacy_f32_e32 v3, v7, v3 -; SI-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; SI-NNAN-LABEL: test_fmin_legacy_ule_v4f16: -; SI-NNAN: ; %bb.0: -; SI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NNAN-NEXT: v_min_f32_e32 v0, v0, v4 -; SI-NNAN-NEXT: v_min_f32_e32 v1, v1, v5 -; SI-NNAN-NEXT: v_min_f32_e32 v2, v2, v6 -; SI-NNAN-NEXT: v_min_f32_e32 v3, v3, v7 -; SI-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SAFE-TRUE16-LABEL: test_fmin_legacy_ule_v4f16: -; GFX11-SAFE-TRUE16: ; %bb.0: -; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1.h, v3.h -; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e64 s0, v0.h, v2.h -; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e64 s1, v0.l, v2.l -; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e64 s2, v1.l, v3.l -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v1.h, v3.h, v1.h, vcc_lo -; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.h, v0.h, s0 -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, s1 -; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.l, v1.l, s2 -; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SAFE-FAKE16-LABEL: test_fmin_legacy_ule_v4f16: -; GFX11-SAFE-FAKE16: ; %bb.0: -; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v5, v4 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo -; 
GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v7, v6 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v2 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1, v3 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 -; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v1, v4, v1, 0x5040100 -; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-NNAN-LABEL: test_fmin_legacy_ule_v4f16: -; GFX11-NNAN: ; %bb.0: -; GFX11-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NNAN-NEXT: v_pk_min_f16 v0, v0, v2 -; GFX11-NNAN-NEXT: v_pk_min_f16 v1, v1, v3 -; GFX11-NNAN-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: test_fmin_legacy_ule_v4f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v7, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v5, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v4, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v6, v1, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: test_fmin_legacy_ule_v4f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v7, v6 +; VI-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc +; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v5, v4 +; VI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v1, v3 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v2 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: test_fmin_legacy_ule_v4f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_min_legacy_f32_e32 v0, v4, v0 +; SI-NEXT: v_min_legacy_f32_e32 v1, v5, v1 +; SI-NEXT: v_min_legacy_f32_e32 v2, v6, v2 +; SI-NEXT: v_min_legacy_f32_e32 v3, v7, v3 +; SI-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: test_fmin_legacy_ule_v4f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1.h, v3.h +; GFX11-TRUE16-NEXT: v_cmp_ngt_f16_e64 s0, v0.h, v2.h +; GFX11-TRUE16-NEXT: v_cmp_ngt_f16_e64 s1, v0.l, v2.l +; GFX11-TRUE16-NEXT: v_cmp_ngt_f16_e64 s2, v1.l, v3.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v3.h, v1.h, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.h, v0.h, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, s1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.l, v1.l, s2 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: test_fmin_legacy_ule_v4f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v5, v4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v7, v6 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1, v3 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v4, v1, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ule <4 x half> %a, %b %val = select <4 x i1> %cmp, <4 x half> %a, <4 x half> %b ret <4 x half> %val } +define <4 x half> @test_fmin_legacy_ule_v4f16_fast(<4 x half> %a, <4 x half> %b) #0 { +; GFX9-LABEL: test_fmin_legacy_ule_v4f16_fast: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_min_f16 v0, v0, v2 +; GFX9-NEXT: v_pk_min_f16 v1, v1, v3 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: test_fmin_legacy_ule_v4f16_fast: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_min_f16_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_min_f16_sdwa v5, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_min_f16_e32 v1, v1, v3 +; VI-NEXT: v_min_f16_e32 v0, v0, v2 +; VI-NEXT: v_or_b32_e32 v0, v0, v5 +; VI-NEXT: v_or_b32_e32 v1, v1, v4 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: test_fmin_legacy_ule_v4f16_fast: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: 
v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_min_f32_e32 v0, v0, v4 +; SI-NEXT: v_min_f32_e32 v1, v1, v5 +; SI-NEXT: v_min_f32_e32 v2, v2, v6 +; SI-NEXT: v_min_f32_e32 v3, v3, v7 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_fmin_legacy_ule_v4f16_fast: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_min_f16 v0, v0, v2 +; GFX11-NEXT: v_pk_min_f16 v1, v1, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp ule <4 x half> %a, %b + %val = select nnan nsz <4 x i1> %cmp, <4 x half> %a, <4 x half> %b + ret <4 x half> %val +} + define <8 x half> @test_fmin_legacy_ule_v8f16(<8 x half> %a, <8 x half> %b) #0 { -; GFX9-SAFE-LABEL: test_fmin_legacy_ule_v8f16: -; GFX9-SAFE: ; %bb.0: -; GFX9-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v14, 16, v7 -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v12, 16, v6 -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v13, 16, v2 -; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v15, v14 -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v10, 16, v5 -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v14, v14, v15, vcc -; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v13, v12 -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v9, 16, v0 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v12, v12, v13, vcc -; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v11, v10 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc -; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v9, v8 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc -; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v3, v7 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v2, v6 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v1, v5 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v4 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX9-SAFE-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-SAFE-NEXT: v_perm_b32 v0, v8, v0, s4 -; GFX9-SAFE-NEXT: v_perm_b32 v1, v10, v1, s4 -; GFX9-SAFE-NEXT: v_perm_b32 v2, v12, v2, s4 -; GFX9-SAFE-NEXT: v_perm_b32 v3, v14, v3, s4 -; GFX9-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-NNAN-LABEL: test_fmin_legacy_ule_v8f16: -; GFX9-NNAN: ; %bb.0: -; GFX9-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NNAN-NEXT: v_pk_min_f16 v0, v0, v4 -; GFX9-NNAN-NEXT: v_pk_min_f16 v1, v1, v5 -; GFX9-NNAN-NEXT: v_pk_min_f16 v2, v2, v6 -; GFX9-NNAN-NEXT: v_pk_min_f16 v3, v3, v7 -; GFX9-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; VI-SAFE-LABEL: test_fmin_legacy_ule_v8f16: -; VI-SAFE: ; %bb.0: -; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v14, 16, v7 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v12, 16, v6 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v13, 16, v2 -; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v15, v14 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v10, 16, v5 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v14, v14, v15, vcc -; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v13, v12 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v9, 16, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v12, v12, v13, vcc -; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v11, v10 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc -; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v9, v8 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v8, 
v8, v9, vcc -; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v3, v7 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v2, v6 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v1, v5 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v4 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; VI-SAFE-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; VI-SAFE-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-SAFE-NEXT: v_lshlrev_b32_e32 v4, 16, v10 -; VI-SAFE-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-SAFE-NEXT: v_lshlrev_b32_e32 v4, 16, v12 -; VI-SAFE-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-SAFE-NEXT: v_lshlrev_b32_e32 v4, 16, v14 -; VI-SAFE-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; VI-NNAN-LABEL: test_fmin_legacy_ule_v8f16: -; VI-NNAN: ; %bb.0: -; VI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NNAN-NEXT: v_min_f16_sdwa v8, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NNAN-NEXT: v_min_f16_sdwa v9, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NNAN-NEXT: v_min_f16_sdwa v10, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NNAN-NEXT: v_min_f16_sdwa v11, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NNAN-NEXT: v_min_f16_e32 v3, v3, v7 -; VI-NNAN-NEXT: v_min_f16_e32 v2, v2, v6 -; VI-NNAN-NEXT: v_min_f16_e32 v1, v1, v5 -; VI-NNAN-NEXT: v_min_f16_e32 v0, v0, v4 -; VI-NNAN-NEXT: v_or_b32_e32 v0, v0, v11 -; VI-NNAN-NEXT: v_or_b32_e32 v1, v1, v10 -; VI-NNAN-NEXT: v_or_b32_e32 v2, v2, v9 -; VI-NNAN-NEXT: v_or_b32_e32 v3, v3, v8 -; VI-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; SI-SAFE-LABEL: test_fmin_legacy_ule_v8f16: -; SI-SAFE: ; %bb.0: -; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-SAFE-NEXT: v_min_legacy_f32_e32 v0, v8, 
v0 -; SI-SAFE-NEXT: v_min_legacy_f32_e32 v1, v9, v1 -; SI-SAFE-NEXT: v_min_legacy_f32_e32 v2, v10, v2 -; SI-SAFE-NEXT: v_min_legacy_f32_e32 v3, v11, v3 -; SI-SAFE-NEXT: v_min_legacy_f32_e32 v4, v12, v4 -; SI-SAFE-NEXT: v_min_legacy_f32_e32 v5, v13, v5 -; SI-SAFE-NEXT: v_min_legacy_f32_e32 v6, v14, v6 -; SI-SAFE-NEXT: v_min_legacy_f32_e32 v7, v15, v7 -; SI-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; SI-NNAN-LABEL: test_fmin_legacy_ule_v8f16: -; SI-NNAN: ; %bb.0: -; SI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NNAN-NEXT: v_min_f32_e32 v0, v0, v8 -; SI-NNAN-NEXT: v_min_f32_e32 v1, v1, v9 -; SI-NNAN-NEXT: v_min_f32_e32 v2, v2, v10 -; SI-NNAN-NEXT: v_min_f32_e32 v3, v3, v11 -; SI-NNAN-NEXT: v_min_f32_e32 v4, v4, v12 -; SI-NNAN-NEXT: v_min_f32_e32 v5, v5, v13 -; SI-NNAN-NEXT: v_min_f32_e32 v6, v6, v14 -; SI-NNAN-NEXT: v_min_f32_e32 v7, v7, v15 -; SI-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SAFE-TRUE16-LABEL: test_fmin_legacy_ule_v8f16: -; GFX11-SAFE-TRUE16: ; %bb.0: -; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0.h, v4.h -; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e64 s0, v1.h, v5.h -; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e64 s1, v2.h, v6.h -; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e64 s2, v3.h, v7.h -; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e64 s3, v0.l, v4.l -; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e64 s4, v1.l, v5.l -; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e64 s5, v2.l, v6.l -; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e64 s6, v3.l, v7.l -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v3.h, v7.h, v3.h, s2 -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v2.h, v6.h, v2.h, s1 -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v1.h, v5.h, v1.h, s0 -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.h, v0.h, vcc_lo -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v4.l, v0.l, s3 -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v1.l, s4 -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v2.l, v6.l, v2.l, s5 -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v3.l, v7.l, v3.l, s6 -; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SAFE-FAKE16-LABEL: test_fmin_legacy_ule_v8f16: -; GFX11-SAFE-FAKE16: ; %bb.0: -; 
GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v7 -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v6 -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v2 -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v5 -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v1 -; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v11, v10 -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v0 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v13, v12 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v15, v14 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v12, v14, v15, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v9, v8 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v2, v6 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v4 -; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v2, v11, v2, 0x5040100 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1, v5 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v3, v7 -; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v1, v12, v1, 0x5040100 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v8, v0, 0x5040100 -; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v3, v10, v3, 0x5040100 -; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-NNAN-LABEL: test_fmin_legacy_ule_v8f16: -; GFX11-NNAN: ; %bb.0: -; GFX11-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NNAN-NEXT: v_pk_min_f16 v0, v0, v4 -; GFX11-NNAN-NEXT: v_pk_min_f16 v1, v1, v5 -; GFX11-NNAN-NEXT: v_pk_min_f16 v2, v2, v6 -; GFX11-NNAN-NEXT: v_pk_min_f16 v3, v3, v7 -; GFX11-NNAN-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: test_fmin_legacy_ule_v8f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v2 +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v15, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v14, v15, vcc +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v13, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v12, v13, vcc +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v11, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v9, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v3, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v2, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v1, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; 
GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v8, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v10, v1, s4 +; GFX9-NEXT: v_perm_b32 v2, v12, v2, s4 +; GFX9-NEXT: v_perm_b32 v3, v14, v3, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: test_fmin_legacy_ule_v8f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v2 +; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v15, v14 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; VI-NEXT: v_cndmask_b32_e32 v14, v14, v15, vcc +; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v13, v12 +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; VI-NEXT: v_cndmask_b32_e32 v12, v12, v13, vcc +; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v11, v10 +; VI-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc +; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v9, v8 +; VI-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc +; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v3, v7 +; VI-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v2, v6 +; VI-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v1, v5 +; VI-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v4 +; VI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; VI-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v10 +; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v12 +; VI-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v14 +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: test_fmin_legacy_ule_v8f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_min_legacy_f32_e32 v0, v8, v0 +; SI-NEXT: v_min_legacy_f32_e32 v1, v9, v1 +; SI-NEXT: v_min_legacy_f32_e32 v2, v10, v2 +; SI-NEXT: v_min_legacy_f32_e32 v3, v11, v3 +; SI-NEXT: v_min_legacy_f32_e32 v4, v12, 
v4 +; SI-NEXT: v_min_legacy_f32_e32 v5, v13, v5 +; SI-NEXT: v_min_legacy_f32_e32 v6, v14, v6 +; SI-NEXT: v_min_legacy_f32_e32 v7, v15, v7 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: test_fmin_legacy_ule_v8f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0.h, v4.h +; GFX11-TRUE16-NEXT: v_cmp_ngt_f16_e64 s0, v1.h, v5.h +; GFX11-TRUE16-NEXT: v_cmp_ngt_f16_e64 s1, v2.h, v6.h +; GFX11-TRUE16-NEXT: v_cmp_ngt_f16_e64 s2, v3.h, v7.h +; GFX11-TRUE16-NEXT: v_cmp_ngt_f16_e64 s3, v0.l, v4.l +; GFX11-TRUE16-NEXT: v_cmp_ngt_f16_e64 s4, v1.l, v5.l +; GFX11-TRUE16-NEXT: v_cmp_ngt_f16_e64 s5, v2.l, v6.l +; GFX11-TRUE16-NEXT: v_cmp_ngt_f16_e64 s6, v3.l, v7.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v7.h, v3.h, s2 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v6.h, v2.h, s1 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v5.h, v1.h, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.h, v0.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v4.l, v0.l, s3 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v1.l, s4 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v6.l, v2.l, s5 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v7.l, v3.l, s6 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: test_fmin_legacy_ule_v8f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; GFX11-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v11, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v13, v12 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v15, v14 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v14, v15, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v9, v8 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v2, v6 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v11, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1, v5 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v3, v7 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v12, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v8, v0, 0x5040100 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v10, v3, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ule <8 x half> %a, %b %val = select <8 x i1> %cmp, <8 x half> %a, <8 x half> %b ret <8 x half> %val } +define <8 x half> @test_fmin_legacy_ule_v8f16_fast(<8 x half> %a, <8 x half> %b) #0 { +; GFX9-LABEL: test_fmin_legacy_ule_v8f16_fast: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_min_f16 v0, v0, v4 +; GFX9-NEXT: v_pk_min_f16 v1, v1, v5 
+; GFX9-NEXT: v_pk_min_f16 v2, v2, v6 +; GFX9-NEXT: v_pk_min_f16 v3, v3, v7 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: test_fmin_legacy_ule_v8f16_fast: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_min_f16_sdwa v8, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_min_f16_sdwa v9, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_min_f16_sdwa v10, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_min_f16_sdwa v11, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_min_f16_e32 v3, v3, v7 +; VI-NEXT: v_min_f16_e32 v2, v2, v6 +; VI-NEXT: v_min_f16_e32 v1, v1, v5 +; VI-NEXT: v_min_f16_e32 v0, v0, v4 +; VI-NEXT: v_or_b32_e32 v0, v0, v11 +; VI-NEXT: v_or_b32_e32 v1, v1, v10 +; VI-NEXT: v_or_b32_e32 v2, v2, v9 +; VI-NEXT: v_or_b32_e32 v3, v3, v8 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: test_fmin_legacy_ule_v8f16_fast: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_min_f32_e32 v0, v0, v8 +; SI-NEXT: v_min_f32_e32 v1, v1, v9 +; SI-NEXT: v_min_f32_e32 v2, v2, v10 +; SI-NEXT: v_min_f32_e32 v3, v3, v11 +; SI-NEXT: v_min_f32_e32 v4, v4, v12 +; SI-NEXT: v_min_f32_e32 v5, v5, v13 +; SI-NEXT: v_min_f32_e32 v6, v6, v14 +; SI-NEXT: v_min_f32_e32 v7, v7, v15 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_fmin_legacy_ule_v8f16_fast: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_min_f16 v0, v0, v4 +; GFX11-NEXT: v_pk_min_f16 v1, v1, v5 +; GFX11-NEXT: v_pk_min_f16 v2, v2, v6 +; GFX11-NEXT: v_pk_min_f16 v3, v3, v7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp ule <8 x half> %a, %b + %val = select nnan nsz <8 x i1> %cmp, <8 x half> %a, <8 x half> %b + ret <8 x half> %val +} + attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/fmin_legacy.ll b/llvm/test/CodeGen/AMDGPU/fmin_legacy.ll index ec4dd85..defcffa 100644 --- a/llvm/test/CodeGen/AMDGPU/fmin_legacy.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin_legacy.ll @@ -1,8 +1,6 @@ -; RUN: llc -mtriple=amdgcn < %s | FileCheck -enable-var-scope -check-prefixes=SI-SAFE,GCN,FUNC %s -; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn < %s | 
FileCheck -enable-var-scope -check-prefixes=SI-NONAN,GCN-NONAN,GCN,FUNC %s +; RUN: llc -mtriple=amdgcn < %s | FileCheck -enable-var-scope -check-prefixes=SI,GCN,FUNC %s -; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefixes=VI-SAFE,GCN,FUNC %s -; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefixes=VI-NONAN,GCN-NONAN,GCN,FUNC %s +; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefixes=VI,GCN,FUNC %s ; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck -enable-var-scope --check-prefixes=EG,FUNC %s @@ -14,13 +12,9 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1 ; FUNC-LABEL: {{^}}s_test_fmin_legacy_subreg_inputs_f32: ; EG: MIN * -; SI-SAFE: v_min_legacy_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} +; SI: v_min_legacy_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} -; SI-NONAN: v_min_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} - -; VI-SAFE: v_cmp_nlt_f32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} - -; VI-NONAN: v_min_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} +; VI: v_cmp_nlt_f32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @s_test_fmin_legacy_subreg_inputs_f32(ptr addrspace(1) %out, <4 x float> %reg0) #0 { %r0 = extractelement <4 x float> %reg0, i32 0 %r1 = extractelement <4 x float> %reg0, i32 1 @@ -30,22 +24,32 @@ define amdgpu_kernel void @s_test_fmin_legacy_subreg_inputs_f32(ptr addrspace(1) ret void } -; FUNC-LABEL: {{^}}s_test_fmin_legacy_ule_f32: -; GCN-DAG: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}, {{0x9|0x24}} +; FUNC-LABEL: {{^}}s_test_fmin_legacy_subreg_inputs_f32_fast: -; SI-SAFE: v_mov_b32_e32 [[VA:v[0-9]+]], s[[#LOAD + 2]] +; SI: v_min_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} -; GCN-NONAN: v_mov_b32_e32 [[VB:v[0-9]+]], s[[#LOAD + 3]] +; VI: v_min_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} +define amdgpu_kernel void @s_test_fmin_legacy_subreg_inputs_f32_fast(ptr addrspace(1) %out, <4 x float> %reg0) #0 { + %r0 = extractelement <4 x float> %reg0, i32 0 + %r1 = extractelement <4 x float> %reg0, i32 1 + %r2 = fcmp nnan nsz uge float %r0, %r1 + %r3 = select nnan nsz i1 %r2, float %r1, float %r0 + store float %r3, ptr addrspace(1) %out + ret void +} -; VI-SAFE: v_mov_b32_e32 [[VB:v[0-9]+]], s[[#LOAD + 3]] +; FUNC-LABEL: {{^}}s_test_fmin_legacy_ule_f32: +; GCN-DAG: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}, {{0x9|0x24}} + +; SI: v_mov_b32_e32 [[VA:v[0-9]+]], s[[#LOAD + 2]] -; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, s[[#LOAD + 3]], [[VA]] +; VI: v_mov_b32_e32 [[VB:v[0-9]+]], s[[#LOAD + 3]] -; VI-SAFE: v_mov_b32_e32 [[VA:v[0-9]+]], s[[#LOAD + 2]] -; VI-SAFE: v_cmp_ngt_f32_e32 vcc, s[[#LOAD + 2]], [[VB]] -; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[VB]], [[VA]] +; SI: v_min_legacy_f32_e32 {{v[0-9]+}}, s[[#LOAD + 3]], [[VA]] -; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, s[[#LOAD + 2]], [[VB]] +; VI: v_mov_b32_e32 [[VA:v[0-9]+]], s[[#LOAD + 2]] +; VI: v_cmp_ngt_f32_e32 vcc, s[[#LOAD + 2]], [[VB]] +; VI: v_cndmask_b32_e32 v{{[0-9]+}}, [[VB]], [[VA]] define amdgpu_kernel void @s_test_fmin_legacy_ule_f32(ptr addrspace(1) %out, float %a, float %b) #0 { %cmp = fcmp ule float %a, %b %val = select i1 %cmp, float %a, float %b @@ -53,6 +57,19 @@ define amdgpu_kernel void @s_test_fmin_legacy_ule_f32(ptr addrspace(1) %out, flo ret void } +; FUNC-LABEL: {{^}}s_test_fmin_legacy_ule_f32_fast: +; GCN-DAG: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}, {{0x9|0x24}} + +; 
GCN: v_mov_b32_e32 [[VB:v[0-9]+]], s[[#LOAD + 3]] + +; GCN: v_min_f32_e32 {{v[0-9]+}}, s[[#LOAD + 2]], [[VB]] +define amdgpu_kernel void @s_test_fmin_legacy_ule_f32_fast(ptr addrspace(1) %out, float %a, float %b) #0 { + %cmp = fcmp ule float %a, %b + %val = select nnan nsz i1 %cmp, float %a, float %b + store float %val, ptr addrspace(1) %out, align 4 + ret void +} + ; Nsz also needed ; FIXME: Should separate tests ; GCN-LABEL: {{^}}s_test_fmin_legacy_ule_f32_nnan_src: @@ -61,12 +78,10 @@ define amdgpu_kernel void @s_test_fmin_legacy_ule_f32(ptr addrspace(1) %out, flo ; GCN-DAG: v_add_f32_e64 [[ADD_A:v[0-9]+]], s[[#LOAD + 2]], 1.0 ; GCN-DAG: v_add_f32_e64 [[ADD_B:v[0-9]+]], s[[#LOAD + 3]], 2.0 -; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[ADD_B]], [[ADD_A]] - -; VI-SAFE: v_cmp_ngt_f32_e32 vcc, [[ADD_A]], [[ADD_B]] -; VI-SAFE: v_cndmask_b32_e32 {{v[0-9]+}}, [[ADD_B]], [[ADD_A]], vcc +; SI: v_min_legacy_f32_e32 {{v[0-9]+}}, [[ADD_B]], [[ADD_A]] -; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[ADD_A]], [[ADD_B]] +; VI: v_cmp_ngt_f32_e32 vcc, [[ADD_A]], [[ADD_B]] +; VI: v_cndmask_b32_e32 {{v[0-9]+}}, [[ADD_B]], [[ADD_A]], vcc define amdgpu_kernel void @s_test_fmin_legacy_ule_f32_nnan_src(ptr addrspace(1) %out, float %a, float %b) #0 { %a.nnan = fadd nnan float %a, 1.0 %b.nnan = fadd nnan float %b, 2.0 @@ -76,16 +91,32 @@ define amdgpu_kernel void @s_test_fmin_legacy_ule_f32_nnan_src(ptr addrspace(1) ret void } +; Nsz also needed +; FIXME: Should separate tests +; GCN-LABEL: {{^}}s_test_fmin_legacy_ule_f32_nnan_src_fast: +; GCN: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}, {{0x9|0x24}} + +; GCN-DAG: v_add_f32_e64 [[ADD_A:v[0-9]+]], s[[#LOAD + 2]], 1.0 +; GCN-DAG: v_add_f32_e64 [[ADD_B:v[0-9]+]], s[[#LOAD + 3]], 2.0 + +; GCN: v_min_f32_e32 {{v[0-9]+}}, [[ADD_A]], [[ADD_B]] +define amdgpu_kernel void @s_test_fmin_legacy_ule_f32_nnan_src_fast(ptr addrspace(1) %out, float %a, float %b) #0 { + %a.nnan = fadd nnan float %a, 1.0 + %b.nnan = fadd nnan float %b, 2.0 + %cmp = fcmp ule float %a.nnan, %b.nnan + %val = select nnan nsz i1 %cmp, float %a.nnan, float %b.nnan + store float %val, ptr addrspace(1) %out, align 4 + ret void +} + ; FUNC-LABEL: {{^}}test_fmin_legacy_ule_f32: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] -; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] +; SI: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] -; VI-SAFE: v_cmp_ngt_f32_e32 vcc, [[A]], [[B]] -; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] - -; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]] +; VI: v_cmp_ngt_f32_e32 vcc, [[A]], [[B]] +; VI: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] define amdgpu_kernel void @test_fmin_legacy_ule_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid @@ -100,16 +131,33 @@ define amdgpu_kernel void @test_fmin_legacy_ule_f32(ptr addrspace(1) %out, ptr a ret void } -; FUNC-LABEL: {{^}}test_fmin_legacy_ole_f32: +; FUNC-LABEL: {{^}}test_fmin_legacy_ule_f32_fast: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] -; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] +; GCN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]] +define amdgpu_kernel void @test_fmin_legacy_ule_f32_fast(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid + %gep.1 = getelementptr 
float, ptr addrspace(1) %gep.0, i32 1 + + %a = load volatile float, ptr addrspace(1) %gep.0, align 4 + %b = load volatile float, ptr addrspace(1) %gep.1, align 4 -; VI-SAFE: v_cmp_le_f32_e32 vcc, [[A]], [[B]] -; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] + %cmp = fcmp ule float %a, %b + %val = select nnan nsz i1 %cmp, float %a, float %b + store float %val, ptr addrspace(1) %out, align 4 + ret void +} -; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]] +; FUNC-LABEL: {{^}}test_fmin_legacy_ole_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] + +; SI: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] + +; VI: v_cmp_le_f32_e32 vcc, [[A]], [[B]] +; VI: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] define amdgpu_kernel void @test_fmin_legacy_ole_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid @@ -124,16 +172,33 @@ define amdgpu_kernel void @test_fmin_legacy_ole_f32(ptr addrspace(1) %out, ptr a ret void } -; FUNC-LABEL: {{^}}test_fmin_legacy_olt_f32: +; FUNC-LABEL: {{^}}test_fmin_legacy_ole_f32_fast: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] -; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] +; GCN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]] +define amdgpu_kernel void @test_fmin_legacy_ole_f32_fast(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid + %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1 -; VI-SAFE: v_cmp_lt_f32_e32 vcc, [[A]], [[B]] -; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] + %a = load volatile float, ptr addrspace(1) %gep.0, align 4 + %b = load volatile float, ptr addrspace(1) %gep.1, align 4 + + %cmp = fcmp ole float %a, %b + %val = select nnan nsz i1 %cmp, float %a, float %b + store float %val, ptr addrspace(1) %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}test_fmin_legacy_olt_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] + +; SI: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] -; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]] +; VI: v_cmp_lt_f32_e32 vcc, [[A]], [[B]] +; VI: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] define amdgpu_kernel void @test_fmin_legacy_olt_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid @@ -148,16 +213,33 @@ define amdgpu_kernel void @test_fmin_legacy_olt_f32(ptr addrspace(1) %out, ptr a ret void } -; FUNC-LABEL: {{^}}test_fmin_legacy_ult_f32: +; FUNC-LABEL: {{^}}test_fmin_legacy_olt_f32_fast: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] -; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] +; GCN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]] +define amdgpu_kernel void @test_fmin_legacy_olt_f32_fast(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid + %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1 -; VI-SAFE: v_cmp_nge_f32_e32 vcc, [[A]], [[B]] -; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] + %a = load volatile float, ptr addrspace(1) %gep.0, align 4 + %b = load volatile float, ptr addrspace(1) %gep.1, align 4 -; GCN-NONAN: 
v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]] + %cmp = fcmp olt float %a, %b + %val = select nnan nsz i1 %cmp, float %a, float %b + store float %val, ptr addrspace(1) %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}test_fmin_legacy_ult_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] + +; SI: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] + +; VI: v_cmp_nge_f32_e32 vcc, [[A]], [[B]] +; VI: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] define amdgpu_kernel void @test_fmin_legacy_ult_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid @@ -172,16 +254,33 @@ define amdgpu_kernel void @test_fmin_legacy_ult_f32(ptr addrspace(1) %out, ptr a ret void } -; FUNC-LABEL: {{^}}test_fmin_legacy_ult_v1f32: +; FUNC-LABEL: {{^}}test_fmin_legacy_ult_f32_fast: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] -; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] +; GCN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]] +define amdgpu_kernel void @test_fmin_legacy_ult_f32_fast(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid + %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1 -; VI-SAFE: v_cmp_nge_f32_e32 vcc, [[A]], [[B]] -; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] + %a = load volatile float, ptr addrspace(1) %gep.0, align 4 + %b = load volatile float, ptr addrspace(1) %gep.1, align 4 -; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]] + %cmp = fcmp ult float %a, %b + %val = select nnan nsz i1 %cmp, float %a, float %b + store float %val, ptr addrspace(1) %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}test_fmin_legacy_ult_v1f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] + +; SI: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] + +; VI: v_cmp_nge_f32_e32 vcc, [[A]], [[B]] +; VI: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] define amdgpu_kernel void @test_fmin_legacy_ult_v1f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.0 = getelementptr <1 x float>, ptr addrspace(1) %in, i32 %tid @@ -196,19 +295,35 @@ define amdgpu_kernel void @test_fmin_legacy_ult_v1f32(ptr addrspace(1) %out, ptr ret void } +; FUNC-LABEL: {{^}}test_fmin_legacy_ult_v1f32_fast: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] + +; GCN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]] +define amdgpu_kernel void @test_fmin_legacy_ult_v1f32_fast(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %gep.0 = getelementptr <1 x float>, ptr addrspace(1) %in, i32 %tid + %gep.1 = getelementptr <1 x float>, ptr addrspace(1) %gep.0, i32 1 + + %a = load volatile <1 x float>, ptr addrspace(1) %gep.0 + %b = load volatile <1 x float>, ptr addrspace(1) %gep.1 + + %cmp = fcmp ult <1 x float> %a, %b + %val = select nnan nsz <1 x i1> %cmp, <1 x float> %a, <1 x float> %b + store <1 x float> %val, ptr addrspace(1) %out + ret void +} + ; FUNC-LABEL: {{^}}test_fmin_legacy_ult_v2f32: ; GCN: {{buffer|flat}}_load_dwordx2 ; GCN: {{buffer|flat}}_load_dwordx2 -; SI-SAFE: v_min_legacy_f32_e32 -; SI-SAFE: v_min_legacy_f32_e32 - -; VI-SAFE: v_cmp_nge_f32_e32 -; VI-SAFE: v_cndmask_b32_e32 -; VI-SAFE: v_cmp_nge_f32_e32 -; VI-SAFE: 
v_cndmask_b32_e32 +; SI: v_min_legacy_f32_e32 +; SI: v_min_legacy_f32_e32 -; GCN-NONAN: v_min_f32_e32 -; GCN-NONAN: v_min_f32_e32 +; VI: v_cmp_nge_f32_e32 +; VI: v_cndmask_b32_e32 +; VI: v_cmp_nge_f32_e32 +; VI: v_cndmask_b32_e32 define amdgpu_kernel void @test_fmin_legacy_ult_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.0 = getelementptr <2 x float>, ptr addrspace(1) %in, i32 %tid @@ -223,25 +338,40 @@ define amdgpu_kernel void @test_fmin_legacy_ult_v2f32(ptr addrspace(1) %out, ptr ret void } +; FUNC-LABEL: {{^}}test_fmin_legacy_ult_v2f32_fast: +; GCN: {{buffer|flat}}_load_dwordx2 +; GCN: {{buffer|flat}}_load_dwordx2 + +; GCN: v_min_f32_e32 +; GCN: v_min_f32_e32 +define amdgpu_kernel void @test_fmin_legacy_ult_v2f32_fast(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %gep.0 = getelementptr <2 x float>, ptr addrspace(1) %in, i32 %tid + %gep.1 = getelementptr <2 x float>, ptr addrspace(1) %gep.0, i32 1 + + %a = load volatile <2 x float>, ptr addrspace(1) %gep.0 + %b = load volatile <2 x float>, ptr addrspace(1) %gep.1 + + %cmp = fcmp ult <2 x float> %a, %b + %val = select nnan nsz <2 x i1> %cmp, <2 x float> %a, <2 x float> %b + store <2 x float> %val, ptr addrspace(1) %out + ret void +} + ; FUNC-LABEL: {{^}}test_fmin_legacy_ult_v3f32: -; SI-SAFE: v_min_legacy_f32_e32 -; SI-SAFE: v_min_legacy_f32_e32 -; SI-SAFE: v_min_legacy_f32_e32 -; SI-SAFE-NOT: v_min_ - -; VI-SAFE: v_cmp_nge_f32_e32 -; VI-SAFE: v_cndmask_b32_e32 -; VI-SAFE: v_cmp_nge_f32_e32 -; VI-SAFE: v_cndmask_b32_e32 -; VI-SAFE: v_cmp_nge_f32_e32 -; VI-SAFE: v_cndmask_b32_e32 +; SI: v_min_legacy_f32_e32 +; SI: v_min_legacy_f32_e32 +; SI: v_min_legacy_f32_e32 +; SI-NOT: v_min_ + +; VI: v_cmp_nge_f32_e32 +; VI: v_cndmask_b32_e32 +; VI: v_cmp_nge_f32_e32 +; VI: v_cndmask_b32_e32 +; VI: v_cmp_nge_f32_e32 +; VI: v_cndmask_b32_e32 ; VI-NOT: v_cmp ; VI-NOT: v_cndmask - -; GCN-NONAN: v_min_f32_e32 -; GCN-NONAN: v_min_f32_e32 -; GCN-NONAN: v_min_f32_e32 -; GCN-NONAN-NOT: v_min_ define amdgpu_kernel void @test_fmin_legacy_ult_v3f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.0 = getelementptr <3 x float>, ptr addrspace(1) %in, i32 %tid @@ -256,6 +386,28 @@ define amdgpu_kernel void @test_fmin_legacy_ult_v3f32(ptr addrspace(1) %out, ptr ret void } +; FUNC-LABEL: {{^}}test_fmin_legacy_ult_v3f32_fast: +; VI-NOT: v_cmp +; VI-NOT: v_cndmask + +; GCN: v_min_f32_e32 +; GCN: v_min_f32_e32 +; GCN: v_min_f32_e32 +; GCN-NOT: v_min_ +define amdgpu_kernel void @test_fmin_legacy_ult_v3f32_fast(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %gep.0 = getelementptr <3 x float>, ptr addrspace(1) %in, i32 %tid + %gep.1 = getelementptr <3 x float>, ptr addrspace(1) %gep.0, i32 1 + + %a = load <3 x float>, ptr addrspace(1) %gep.0 + %b = load <3 x float>, ptr addrspace(1) %gep.1 + + %cmp = fcmp ult <3 x float> %a, %b + %val = select nnan nsz <3 x i1> %cmp, <3 x float> %a, <3 x float> %b + store <3 x float> %val, ptr addrspace(1) %out + ret void +} + ; FUNC-LABEL: {{^}}test_fmin_legacy_ole_f32_multi_use: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.ll index f01e85a..65111f1 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.ll +++ 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.ll @@ -4,7 +4,7 @@ ;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 | FileCheck -check-prefix=GFX10 %s ;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 | FileCheck -check-prefix=GFX11 %s ;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 | FileCheck -check-prefix=GFX12 %s -;RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1200 | FileCheck -check-prefix=GFX12 %s +;RUN: llc < %s -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 | FileCheck -check-prefix=GFX12 %s define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>, <4 x float>} @tbuffer_load(<4 x i32> inreg) { ; PREGFX10-LABEL: tbuffer_load: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.ll index b555c37..a6afb757 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.ll @@ -4,7 +4,7 @@ ;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 | FileCheck -check-prefixes=GFX10 %s ;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 | FileCheck -check-prefixes=GFX11 %s ;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s -;RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1200 | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s +;RUN: llc < %s -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>, <4 x float>} @tbuffer_load(<4 x i32> inreg) { ; PREGFX10-LABEL: tbuffer_load: diff --git a/llvm/test/CodeGen/AMDGPU/s-barrier.ll b/llvm/test/CodeGen/AMDGPU/s-barrier.ll index 8a9beb7..4c7cef9 100644 --- a/llvm/test/CodeGen/AMDGPU/s-barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/s-barrier.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12-SDAG %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12-GISEL %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s @bar = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison @bar2 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison @@ -102,6 +102,7 @@ define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in) ; GFX12-SDAG-NEXT: s_mov_b32 m0, 2 ; GFX12-SDAG-NEXT: s_barrier_signal_isfirst -1 ; GFX12-SDAG-NEXT: s_barrier_wait 1 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_barrier_leave ; GFX12-SDAG-NEXT: s_get_barrier_state s3, m0 ; GFX12-SDAG-NEXT: s_mov_b32 m0, s2 @@ -155,10 +156,11 @@ define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in) ; GFX12-GISEL-NEXT: s_barrier_signal -1 ; GFX12-GISEL-NEXT: s_barrier_join m0 ; GFX12-GISEL-NEXT: s_barrier_signal_isfirst -1 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_add_co_u32 s8, s12, 48 ; GFX12-GISEL-NEXT: s_barrier_wait 1 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_barrier_leave +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_add_co_u32 s8, s12, 48 ; GFX12-GISEL-NEXT: s_get_barrier_state s0, 2 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_get_barrier_state s0, m0 @@ -256,6 +258,25 @@ define 
amdgpu_kernel void @kernel2(ptr addrspace(1) %out, ptr addrspace(3) %in) ret void } +define amdgpu_ps void @test_barrier_leave_write_to_scc(i32 inreg %val, ptr addrspace(1) %out) { +; GFX12-LABEL: test_barrier_leave_write_to_scc: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_barrier_leave +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_lg_u32 s0, 0 +; GFX12-NEXT: s_movk_i32 s0, 0x7b +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cselect_b32 s0, s0, 0x1c8 +; GFX12-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NEXT: global_store_b32 v[0:1], v2, off +; GFX12-NEXT: s_endpgm + call void @llvm.amdgcn.s.barrier.leave(i16 1) + %cmp = icmp ne i32 %val, 0 + %ret = select i1 %cmp, i32 123, i32 456 + store i32 %ret, ptr addrspace(1) %out + ret void +} + declare void @llvm.amdgcn.s.barrier() #1 declare void @llvm.amdgcn.s.barrier.wait(i16) #1 declare void @llvm.amdgcn.s.barrier.signal(i32) #1 diff --git a/llvm/test/CodeGen/ARM/issue159343.ll b/llvm/test/CodeGen/ARM/issue159343.ll new file mode 100644 index 0000000..0329258 --- /dev/null +++ b/llvm/test/CodeGen/ARM/issue159343.ll @@ -0,0 +1,55 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s | FileCheck %s + +; Make sure there's no assertion from peephole-opt introducing illegal +; subregister index uses. + +target triple = "thumbv7-unknown-linux-android29" + +define void @_ZN11VersionEdit10DecodeFromEv(i1 %call4, ptr %__profc__ZN11VersionEdit10DecodeFromEv) nounwind { +; CHECK-LABEL: _ZN11VersionEdit10DecodeFromEv: +; CHECK: @ %bb.0: @ %land.rhs.lr.ph +; CHECK-NEXT: lsls r0, r0, #31 +; CHECK-NEXT: beq .LBB0_2 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: adr r0, .LCPI0_0 +; CHECK-NEXT: vld1.64 {d0, d1}, [r0:128] +; CHECK-NEXT: b .LBB0_3 +; CHECK-NEXT: .LBB0_2: @ %select.false +; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: .LBB0_3: @ %select.end +; CHECK-NEXT: vldr s5, .LCPI0_1 +; CHECK-NEXT: vldr s4, .LCPI0_2 +; CHECK-NEXT: vmov.f32 s6, s0 +; CHECK-NEXT: vmov.f32 s7, s1 +; CHECK-NEXT: vst1.64 {d2, d3}, [r1] +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.4: +; CHECK-NEXT: .LCPI0_0: +; CHECK-NEXT: .long 1 @ 0x1 +; CHECK-NEXT: .long 0 @ 0x0 +; CHECK-NEXT: .long 1 @ 0x1 +; CHECK-NEXT: .long 0 @ 0x0 +; CHECK-NEXT: .LCPI0_1: +; CHECK-NEXT: .long 0x00000000 @ float 0 +; CHECK-NEXT: .LCPI0_2: +; CHECK-NEXT: .long 0x00000001 @ float 1.40129846E-45 +land.rhs.lr.ph: + br i1 %call4, label %sw.bb, label %while.cond.while.end_crit_edge.split.loop.exit43 + +while.cond.while.end_crit_edge.split.loop.exit43: ; preds = %land.rhs.lr.ph + %ext0 = extractelement <4 x i64> zeroinitializer, i64 0 + br label %while.cond.while.end_crit_edge + +while.cond.while.end_crit_edge: ; preds = %sw.bb, %while.cond.while.end_crit_edge.split.loop.exit43 + %pgocount5374.ph = phi i64 [ %ext1, %sw.bb ], [ %ext0, %while.cond.while.end_crit_edge.split.loop.exit43 ] + %ins = insertelement <2 x i64> splat (i64 1), i64 %pgocount5374.ph, i64 1 + store <2 x i64> %ins, ptr %__profc__ZN11VersionEdit10DecodeFromEv, align 8 + ret void + +sw.bb: ; preds = %land.rhs.lr.ph + %ext1 = extractelement <4 x i64> splat (i64 1), i64 0 + br label %while.cond.while.end_crit_edge +} + diff --git a/llvm/test/CodeGen/ARM/pr159343.mir b/llvm/test/CodeGen/ARM/pr159343.mir new file mode 100644 index 0000000..9b71b1a --- /dev/null +++ b/llvm/test/CodeGen/ARM/pr159343.mir @@ -0,0 +1,31 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +# 
RUN: llc -run-pass=peephole-opt -verify-machineinstrs -mtriple=thumbv7-unknown-linux-android29 %s -o - | FileCheck %s +--- +name: Test_shouldRewriteCopySrc_Invalid_SubReg +tracksRegLiveness: true +body: | + bb.1: + liveins: $r0, $r1 + + ; CHECK-LABEL: name: Test_shouldRewriteCopySrc_Invalid_SubReg + ; CHECK: liveins: $r0, $r1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:dpair = IMPLICIT_DEF + ; CHECK-NEXT: [[COPY:%[0-9]+]]:dpr_vfp2 = COPY [[DEF]].dsub_0 + ; CHECK-NEXT: [[VMOVRRD:%[0-9]+]]:gpr, [[VMOVRRD1:%[0-9]+]]:gpr = VMOVRRD [[COPY]], 14 /* CC::al */, $noreg + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:spr = COPY [[COPY]].ssub_1 + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:spr = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:spr = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF3:%[0-9]+]]:spr = IMPLICIT_DEF + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:mqpr = REG_SEQUENCE killed [[DEF2]], %subreg.ssub_0, killed [[DEF1]], %subreg.ssub_1, killed [[DEF3]], %subreg.ssub_2, [[COPY]].ssub_1, %subreg.ssub_3 + ; CHECK-NEXT: VST1q64 $r1, 0, killed [[REG_SEQUENCE]], 14 /* CC::al */, $noreg + %0:dpair = IMPLICIT_DEF + %1:dpr = COPY %0.dsub_0 + %2:gpr, %3:gpr = VMOVRRD killed %1, 14 /* CC::al */, $noreg + %4:spr = VMOVSR killed %3, 14 /* CC::al */, $noreg + %5:spr = IMPLICIT_DEF + %6:spr = IMPLICIT_DEF + %7:spr = IMPLICIT_DEF + %8:mqpr = REG_SEQUENCE killed %6, %subreg.ssub_0, killed %5, %subreg.ssub_1, killed %7, %subreg.ssub_2, killed %4, %subreg.ssub_3 + VST1q64 $r1, 0, killed %8, 14 /* CC::al */, $noreg +... diff --git a/llvm/test/CodeGen/ARM/shouldRewriteCopySrc.ll b/llvm/test/CodeGen/ARM/shouldRewriteCopySrc.ll index e653aaa..2bf8f29 100644 --- a/llvm/test/CodeGen/ARM/shouldRewriteCopySrc.ll +++ b/llvm/test/CodeGen/ARM/shouldRewriteCopySrc.ll @@ -12,8 +12,8 @@ define float @shouldRewriteCopySrc(double %arg) #0 { ; CHECK-NEXT: @APP ; CHECK-NEXT: nop ; CHECK-NEXT: @NO_APP -; CHECK-NEXT: vmov r0, r1, d16 -; CHECK-NEXT: vmov s0, r0 +; CHECK-NEXT: vmov.f64 d0, d16 +; CHECK-NEXT: @ kill: def $s0 killed $s0 killed $d0 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr bb: diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvinsve0.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvinsve0.ll new file mode 100644 index 0000000..e1784f8 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvinsve0.ll @@ -0,0 +1,197 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s + +;; xvinsve0.w +define void @xvinsve0_v8i32_l_0(ptr %d, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvinsve0_v8i32_l_0: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvinsve0.w $xr0, $xr1, 0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <8 x i32>, ptr %a + %vb = load <8 x i32>, ptr %b + %vc = shufflevector <8 x i32> %va, <8 x i32> %vb, <8 x i32> <i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + store <8 x i32> %vc, ptr %d + ret void +} + +define void @xvinsve0_v8i32_l_4(ptr %d, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvinsve0_v8i32_l_4: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvinsve0.w $xr0, $xr1, 4 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <8 x i32>, ptr %a + %vb = load <8 x i32>, ptr %b + %vc 
= shufflevector <8 x i32> %va, <8 x i32> %vb, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 5, i32 6, i32 7> + store <8 x i32> %vc, ptr %d + ret void +} + +define void @xvinsve0_v8f32_l(ptr %d, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvinsve0_v8f32_l: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvinsve0.w $xr0, $xr1, 0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <8 x float>, ptr %a + %vb = load <8 x float>, ptr %b + %vc = shufflevector <8 x float> %va, <8 x float> %vb, <8 x i32> <i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + store <8 x float> %vc, ptr %d + ret void +} + +define void @xvinsve0_v8i32_h_1(ptr %d, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvinsve0_v8i32_h_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvinsve0.w $xr1, $xr0, 1 +; CHECK-NEXT: xvst $xr1, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <8 x i32>, ptr %a + %vb = load <8 x i32>, ptr %b + %vc = shufflevector <8 x i32> %va, <8 x i32> %vb, <8 x i32> <i32 8, i32 0, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + store <8 x i32> %vc, ptr %d + ret void +} + +define void @xvinsve0_v8i32_h_6(ptr %d, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvinsve0_v8i32_h_6: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvinsve0.w $xr1, $xr0, 6 +; CHECK-NEXT: xvst $xr1, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <8 x i32>, ptr %a + %vb = load <8 x i32>, ptr %b + %vc = shufflevector <8 x i32> %va, <8 x i32> %vb, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 0, i32 15> + store <8 x i32> %vc, ptr %d + ret void +} + +define void @xvinsve0_v8f32_h(ptr %d, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvinsve0_v8f32_h: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvinsve0.w $xr1, $xr0, 0 +; CHECK-NEXT: xvst $xr1, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <8 x float>, ptr %a + %vb = load <8 x float>, ptr %b + %vc = shufflevector <8 x float> %va, <8 x float> %vb, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + store <8 x float> %vc, ptr %d + ret void +} + +;; xvinsve0.d +define void @xvinsve0_v4i64_l_1(ptr %d, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvinsve0_v4i64_l_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvinsve0.d $xr0, $xr1, 1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <4 x i64>, ptr %a + %vb = load <4 x i64>, ptr %b + %vc = shufflevector <4 x i64> %va, <4 x i64> %vb, <4 x i32> <i32 0, i32 4, i32 2, i32 3> + store <4 x i64> %vc, ptr %d + ret void +} + +define void @xvinsve0_v4i64_l_2(ptr %d, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvinsve0_v4i64_l_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvinsve0.d $xr0, $xr1, 2 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <4 x i64>, ptr %a + %vb = load <4 x i64>, ptr %b + %vc = shufflevector <4 x i64> %va, <4 x i64> %vb, <4 x i32> <i32 0, i32 1, i32 4, i32 3> + store <4 x i64> %vc, ptr %d + ret void +} + +define void @xvinsve0_v4f64_l(ptr %d, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvinsve0_v4f64_l: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvinsve0.d $xr0, $xr1, 0 +; CHECK-NEXT: xvst 
$xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <4 x double>, ptr %a + %vb = load <4 x double>, ptr %b + %vc = shufflevector <4 x double> %va, <4 x double> %vb, <4 x i32> <i32 4, i32 1, i32 2, i32 3> + store <4 x double> %vc, ptr %d + ret void +} + +define void @xvinsve0_v4i64_h_0(ptr %d, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvinsve0_v4i64_h_0: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvinsve0.d $xr1, $xr0, 0 +; CHECK-NEXT: xvst $xr1, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <4 x i64>, ptr %a + %vb = load <4 x i64>, ptr %b + %vc = shufflevector <4 x i64> %va, <4 x i64> %vb, <4 x i32> <i32 0, i32 5, i32 6, i32 7> + store <4 x i64> %vc, ptr %d + ret void +} + +define void @xvinsve0_v4i64_h_2(ptr %d, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvinsve0_v4i64_h_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvinsve0.d $xr1, $xr0, 2 +; CHECK-NEXT: xvst $xr1, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <4 x i64>, ptr %a + %vb = load <4 x i64>, ptr %b + %vc = shufflevector <4 x i64> %va, <4 x i64> %vb, <4 x i32> <i32 4, i32 5, i32 0, i32 7> + store <4 x i64> %vc, ptr %d + ret void +} + +define void @xvinsve0_v4f64_h(ptr %d, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvinsve0_v4f64_h: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvinsve0.d $xr1, $xr0, 0 +; CHECK-NEXT: xvst $xr1, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <4 x double>, ptr %a + %vb = load <4 x double>, ptr %b + %vc = shufflevector <4 x double> %va, <4 x double> %vb, <4 x i32> <i32 0, i32 5, i32 6, i32 7> + store <4 x double> %vc, ptr %d + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/bug22322.ll b/llvm/test/CodeGen/NVPTX/bug22322.ll index 055c512..71e180b 100644 --- a/llvm/test/CodeGen/NVPTX/bug22322.ll +++ b/llvm/test/CodeGen/NVPTX/bug22322.ll @@ -20,12 +20,12 @@ _ZL11compute_vecRK6float3jb.exit: call void @llvm.lifetime.start.p0(i64 4, ptr %ret_vec.sroa.8.i) %6 = and i32 %4, 15 %7 = icmp eq i32 %6, 0 - %8 = select i1 %7, float 0.000000e+00, float -1.000000e+00 + %8 = select nnan nsz i1 %7, float 0.000000e+00, float -1.000000e+00 store float %8, ptr %ret_vec.sroa.8.i, align 4 ; CHECK: max.f32 %r{{[0-9]+}}, %r{{[0-9]+}}, 0f00000000 %9 = fcmp olt float %8, 0.000000e+00 %ret_vec.sroa.8.i.val = load float, ptr %ret_vec.sroa.8.i, align 4 - %10 = select i1 %9, float 0.000000e+00, float %ret_vec.sroa.8.i.val + %10 = select nnan nsz i1 %9, float 0.000000e+00, float %ret_vec.sroa.8.i.val call void @llvm.lifetime.end.p0(i64 4, ptr %ret_vec.sroa.8.i) %11 = getelementptr inbounds %class.float3, ptr %dst, i64 %5, i32 0 store float 0.000000e+00, ptr %11, align 4 @@ -51,7 +51,7 @@ declare void @llvm.lifetime.start.p0(i64, ptr nocapture) #2 ; Function Attrs: nounwind declare void @llvm.lifetime.end.p0(i64, ptr nocapture) #2 -attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "no-signed-zeros-fp-math"="true" "use-soft-float"="false" } +attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "use-soft-float"="false" } attributes #1 = { nounwind readnone } attributes #2 = { nounwind } diff --git a/llvm/test/CodeGen/PowerPC/scalar-min-max.ll b/llvm/test/CodeGen/PowerPC/scalar-min-max.ll index 216d498..5f637e3 100644 
--- a/llvm/test/CodeGen/PowerPC/scalar-min-max.ll +++ b/llvm/test/CodeGen/PowerPC/scalar-min-max.ll @@ -1,36 +1,23 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mcpu=pwr8 -ppc-asm-full-reg-names --enable-unsafe-fp-math \ -; RUN: -verify-machineinstrs --enable-no-signed-zeros-fp-math \ -; RUN: --enable-no-nans-fp-math \ -; RUN: -mtriple=powerpc64le-unknown-unknown < %s | FileCheck %s -; RUN: llc -mcpu=pwr9 -ppc-asm-full-reg-names --enable-unsafe-fp-math \ -; RUN: -verify-machineinstrs --enable-no-signed-zeros-fp-math \ -; RUN: --enable-no-nans-fp-math \ -; RUN: -mtriple=powerpc64le-unknown-unknown < %s | FileCheck %s ; RUN: llc -mcpu=pwr9 -ppc-asm-full-reg-names -verify-machineinstrs \ ; RUN: -mtriple=powerpc64le-unknown-unknown < %s | FileCheck %s \ -; RUN: --check-prefix=NO-FAST-P9 +; RUN: --check-prefix=P9 ; RUN: llc -mcpu=pwr8 -ppc-asm-full-reg-names -verify-machineinstrs \ ; RUN: -mtriple=powerpc64le-unknown-unknown < %s | FileCheck %s \ -; RUN: --check-prefix=NO-FAST-P8 +; RUN: --check-prefix=P8 define dso_local float @testfmax(float %a, float %b) local_unnamed_addr { -; CHECK-LABEL: testfmax: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xsmaxdp f1, f1, f2 -; CHECK-NEXT: blr -; -; NO-FAST-P9-LABEL: testfmax: -; NO-FAST-P9: # %bb.0: # %entry -; NO-FAST-P9-NEXT: xsmaxcdp f1, f1, f2 -; NO-FAST-P9-NEXT: blr -; -; NO-FAST-P8-LABEL: testfmax: -; NO-FAST-P8: # %bb.0: # %entry -; NO-FAST-P8-NEXT: fcmpu cr0, f1, f2 -; NO-FAST-P8-NEXT: bgtlr cr0 -; NO-FAST-P8-NEXT: # %bb.1: # %entry -; NO-FAST-P8-NEXT: fmr f1, f2 -; NO-FAST-P8-NEXT: blr +; P9-LABEL: testfmax: +; P9: # %bb.0: # %entry +; P9-NEXT: xsmaxcdp f1, f1, f2 +; P9-NEXT: blr +; +; P8-LABEL: testfmax: +; P8: # %bb.0: # %entry +; P8-NEXT: fcmpu cr0, f1, f2 +; P8-NEXT: bgtlr cr0 +; P8-NEXT: # %bb.1: # %entry +; P8-NEXT: fmr f1, f2 +; P8-NEXT: blr entry: %cmp = fcmp ogt float %a, %b %cond = select i1 %cmp, float %a, float %b @@ -38,23 +25,18 @@ entry: } define dso_local double @testdmax(double %a, double %b) local_unnamed_addr { -; CHECK-LABEL: testdmax: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xsmaxdp f1, f1, f2 -; CHECK-NEXT: blr -; -; NO-FAST-P9-LABEL: testdmax: -; NO-FAST-P9: # %bb.0: # %entry -; NO-FAST-P9-NEXT: xsmaxcdp f1, f1, f2 -; NO-FAST-P9-NEXT: blr -; -; NO-FAST-P8-LABEL: testdmax: -; NO-FAST-P8: # %bb.0: # %entry -; NO-FAST-P8-NEXT: xscmpudp cr0, f1, f2 -; NO-FAST-P8-NEXT: bgtlr cr0 -; NO-FAST-P8-NEXT: # %bb.1: # %entry -; NO-FAST-P8-NEXT: fmr f1, f2 -; NO-FAST-P8-NEXT: blr +; P9-LABEL: testdmax: +; P9: # %bb.0: # %entry +; P9-NEXT: xsmaxcdp f1, f1, f2 +; P9-NEXT: blr +; +; P8-LABEL: testdmax: +; P8: # %bb.0: # %entry +; P8-NEXT: xscmpudp cr0, f1, f2 +; P8-NEXT: bgtlr cr0 +; P8-NEXT: # %bb.1: # %entry +; P8-NEXT: fmr f1, f2 +; P8-NEXT: blr entry: %cmp = fcmp ogt double %a, %b %cond = select i1 %cmp, double %a, double %b @@ -62,23 +44,18 @@ entry: } define dso_local float @testfmin(float %a, float %b) local_unnamed_addr { -; CHECK-LABEL: testfmin: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xsmindp f1, f1, f2 -; CHECK-NEXT: blr -; -; NO-FAST-P9-LABEL: testfmin: -; NO-FAST-P9: # %bb.0: # %entry -; NO-FAST-P9-NEXT: xsmincdp f1, f1, f2 -; NO-FAST-P9-NEXT: blr -; -; NO-FAST-P8-LABEL: testfmin: -; NO-FAST-P8: # %bb.0: # %entry -; NO-FAST-P8-NEXT: fcmpu cr0, f1, f2 -; NO-FAST-P8-NEXT: bltlr cr0 -; NO-FAST-P8-NEXT: # %bb.1: # %entry -; NO-FAST-P8-NEXT: fmr f1, f2 -; NO-FAST-P8-NEXT: blr +; P9-LABEL: testfmin: +; P9: # %bb.0: # %entry +; P9-NEXT: xsmincdp f1, f1, f2 +; P9-NEXT: blr +; +; 
P8-LABEL: testfmin: +; P8: # %bb.0: # %entry +; P8-NEXT: fcmpu cr0, f1, f2 +; P8-NEXT: bltlr cr0 +; P8-NEXT: # %bb.1: # %entry +; P8-NEXT: fmr f1, f2 +; P8-NEXT: blr entry: %cmp = fcmp olt float %a, %b %cond = select i1 %cmp, float %a, float %b @@ -86,23 +63,18 @@ entry: } define dso_local double @testdmin(double %a, double %b) local_unnamed_addr { -; CHECK-LABEL: testdmin: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xsmindp f1, f1, f2 -; CHECK-NEXT: blr -; -; NO-FAST-P9-LABEL: testdmin: -; NO-FAST-P9: # %bb.0: # %entry -; NO-FAST-P9-NEXT: xsmincdp f1, f1, f2 -; NO-FAST-P9-NEXT: blr -; -; NO-FAST-P8-LABEL: testdmin: -; NO-FAST-P8: # %bb.0: # %entry -; NO-FAST-P8-NEXT: xscmpudp cr0, f1, f2 -; NO-FAST-P8-NEXT: bltlr cr0 -; NO-FAST-P8-NEXT: # %bb.1: # %entry -; NO-FAST-P8-NEXT: fmr f1, f2 -; NO-FAST-P8-NEXT: blr +; P9-LABEL: testdmin: +; P9: # %bb.0: # %entry +; P9-NEXT: xsmincdp f1, f1, f2 +; P9-NEXT: blr +; +; P8-LABEL: testdmin: +; P8: # %bb.0: # %entry +; P8-NEXT: xscmpudp cr0, f1, f2 +; P8-NEXT: bltlr cr0 +; P8-NEXT: # %bb.1: # %entry +; P8-NEXT: fmr f1, f2 +; P8-NEXT: blr entry: %cmp = fcmp olt double %a, %b %cond = select i1 %cmp, double %a, double %b @@ -110,86 +82,62 @@ entry: } define dso_local float @testfmax_fast(float %a, float %b) local_unnamed_addr { -; CHECK-LABEL: testfmax_fast: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xsmaxdp f1, f1, f2 -; CHECK-NEXT: blr -; -; NO-FAST-P9-LABEL: testfmax_fast: -; NO-FAST-P9: # %bb.0: # %entry -; NO-FAST-P9-NEXT: xsmaxcdp f1, f1, f2 -; NO-FAST-P9-NEXT: blr -; -; NO-FAST-P8-LABEL: testfmax_fast: -; NO-FAST-P8: # %bb.0: # %entry -; NO-FAST-P8-NEXT: xssubsp f0, f2, f1 -; NO-FAST-P8-NEXT: fsel f1, f0, f2, f1 -; NO-FAST-P8-NEXT: blr +; P9-LABEL: testfmax_fast: +; P9: # %bb.0: # %entry +; P9-NEXT: xsmaxdp f1, f1, f2 +; P9-NEXT: blr +; +; P8-LABEL: testfmax_fast: +; P8: # %bb.0: # %entry +; P8-NEXT: xsmaxdp f1, f1, f2 +; P8-NEXT: blr entry: %cmp = fcmp nnan ninf ogt float %a, %b - %cond = select i1 %cmp, float %a, float %b + %cond = select nnan nsz i1 %cmp, float %a, float %b ret float %cond } define dso_local double @testdmax_fast(double %a, double %b) local_unnamed_addr { -; CHECK-LABEL: testdmax_fast: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xsmaxdp f1, f1, f2 -; CHECK-NEXT: blr -; -; NO-FAST-P9-LABEL: testdmax_fast: -; NO-FAST-P9: # %bb.0: # %entry -; NO-FAST-P9-NEXT: xsmaxcdp f1, f1, f2 -; NO-FAST-P9-NEXT: blr -; -; NO-FAST-P8-LABEL: testdmax_fast: -; NO-FAST-P8: # %bb.0: # %entry -; NO-FAST-P8-NEXT: xssubdp f0, f2, f1 -; NO-FAST-P8-NEXT: fsel f1, f0, f2, f1 -; NO-FAST-P8-NEXT: blr +; P9-LABEL: testdmax_fast: +; P9: # %bb.0: # %entry +; P9-NEXT: xsmaxdp f1, f1, f2 +; P9-NEXT: blr +; +; P8-LABEL: testdmax_fast: +; P8: # %bb.0: # %entry +; P8-NEXT: xsmaxdp f1, f1, f2 +; P8-NEXT: blr entry: %cmp = fcmp nnan ninf ogt double %a, %b - %cond = select i1 %cmp, double %a, double %b + %cond = select nnan nsz i1 %cmp, double %a, double %b ret double %cond } define dso_local float @testfmin_fast(float %a, float %b) local_unnamed_addr { -; CHECK-LABEL: testfmin_fast: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xsmindp f1, f1, f2 -; CHECK-NEXT: blr -; -; NO-FAST-P9-LABEL: testfmin_fast: -; NO-FAST-P9: # %bb.0: # %entry -; NO-FAST-P9-NEXT: xsmincdp f1, f1, f2 -; NO-FAST-P9-NEXT: blr -; -; NO-FAST-P8-LABEL: testfmin_fast: -; NO-FAST-P8: # %bb.0: # %entry -; NO-FAST-P8-NEXT: xssubsp f0, f1, f2 -; NO-FAST-P8-NEXT: fsel f1, f0, f2, f1 -; NO-FAST-P8-NEXT: blr +; P9-LABEL: testfmin_fast: +; P9: # %bb.0: # %entry +; P9-NEXT: xsmindp f1, f1, f2 +; P9-NEXT: blr +; 
+; P8-LABEL: testfmin_fast: +; P8: # %bb.0: # %entry +; P8-NEXT: xsmindp f1, f1, f2 +; P8-NEXT: blr entry: %cmp = fcmp nnan ninf olt float %a, %b - %cond = select i1 %cmp, float %a, float %b + %cond = select nnan nsz i1 %cmp, float %a, float %b ret float %cond } define dso_local double @testdmin_fast(double %a, double %b) local_unnamed_addr { -; CHECK-LABEL: testdmin_fast: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xsmindp f1, f1, f2 -; CHECK-NEXT: blr -; -; NO-FAST-P9-LABEL: testdmin_fast: -; NO-FAST-P9: # %bb.0: # %entry -; NO-FAST-P9-NEXT: xsmincdp f1, f1, f2 -; NO-FAST-P9-NEXT: blr -; -; NO-FAST-P8-LABEL: testdmin_fast: -; NO-FAST-P8: # %bb.0: # %entry -; NO-FAST-P8-NEXT: xssubdp f0, f1, f2 -; NO-FAST-P8-NEXT: fsel f1, f0, f2, f1 -; NO-FAST-P8-NEXT: blr +; P9-LABEL: testdmin_fast: +; P9: # %bb.0: # %entry +; P9-NEXT: xsmindp f1, f1, f2 +; P9-NEXT: blr +; +; P8-LABEL: testdmin_fast: +; P8: # %bb.0: # %entry +; P8-NEXT: xsmindp f1, f1, f2 +; P8-NEXT: blr entry: %cmp = fcmp nnan ninf olt double %a, %b - %cond = select i1 %cmp, double %a, double %b + %cond = select nnan nsz i1 %cmp, double %a, double %b ret double %cond } diff --git a/llvm/test/CodeGen/RISCV/select-bare.ll b/llvm/test/CodeGen/RISCV/select-bare.ll index 796121a..44028a7 100644 --- a/llvm/test/CodeGen/RISCV/select-bare.ll +++ b/llvm/test/CodeGen/RISCV/select-bare.ll @@ -26,8 +26,8 @@ define i32 @bare_select(i1 %a, i32 %b, i32 %c) nounwind { ; RV32IXQCI-LABEL: bare_select: ; RV32IXQCI: # %bb.0: ; RV32IXQCI-NEXT: andi a0, a0, 1 -; RV32IXQCI-NEXT: qc.mvnei a2, a0, 0, a1 -; RV32IXQCI-NEXT: mv a0, a2 +; RV32IXQCI-NEXT: qc.mveqi a1, a0, 0, a2 +; RV32IXQCI-NEXT: mv a0, a1 ; RV32IXQCI-NEXT: ret %1 = select i1 %a, i32 %b, i32 %c ret i32 %1 @@ -53,8 +53,8 @@ define float @bare_select_float(i1 %a, float %b, float %c) nounwind { ; RV32IXQCI-LABEL: bare_select_float: ; RV32IXQCI: # %bb.0: ; RV32IXQCI-NEXT: andi a0, a0, 1 -; RV32IXQCI-NEXT: qc.mvnei a2, a0, 0, a1 -; RV32IXQCI-NEXT: mv a0, a2 +; RV32IXQCI-NEXT: qc.mveqi a1, a0, 0, a2 +; RV32IXQCI-NEXT: mv a0, a1 ; RV32IXQCI-NEXT: ret %1 = select i1 %a, float %b, float %c ret float %1 diff --git a/llvm/test/CodeGen/RISCV/select-cc.ll b/llvm/test/CodeGen/RISCV/select-cc.ll index 14055df..b57f625 100644 --- a/llvm/test/CodeGen/RISCV/select-cc.ll +++ b/llvm/test/CodeGen/RISCV/select-cc.ll @@ -87,40 +87,40 @@ define signext i32 @foo(i32 signext %a, ptr %b) nounwind { ; ; RV32IXQCI-LABEL: foo: ; RV32IXQCI: # %bb.0: -; RV32IXQCI-NEXT: lw a5, 0(a1) ; RV32IXQCI-NEXT: lw a2, 0(a1) ; RV32IXQCI-NEXT: lw a4, 0(a1) ; RV32IXQCI-NEXT: lw t5, 0(a1) ; RV32IXQCI-NEXT: lw t4, 0(a1) +; RV32IXQCI-NEXT: lw t3, 0(a1) ; RV32IXQCI-NEXT: lw t2, 0(a1) -; RV32IXQCI-NEXT: lw t1, 0(a1) ; RV32IXQCI-NEXT: lw t0, 0(a1) ; RV32IXQCI-NEXT: lw a7, 0(a1) ; RV32IXQCI-NEXT: lw a6, 0(a1) -; RV32IXQCI-NEXT: lw t3, 0(a1) ; RV32IXQCI-NEXT: lw a3, 0(a1) -; RV32IXQCI-NEXT: bltz t3, .LBB0_2 +; RV32IXQCI-NEXT: lw t1, 0(a1) +; RV32IXQCI-NEXT: lw a5, 0(a1) +; RV32IXQCI-NEXT: bltz t1, .LBB0_2 ; RV32IXQCI-NEXT: # %bb.1: -; RV32IXQCI-NEXT: li t6, 0 -; RV32IXQCI-NEXT: qc.mveq a5, a0, a5, a0 -; RV32IXQCI-NEXT: qc.mvne a2, a5, a2, a5 -; RV32IXQCI-NEXT: qc.mvltu a4, a4, a2, a2 -; RV32IXQCI-NEXT: qc.mvgeu t5, a4, t5, a4 -; RV32IXQCI-NEXT: qc.mvltu t4, t5, t4, t5 -; RV32IXQCI-NEXT: qc.mvgeu t2, t2, t4, t4 -; RV32IXQCI-NEXT: qc.mvlt t1, t1, t2, t2 -; RV32IXQCI-NEXT: qc.mvge t0, t1, t0, t1 -; RV32IXQCI-NEXT: qc.mvlt a7, t0, a7, t0 -; RV32IXQCI-NEXT: qc.mvge a6, a6, a7, a7 -; RV32IXQCI-NEXT: mv a3, t3 -; RV32IXQCI-NEXT: qc.mvge a3, t6, t3, a6 +; 
RV32IXQCI-NEXT: li a5, 0 +; RV32IXQCI-NEXT: qc.mveq a2, a0, a2, a0 +; RV32IXQCI-NEXT: qc.mvne a4, a2, a4, a2 +; RV32IXQCI-NEXT: qc.mvltu t5, t5, a4, a4 +; RV32IXQCI-NEXT: qc.mvgeu t4, t5, t4, t5 +; RV32IXQCI-NEXT: qc.mvltu t3, t4, t3, t4 +; RV32IXQCI-NEXT: qc.mvgeu t2, t2, t3, t3 +; RV32IXQCI-NEXT: qc.mvlt t0, t0, t2, t2 +; RV32IXQCI-NEXT: qc.mvge a7, t0, a7, t0 +; RV32IXQCI-NEXT: qc.mvlt a6, a7, a6, a7 +; RV32IXQCI-NEXT: qc.mvge a3, a3, a6, a6 +; RV32IXQCI-NEXT: qc.mvlt a3, a5, t1, t1 +; RV32IXQCI-NEXT: mv a5, a3 ; RV32IXQCI-NEXT: .LBB0_2: ; RV32IXQCI-NEXT: lw a2, 0(a1) ; RV32IXQCI-NEXT: lw a0, 0(a1) ; RV32IXQCI-NEXT: li a1, 1024 -; RV32IXQCI-NEXT: qc.mvlt a2, a1, a2, a3 +; RV32IXQCI-NEXT: qc.mvlt a2, a1, a2, a5 ; RV32IXQCI-NEXT: li a1, 2046 -; RV32IXQCI-NEXT: qc.mvltu a0, a1, t3, a2 +; RV32IXQCI-NEXT: qc.mvltu a0, a1, t1, a2 ; RV32IXQCI-NEXT: ret ; ; RV64I-LABEL: foo: @@ -417,8 +417,8 @@ define i32 @select_sge_int16min(i32 signext %x, i32 signext %y, i32 signext %z) ; RV32IXQCI: # %bb.0: ; RV32IXQCI-NEXT: lui a3, 1048560 ; RV32IXQCI-NEXT: addi a3, a3, -1 -; RV32IXQCI-NEXT: qc.mvlt a2, a3, a0, a1 -; RV32IXQCI-NEXT: mv a0, a2 +; RV32IXQCI-NEXT: qc.mvge a1, a3, a0, a2 +; RV32IXQCI-NEXT: mv a0, a1 ; RV32IXQCI-NEXT: ret ; ; RV64I-LABEL: select_sge_int16min: @@ -471,10 +471,10 @@ define i64 @select_sge_int32min(i64 %x, i64 %y, i64 %z) { ; RV32IXQCI-NEXT: srli a0, a1, 31 ; RV32IXQCI-NEXT: xori a0, a0, 1 ; RV32IXQCI-NEXT: qc.mveqi a0, a1, -1, a6 -; RV32IXQCI-NEXT: qc.mvnei a4, a0, 0, a2 -; RV32IXQCI-NEXT: qc.mvnei a5, a0, 0, a3 -; RV32IXQCI-NEXT: mv a0, a4 -; RV32IXQCI-NEXT: mv a1, a5 +; RV32IXQCI-NEXT: qc.mveqi a2, a0, 0, a4 +; RV32IXQCI-NEXT: qc.mveqi a3, a0, 0, a5 +; RV32IXQCI-NEXT: mv a0, a2 +; RV32IXQCI-NEXT: mv a1, a3 ; RV32IXQCI-NEXT: ret ; ; RV64I-LABEL: select_sge_int32min: diff --git a/llvm/test/CodeGen/RISCV/select-cond.ll b/llvm/test/CodeGen/RISCV/select-cond.ll index b88fe9a..3ca0f46 100644 --- a/llvm/test/CodeGen/RISCV/select-cond.ll +++ b/llvm/test/CodeGen/RISCV/select-cond.ll @@ -35,8 +35,8 @@ define signext i32 @select_i32_trunc(i32 signext %cond, i32 signext %x, i32 sign ; RV32-XQCICM-LABEL: select_i32_trunc: ; RV32-XQCICM: # %bb.0: ; RV32-XQCICM-NEXT: andi a0, a0, 1 -; RV32-XQCICM-NEXT: qc.mvnei a2, a0, 0, a1 -; RV32-XQCICM-NEXT: mv a0, a2 +; RV32-XQCICM-NEXT: qc.mveqi a1, a0, 0, a2 +; RV32-XQCICM-NEXT: mv a0, a1 ; RV32-XQCICM-NEXT: ret ; ; RV32-XQCICS-LABEL: select_i32_trunc: @@ -48,8 +48,8 @@ define signext i32 @select_i32_trunc(i32 signext %cond, i32 signext %x, i32 sign ; RV32IXQCI-LABEL: select_i32_trunc: ; RV32IXQCI: # %bb.0: ; RV32IXQCI-NEXT: andi a0, a0, 1 -; RV32IXQCI-NEXT: qc.mvnei a2, a0, 0, a1 -; RV32IXQCI-NEXT: mv a0, a2 +; RV32IXQCI-NEXT: qc.mveqi a1, a0, 0, a2 +; RV32IXQCI-NEXT: mv a0, a1 ; RV32IXQCI-NEXT: ret ; ; RV64-LABEL: select_i32_trunc: @@ -93,8 +93,8 @@ define signext i32 @select_i32_param(i1 signext %cond, i32 signext %x, i32 signe ; RV32-XQCICM-LABEL: select_i32_param: ; RV32-XQCICM: # %bb.0: ; RV32-XQCICM-NEXT: andi a0, a0, 1 -; RV32-XQCICM-NEXT: qc.mvnei a2, a0, 0, a1 -; RV32-XQCICM-NEXT: mv a0, a2 +; RV32-XQCICM-NEXT: qc.mveqi a1, a0, 0, a2 +; RV32-XQCICM-NEXT: mv a0, a1 ; RV32-XQCICM-NEXT: ret ; ; RV32-XQCICS-LABEL: select_i32_param: @@ -106,8 +106,8 @@ define signext i32 @select_i32_param(i1 signext %cond, i32 signext %x, i32 signe ; RV32IXQCI-LABEL: select_i32_param: ; RV32IXQCI: # %bb.0: ; RV32IXQCI-NEXT: andi a0, a0, 1 -; RV32IXQCI-NEXT: qc.mvnei a2, a0, 0, a1 -; RV32IXQCI-NEXT: mv a0, a2 +; RV32IXQCI-NEXT: qc.mveqi a1, a0, 0, a2 +; 
RV32IXQCI-NEXT: mv a0, a1 ; RV32IXQCI-NEXT: ret ; ; RV64-LABEL: select_i32_param: @@ -148,8 +148,8 @@ define signext i32 @select_i32_eq(i32 signext %a, i32 signext %b, i32 signext %x ; ; RV32-XQCICM-LABEL: select_i32_eq: ; RV32-XQCICM: # %bb.0: -; RV32-XQCICM-NEXT: qc.mveq a3, a0, a1, a2 -; RV32-XQCICM-NEXT: mv a0, a3 +; RV32-XQCICM-NEXT: qc.mvne a2, a0, a1, a3 +; RV32-XQCICM-NEXT: mv a0, a2 ; RV32-XQCICM-NEXT: ret ; ; RV32-XQCICS-LABEL: select_i32_eq: @@ -163,8 +163,8 @@ define signext i32 @select_i32_eq(i32 signext %a, i32 signext %b, i32 signext %x ; ; RV32IXQCI-LABEL: select_i32_eq: ; RV32IXQCI: # %bb.0: -; RV32IXQCI-NEXT: qc.mveq a3, a0, a1, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvne a2, a0, a1, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret ; ; RV64-LABEL: select_i32_eq: @@ -205,8 +205,8 @@ define signext i32 @select_i32_ne(i32 signext %a, i32 signext %b, i32 signext %x ; ; RV32-XQCICM-LABEL: select_i32_ne: ; RV32-XQCICM: # %bb.0: -; RV32-XQCICM-NEXT: qc.mvne a3, a0, a1, a2 -; RV32-XQCICM-NEXT: mv a0, a3 +; RV32-XQCICM-NEXT: qc.mveq a2, a0, a1, a3 +; RV32-XQCICM-NEXT: mv a0, a2 ; RV32-XQCICM-NEXT: ret ; ; RV32-XQCICS-LABEL: select_i32_ne: @@ -220,8 +220,8 @@ define signext i32 @select_i32_ne(i32 signext %a, i32 signext %b, i32 signext %x ; ; RV32IXQCI-LABEL: select_i32_ne: ; RV32IXQCI: # %bb.0: -; RV32IXQCI-NEXT: qc.mvne a3, a0, a1, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mveq a2, a0, a1, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret ; ; RV64-LABEL: select_i32_ne: @@ -262,8 +262,8 @@ define signext i32 @select_i32_ugt(i32 signext %a, i32 signext %b, i32 signext % ; ; RV32-XQCICM-LABEL: select_i32_ugt: ; RV32-XQCICM: # %bb.0: -; RV32-XQCICM-NEXT: qc.mvltu a3, a1, a0, a2 -; RV32-XQCICM-NEXT: mv a0, a3 +; RV32-XQCICM-NEXT: qc.mvgeu a2, a1, a0, a3 +; RV32-XQCICM-NEXT: mv a0, a2 ; RV32-XQCICM-NEXT: ret ; ; RV32-XQCICS-LABEL: select_i32_ugt: @@ -277,8 +277,8 @@ define signext i32 @select_i32_ugt(i32 signext %a, i32 signext %b, i32 signext % ; ; RV32IXQCI-LABEL: select_i32_ugt: ; RV32IXQCI: # %bb.0: -; RV32IXQCI-NEXT: qc.mvltu a3, a1, a0, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvgeu a2, a1, a0, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret ; ; RV64-LABEL: select_i32_ugt: @@ -319,8 +319,8 @@ define signext i32 @select_i32_uge(i32 signext %a, i32 signext %b, i32 signext % ; ; RV32-XQCICM-LABEL: select_i32_uge: ; RV32-XQCICM: # %bb.0: -; RV32-XQCICM-NEXT: qc.mvgeu a3, a0, a1, a2 -; RV32-XQCICM-NEXT: mv a0, a3 +; RV32-XQCICM-NEXT: qc.mvltu a2, a0, a1, a3 +; RV32-XQCICM-NEXT: mv a0, a2 ; RV32-XQCICM-NEXT: ret ; ; RV32-XQCICS-LABEL: select_i32_uge: @@ -334,8 +334,8 @@ define signext i32 @select_i32_uge(i32 signext %a, i32 signext %b, i32 signext % ; ; RV32IXQCI-LABEL: select_i32_uge: ; RV32IXQCI: # %bb.0: -; RV32IXQCI-NEXT: qc.mvgeu a3, a0, a1, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvltu a2, a0, a1, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret ; ; RV64-LABEL: select_i32_uge: @@ -376,8 +376,8 @@ define signext i32 @select_i32_ult(i32 signext %a, i32 signext %b, i32 signext % ; ; RV32-XQCICM-LABEL: select_i32_ult: ; RV32-XQCICM: # %bb.0: -; RV32-XQCICM-NEXT: qc.mvltu a3, a0, a1, a2 -; RV32-XQCICM-NEXT: mv a0, a3 +; RV32-XQCICM-NEXT: qc.mvgeu a2, a0, a1, a3 +; RV32-XQCICM-NEXT: mv a0, a2 ; RV32-XQCICM-NEXT: ret ; ; RV32-XQCICS-LABEL: select_i32_ult: @@ -391,8 +391,8 @@ define signext i32 @select_i32_ult(i32 signext %a, i32 signext %b, i32 signext % ; ; RV32IXQCI-LABEL: select_i32_ult: ; RV32IXQCI: # %bb.0: 
-; RV32IXQCI-NEXT: qc.mvltu a3, a0, a1, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvgeu a2, a0, a1, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret ; ; RV64-LABEL: select_i32_ult: @@ -433,8 +433,8 @@ define signext i32 @select_i32_ule(i32 signext %a, i32 signext %b, i32 signext % ; ; RV32-XQCICM-LABEL: select_i32_ule: ; RV32-XQCICM: # %bb.0: -; RV32-XQCICM-NEXT: qc.mvgeu a3, a1, a0, a2 -; RV32-XQCICM-NEXT: mv a0, a3 +; RV32-XQCICM-NEXT: qc.mvltu a2, a1, a0, a3 +; RV32-XQCICM-NEXT: mv a0, a2 ; RV32-XQCICM-NEXT: ret ; ; RV32-XQCICS-LABEL: select_i32_ule: @@ -448,8 +448,8 @@ define signext i32 @select_i32_ule(i32 signext %a, i32 signext %b, i32 signext % ; ; RV32IXQCI-LABEL: select_i32_ule: ; RV32IXQCI: # %bb.0: -; RV32IXQCI-NEXT: qc.mvgeu a3, a1, a0, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvltu a2, a1, a0, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret ; ; RV64-LABEL: select_i32_ule: @@ -490,8 +490,8 @@ define signext i32 @select_i32_sgt(i32 signext %a, i32 signext %b, i32 signext % ; ; RV32-XQCICM-LABEL: select_i32_sgt: ; RV32-XQCICM: # %bb.0: -; RV32-XQCICM-NEXT: qc.mvlt a3, a1, a0, a2 -; RV32-XQCICM-NEXT: mv a0, a3 +; RV32-XQCICM-NEXT: qc.mvge a2, a1, a0, a3 +; RV32-XQCICM-NEXT: mv a0, a2 ; RV32-XQCICM-NEXT: ret ; ; RV32-XQCICS-LABEL: select_i32_sgt: @@ -505,8 +505,8 @@ define signext i32 @select_i32_sgt(i32 signext %a, i32 signext %b, i32 signext % ; ; RV32IXQCI-LABEL: select_i32_sgt: ; RV32IXQCI: # %bb.0: -; RV32IXQCI-NEXT: qc.mvlt a3, a1, a0, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvge a2, a1, a0, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret ; ; RV64-LABEL: select_i32_sgt: @@ -547,8 +547,8 @@ define signext i32 @select_i32_sge(i32 signext %a, i32 signext %b, i32 signext % ; ; RV32-XQCICM-LABEL: select_i32_sge: ; RV32-XQCICM: # %bb.0: -; RV32-XQCICM-NEXT: qc.mvge a3, a0, a1, a2 -; RV32-XQCICM-NEXT: mv a0, a3 +; RV32-XQCICM-NEXT: qc.mvlt a2, a0, a1, a3 +; RV32-XQCICM-NEXT: mv a0, a2 ; RV32-XQCICM-NEXT: ret ; ; RV32-XQCICS-LABEL: select_i32_sge: @@ -562,8 +562,8 @@ define signext i32 @select_i32_sge(i32 signext %a, i32 signext %b, i32 signext % ; ; RV32IXQCI-LABEL: select_i32_sge: ; RV32IXQCI: # %bb.0: -; RV32IXQCI-NEXT: qc.mvge a3, a0, a1, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvlt a2, a0, a1, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret ; ; RV64-LABEL: select_i32_sge: @@ -604,8 +604,8 @@ define signext i32 @select_i32_slt(i32 signext %a, i32 signext %b, i32 signext % ; ; RV32-XQCICM-LABEL: select_i32_slt: ; RV32-XQCICM: # %bb.0: -; RV32-XQCICM-NEXT: qc.mvlt a3, a0, a1, a2 -; RV32-XQCICM-NEXT: mv a0, a3 +; RV32-XQCICM-NEXT: qc.mvge a2, a0, a1, a3 +; RV32-XQCICM-NEXT: mv a0, a2 ; RV32-XQCICM-NEXT: ret ; ; RV32-XQCICS-LABEL: select_i32_slt: @@ -619,8 +619,8 @@ define signext i32 @select_i32_slt(i32 signext %a, i32 signext %b, i32 signext % ; ; RV32IXQCI-LABEL: select_i32_slt: ; RV32IXQCI: # %bb.0: -; RV32IXQCI-NEXT: qc.mvlt a3, a0, a1, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvge a2, a0, a1, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret ; ; RV64-LABEL: select_i32_slt: @@ -661,8 +661,8 @@ define signext i32 @select_i32_sle(i32 signext %a, i32 signext %b, i32 signext % ; ; RV32-XQCICM-LABEL: select_i32_sle: ; RV32-XQCICM: # %bb.0: -; RV32-XQCICM-NEXT: qc.mvge a3, a1, a0, a2 -; RV32-XQCICM-NEXT: mv a0, a3 +; RV32-XQCICM-NEXT: qc.mvlt a2, a1, a0, a3 +; RV32-XQCICM-NEXT: mv a0, a2 ; RV32-XQCICM-NEXT: ret ; ; RV32-XQCICS-LABEL: select_i32_sle: @@ -676,8 +676,8 @@ define signext i32 
@select_i32_sle(i32 signext %a, i32 signext %b, i32 signext % ; ; RV32IXQCI-LABEL: select_i32_sle: ; RV32IXQCI: # %bb.0: -; RV32IXQCI-NEXT: qc.mvge a3, a1, a0, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvlt a2, a1, a0, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret ; ; RV64-LABEL: select_i32_sle: @@ -723,11 +723,11 @@ define i64 @select_i64_trunc(i64 %cond, i64 %x, i64 %y) nounwind { ; ; RV32-XQCICM-LABEL: select_i64_trunc: ; RV32-XQCICM: # %bb.0: -; RV32-XQCICM-NEXT: mv a1, a5 +; RV32-XQCICM-NEXT: mv a1, a3 ; RV32-XQCICM-NEXT: andi a0, a0, 1 -; RV32-XQCICM-NEXT: qc.mvnei a4, a0, 0, a2 -; RV32-XQCICM-NEXT: qc.mvnei a1, a0, 0, a3 -; RV32-XQCICM-NEXT: mv a0, a4 +; RV32-XQCICM-NEXT: qc.mveqi a2, a0, 0, a4 +; RV32-XQCICM-NEXT: qc.mveqi a1, a0, 0, a5 +; RV32-XQCICM-NEXT: mv a0, a2 ; RV32-XQCICM-NEXT: ret ; ; RV32-XQCICS-LABEL: select_i64_trunc: @@ -740,11 +740,11 @@ define i64 @select_i64_trunc(i64 %cond, i64 %x, i64 %y) nounwind { ; ; RV32IXQCI-LABEL: select_i64_trunc: ; RV32IXQCI: # %bb.0: -; RV32IXQCI-NEXT: mv a1, a5 +; RV32IXQCI-NEXT: mv a1, a3 ; RV32IXQCI-NEXT: andi a0, a0, 1 -; RV32IXQCI-NEXT: qc.mvnei a4, a0, 0, a2 -; RV32IXQCI-NEXT: qc.mvnei a1, a0, 0, a3 -; RV32IXQCI-NEXT: mv a0, a4 +; RV32IXQCI-NEXT: qc.mveqi a2, a0, 0, a4 +; RV32IXQCI-NEXT: qc.mveqi a1, a0, 0, a5 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret ; ; RV64-LABEL: select_i64_trunc: @@ -792,10 +792,10 @@ define i64 @select_i64_param(i1 %cond, i64 %x, i64 %y) nounwind { ; RV32-XQCICM-LABEL: select_i64_param: ; RV32-XQCICM: # %bb.0: ; RV32-XQCICM-NEXT: andi a0, a0, 1 -; RV32-XQCICM-NEXT: qc.mvnei a3, a0, 0, a1 -; RV32-XQCICM-NEXT: qc.mvnei a4, a0, 0, a2 -; RV32-XQCICM-NEXT: mv a0, a3 -; RV32-XQCICM-NEXT: mv a1, a4 +; RV32-XQCICM-NEXT: qc.mveqi a1, a0, 0, a3 +; RV32-XQCICM-NEXT: qc.mveqi a2, a0, 0, a4 +; RV32-XQCICM-NEXT: mv a0, a1 +; RV32-XQCICM-NEXT: mv a1, a2 ; RV32-XQCICM-NEXT: ret ; ; RV32-XQCICS-LABEL: select_i64_param: @@ -810,10 +810,10 @@ define i64 @select_i64_param(i1 %cond, i64 %x, i64 %y) nounwind { ; RV32IXQCI-LABEL: select_i64_param: ; RV32IXQCI: # %bb.0: ; RV32IXQCI-NEXT: andi a0, a0, 1 -; RV32IXQCI-NEXT: qc.mvnei a3, a0, 0, a1 -; RV32IXQCI-NEXT: qc.mvnei a4, a0, 0, a2 -; RV32IXQCI-NEXT: mv a0, a3 -; RV32IXQCI-NEXT: mv a1, a4 +; RV32IXQCI-NEXT: qc.mveqi a1, a0, 0, a3 +; RV32IXQCI-NEXT: qc.mveqi a2, a0, 0, a4 +; RV32IXQCI-NEXT: mv a0, a1 +; RV32IXQCI-NEXT: mv a1, a2 ; RV32IXQCI-NEXT: ret ; ; RV64-LABEL: select_i64_param: @@ -866,10 +866,10 @@ define i64 @select_i64_eq(i64 %a, i64 %b, i64 %x, i64 %y) nounwind { ; RV32-XQCICM-NEXT: xor a1, a1, a3 ; RV32-XQCICM-NEXT: xor a0, a0, a2 ; RV32-XQCICM-NEXT: or a0, a0, a1 -; RV32-XQCICM-NEXT: qc.mveqi a6, a0, 0, a4 -; RV32-XQCICM-NEXT: qc.mveqi a7, a0, 0, a5 -; RV32-XQCICM-NEXT: mv a0, a6 -; RV32-XQCICM-NEXT: mv a1, a7 +; RV32-XQCICM-NEXT: qc.mvnei a4, a0, 0, a6 +; RV32-XQCICM-NEXT: qc.mvnei a5, a0, 0, a7 +; RV32-XQCICM-NEXT: mv a0, a4 +; RV32-XQCICM-NEXT: mv a1, a5 ; RV32-XQCICM-NEXT: ret ; ; RV32-XQCICS-LABEL: select_i64_eq: @@ -887,10 +887,10 @@ define i64 @select_i64_eq(i64 %a, i64 %b, i64 %x, i64 %y) nounwind { ; RV32IXQCI-NEXT: xor a1, a1, a3 ; RV32IXQCI-NEXT: xor a0, a0, a2 ; RV32IXQCI-NEXT: or a0, a0, a1 -; RV32IXQCI-NEXT: qc.mveqi a6, a0, 0, a4 -; RV32IXQCI-NEXT: qc.mveqi a7, a0, 0, a5 -; RV32IXQCI-NEXT: mv a0, a6 -; RV32IXQCI-NEXT: mv a1, a7 +; RV32IXQCI-NEXT: qc.mvnei a4, a0, 0, a6 +; RV32IXQCI-NEXT: qc.mvnei a5, a0, 0, a7 +; RV32IXQCI-NEXT: mv a0, a4 +; RV32IXQCI-NEXT: mv a1, a5 ; RV32IXQCI-NEXT: ret ; ; RV64-LABEL: select_i64_eq: @@ 
-943,10 +943,10 @@ define i64 @select_i64_ne(i64 %a, i64 %b, i64 %x, i64 %y) nounwind { ; RV32-XQCICM-NEXT: xor a1, a1, a3 ; RV32-XQCICM-NEXT: xor a0, a0, a2 ; RV32-XQCICM-NEXT: or a0, a0, a1 -; RV32-XQCICM-NEXT: qc.mvnei a6, a0, 0, a4 -; RV32-XQCICM-NEXT: qc.mvnei a7, a0, 0, a5 -; RV32-XQCICM-NEXT: mv a0, a6 -; RV32-XQCICM-NEXT: mv a1, a7 +; RV32-XQCICM-NEXT: qc.mveqi a4, a0, 0, a6 +; RV32-XQCICM-NEXT: qc.mveqi a5, a0, 0, a7 +; RV32-XQCICM-NEXT: mv a0, a4 +; RV32-XQCICM-NEXT: mv a1, a5 ; RV32-XQCICM-NEXT: ret ; ; RV32-XQCICS-LABEL: select_i64_ne: @@ -964,10 +964,10 @@ define i64 @select_i64_ne(i64 %a, i64 %b, i64 %x, i64 %y) nounwind { ; RV32IXQCI-NEXT: xor a1, a1, a3 ; RV32IXQCI-NEXT: xor a0, a0, a2 ; RV32IXQCI-NEXT: or a0, a0, a1 -; RV32IXQCI-NEXT: qc.mvnei a6, a0, 0, a4 -; RV32IXQCI-NEXT: qc.mvnei a7, a0, 0, a5 -; RV32IXQCI-NEXT: mv a0, a6 -; RV32IXQCI-NEXT: mv a1, a7 +; RV32IXQCI-NEXT: qc.mveqi a4, a0, 0, a6 +; RV32IXQCI-NEXT: qc.mveqi a5, a0, 0, a7 +; RV32IXQCI-NEXT: mv a0, a4 +; RV32IXQCI-NEXT: mv a1, a5 ; RV32IXQCI-NEXT: ret ; ; RV64-LABEL: select_i64_ne: @@ -1025,10 +1025,10 @@ define i64 @select_i64_ugt(i64 %a, i64 %b, i64 %x, i64 %y) nounwind { ; RV32-XQCICM-NEXT: sltu a0, a2, a0 ; RV32-XQCICM-NEXT: sltu a2, a3, a1 ; RV32-XQCICM-NEXT: qc.mveq a2, a1, a3, a0 -; RV32-XQCICM-NEXT: qc.mvnei a6, a2, 0, a4 -; RV32-XQCICM-NEXT: qc.mvnei a7, a2, 0, a5 -; RV32-XQCICM-NEXT: mv a0, a6 -; RV32-XQCICM-NEXT: mv a1, a7 +; RV32-XQCICM-NEXT: qc.mveqi a4, a2, 0, a6 +; RV32-XQCICM-NEXT: qc.mveqi a5, a2, 0, a7 +; RV32-XQCICM-NEXT: mv a0, a4 +; RV32-XQCICM-NEXT: mv a1, a5 ; RV32-XQCICM-NEXT: ret ; ; RV32-XQCICS-LABEL: select_i64_ugt: @@ -1050,10 +1050,10 @@ define i64 @select_i64_ugt(i64 %a, i64 %b, i64 %x, i64 %y) nounwind { ; RV32IXQCI-NEXT: sltu a0, a2, a0 ; RV32IXQCI-NEXT: sltu a2, a3, a1 ; RV32IXQCI-NEXT: qc.mveq a2, a1, a3, a0 -; RV32IXQCI-NEXT: qc.mvnei a6, a2, 0, a4 -; RV32IXQCI-NEXT: qc.mvnei a7, a2, 0, a5 -; RV32IXQCI-NEXT: mv a0, a6 -; RV32IXQCI-NEXT: mv a1, a7 +; RV32IXQCI-NEXT: qc.mveqi a4, a2, 0, a6 +; RV32IXQCI-NEXT: qc.mveqi a5, a2, 0, a7 +; RV32IXQCI-NEXT: mv a0, a4 +; RV32IXQCI-NEXT: mv a1, a5 ; RV32IXQCI-NEXT: ret ; ; RV64-LABEL: select_i64_ugt: @@ -1111,10 +1111,10 @@ define i64 @select_i64_uge(i64 %a, i64 %b, i64 %x, i64 %y) nounwind { ; RV32-XQCICM-NEXT: sltu a0, a0, a2 ; RV32-XQCICM-NEXT: sltu a2, a1, a3 ; RV32-XQCICM-NEXT: qc.mveq a2, a1, a3, a0 -; RV32-XQCICM-NEXT: qc.mveqi a6, a2, 0, a4 -; RV32-XQCICM-NEXT: qc.mveqi a7, a2, 0, a5 -; RV32-XQCICM-NEXT: mv a0, a6 -; RV32-XQCICM-NEXT: mv a1, a7 +; RV32-XQCICM-NEXT: qc.mvnei a4, a2, 0, a6 +; RV32-XQCICM-NEXT: qc.mvnei a5, a2, 0, a7 +; RV32-XQCICM-NEXT: mv a0, a4 +; RV32-XQCICM-NEXT: mv a1, a5 ; RV32-XQCICM-NEXT: ret ; ; RV32-XQCICS-LABEL: select_i64_uge: @@ -1136,10 +1136,10 @@ define i64 @select_i64_uge(i64 %a, i64 %b, i64 %x, i64 %y) nounwind { ; RV32IXQCI-NEXT: sltu a0, a0, a2 ; RV32IXQCI-NEXT: sltu a2, a1, a3 ; RV32IXQCI-NEXT: qc.mveq a2, a1, a3, a0 -; RV32IXQCI-NEXT: qc.mveqi a6, a2, 0, a4 -; RV32IXQCI-NEXT: qc.mveqi a7, a2, 0, a5 -; RV32IXQCI-NEXT: mv a0, a6 -; RV32IXQCI-NEXT: mv a1, a7 +; RV32IXQCI-NEXT: qc.mvnei a4, a2, 0, a6 +; RV32IXQCI-NEXT: qc.mvnei a5, a2, 0, a7 +; RV32IXQCI-NEXT: mv a0, a4 +; RV32IXQCI-NEXT: mv a1, a5 ; RV32IXQCI-NEXT: ret ; ; RV64-LABEL: select_i64_uge: @@ -1197,10 +1197,10 @@ define i64 @select_i64_ult(i64 %a, i64 %b, i64 %x, i64 %y) nounwind { ; RV32-XQCICM-NEXT: sltu a0, a0, a2 ; RV32-XQCICM-NEXT: sltu a2, a1, a3 ; RV32-XQCICM-NEXT: qc.mveq a2, a1, a3, a0 -; RV32-XQCICM-NEXT: qc.mvnei a6, a2, 
0, a4 -; RV32-XQCICM-NEXT: qc.mvnei a7, a2, 0, a5 -; RV32-XQCICM-NEXT: mv a0, a6 -; RV32-XQCICM-NEXT: mv a1, a7 +; RV32-XQCICM-NEXT: qc.mveqi a4, a2, 0, a6 +; RV32-XQCICM-NEXT: qc.mveqi a5, a2, 0, a7 +; RV32-XQCICM-NEXT: mv a0, a4 +; RV32-XQCICM-NEXT: mv a1, a5 ; RV32-XQCICM-NEXT: ret ; ; RV32-XQCICS-LABEL: select_i64_ult: @@ -1222,10 +1222,10 @@ define i64 @select_i64_ult(i64 %a, i64 %b, i64 %x, i64 %y) nounwind { ; RV32IXQCI-NEXT: sltu a0, a0, a2 ; RV32IXQCI-NEXT: sltu a2, a1, a3 ; RV32IXQCI-NEXT: qc.mveq a2, a1, a3, a0 -; RV32IXQCI-NEXT: qc.mvnei a6, a2, 0, a4 -; RV32IXQCI-NEXT: qc.mvnei a7, a2, 0, a5 -; RV32IXQCI-NEXT: mv a0, a6 -; RV32IXQCI-NEXT: mv a1, a7 +; RV32IXQCI-NEXT: qc.mveqi a4, a2, 0, a6 +; RV32IXQCI-NEXT: qc.mveqi a5, a2, 0, a7 +; RV32IXQCI-NEXT: mv a0, a4 +; RV32IXQCI-NEXT: mv a1, a5 ; RV32IXQCI-NEXT: ret ; ; RV64-LABEL: select_i64_ult: @@ -1283,10 +1283,10 @@ define i64 @select_i64_ule(i64 %a, i64 %b, i64 %x, i64 %y) nounwind { ; RV32-XQCICM-NEXT: sltu a0, a2, a0 ; RV32-XQCICM-NEXT: sltu a2, a3, a1 ; RV32-XQCICM-NEXT: qc.mveq a2, a1, a3, a0 -; RV32-XQCICM-NEXT: qc.mveqi a6, a2, 0, a4 -; RV32-XQCICM-NEXT: qc.mveqi a7, a2, 0, a5 -; RV32-XQCICM-NEXT: mv a0, a6 -; RV32-XQCICM-NEXT: mv a1, a7 +; RV32-XQCICM-NEXT: qc.mvnei a4, a2, 0, a6 +; RV32-XQCICM-NEXT: qc.mvnei a5, a2, 0, a7 +; RV32-XQCICM-NEXT: mv a0, a4 +; RV32-XQCICM-NEXT: mv a1, a5 ; RV32-XQCICM-NEXT: ret ; ; RV32-XQCICS-LABEL: select_i64_ule: @@ -1308,10 +1308,10 @@ define i64 @select_i64_ule(i64 %a, i64 %b, i64 %x, i64 %y) nounwind { ; RV32IXQCI-NEXT: sltu a0, a2, a0 ; RV32IXQCI-NEXT: sltu a2, a3, a1 ; RV32IXQCI-NEXT: qc.mveq a2, a1, a3, a0 -; RV32IXQCI-NEXT: qc.mveqi a6, a2, 0, a4 -; RV32IXQCI-NEXT: qc.mveqi a7, a2, 0, a5 -; RV32IXQCI-NEXT: mv a0, a6 -; RV32IXQCI-NEXT: mv a1, a7 +; RV32IXQCI-NEXT: qc.mvnei a4, a2, 0, a6 +; RV32IXQCI-NEXT: qc.mvnei a5, a2, 0, a7 +; RV32IXQCI-NEXT: mv a0, a4 +; RV32IXQCI-NEXT: mv a1, a5 ; RV32IXQCI-NEXT: ret ; ; RV64-LABEL: select_i64_ule: @@ -1369,10 +1369,10 @@ define i64 @select_i64_sgt(i64 %a, i64 %b, i64 %x, i64 %y) nounwind { ; RV32-XQCICM-NEXT: sltu a0, a2, a0 ; RV32-XQCICM-NEXT: slt a2, a3, a1 ; RV32-XQCICM-NEXT: qc.mveq a2, a1, a3, a0 -; RV32-XQCICM-NEXT: qc.mvnei a6, a2, 0, a4 -; RV32-XQCICM-NEXT: qc.mvnei a7, a2, 0, a5 -; RV32-XQCICM-NEXT: mv a0, a6 -; RV32-XQCICM-NEXT: mv a1, a7 +; RV32-XQCICM-NEXT: qc.mveqi a4, a2, 0, a6 +; RV32-XQCICM-NEXT: qc.mveqi a5, a2, 0, a7 +; RV32-XQCICM-NEXT: mv a0, a4 +; RV32-XQCICM-NEXT: mv a1, a5 ; RV32-XQCICM-NEXT: ret ; ; RV32-XQCICS-LABEL: select_i64_sgt: @@ -1394,10 +1394,10 @@ define i64 @select_i64_sgt(i64 %a, i64 %b, i64 %x, i64 %y) nounwind { ; RV32IXQCI-NEXT: sltu a0, a2, a0 ; RV32IXQCI-NEXT: slt a2, a3, a1 ; RV32IXQCI-NEXT: qc.mveq a2, a1, a3, a0 -; RV32IXQCI-NEXT: qc.mvnei a6, a2, 0, a4 -; RV32IXQCI-NEXT: qc.mvnei a7, a2, 0, a5 -; RV32IXQCI-NEXT: mv a0, a6 -; RV32IXQCI-NEXT: mv a1, a7 +; RV32IXQCI-NEXT: qc.mveqi a4, a2, 0, a6 +; RV32IXQCI-NEXT: qc.mveqi a5, a2, 0, a7 +; RV32IXQCI-NEXT: mv a0, a4 +; RV32IXQCI-NEXT: mv a1, a5 ; RV32IXQCI-NEXT: ret ; ; RV64-LABEL: select_i64_sgt: @@ -1455,10 +1455,10 @@ define i64 @select_i64_sge(i64 %a, i64 %b, i64 %x, i64 %y) nounwind { ; RV32-XQCICM-NEXT: sltu a0, a0, a2 ; RV32-XQCICM-NEXT: slt a2, a1, a3 ; RV32-XQCICM-NEXT: qc.mveq a2, a1, a3, a0 -; RV32-XQCICM-NEXT: qc.mveqi a6, a2, 0, a4 -; RV32-XQCICM-NEXT: qc.mveqi a7, a2, 0, a5 -; RV32-XQCICM-NEXT: mv a0, a6 -; RV32-XQCICM-NEXT: mv a1, a7 +; RV32-XQCICM-NEXT: qc.mvnei a4, a2, 0, a6 +; RV32-XQCICM-NEXT: qc.mvnei a5, a2, 0, a7 +; 
RV32-XQCICM-NEXT: mv a0, a4 +; RV32-XQCICM-NEXT: mv a1, a5 ; RV32-XQCICM-NEXT: ret ; ; RV32-XQCICS-LABEL: select_i64_sge: @@ -1480,10 +1480,10 @@ define i64 @select_i64_sge(i64 %a, i64 %b, i64 %x, i64 %y) nounwind { ; RV32IXQCI-NEXT: sltu a0, a0, a2 ; RV32IXQCI-NEXT: slt a2, a1, a3 ; RV32IXQCI-NEXT: qc.mveq a2, a1, a3, a0 -; RV32IXQCI-NEXT: qc.mveqi a6, a2, 0, a4 -; RV32IXQCI-NEXT: qc.mveqi a7, a2, 0, a5 -; RV32IXQCI-NEXT: mv a0, a6 -; RV32IXQCI-NEXT: mv a1, a7 +; RV32IXQCI-NEXT: qc.mvnei a4, a2, 0, a6 +; RV32IXQCI-NEXT: qc.mvnei a5, a2, 0, a7 +; RV32IXQCI-NEXT: mv a0, a4 +; RV32IXQCI-NEXT: mv a1, a5 ; RV32IXQCI-NEXT: ret ; ; RV64-LABEL: select_i64_sge: @@ -1541,10 +1541,10 @@ define i64 @select_i64_slt(i64 %a, i64 %b, i64 %x, i64 %y) nounwind { ; RV32-XQCICM-NEXT: sltu a0, a0, a2 ; RV32-XQCICM-NEXT: slt a2, a1, a3 ; RV32-XQCICM-NEXT: qc.mveq a2, a1, a3, a0 -; RV32-XQCICM-NEXT: qc.mvnei a6, a2, 0, a4 -; RV32-XQCICM-NEXT: qc.mvnei a7, a2, 0, a5 -; RV32-XQCICM-NEXT: mv a0, a6 -; RV32-XQCICM-NEXT: mv a1, a7 +; RV32-XQCICM-NEXT: qc.mveqi a4, a2, 0, a6 +; RV32-XQCICM-NEXT: qc.mveqi a5, a2, 0, a7 +; RV32-XQCICM-NEXT: mv a0, a4 +; RV32-XQCICM-NEXT: mv a1, a5 ; RV32-XQCICM-NEXT: ret ; ; RV32-XQCICS-LABEL: select_i64_slt: @@ -1566,10 +1566,10 @@ define i64 @select_i64_slt(i64 %a, i64 %b, i64 %x, i64 %y) nounwind { ; RV32IXQCI-NEXT: sltu a0, a0, a2 ; RV32IXQCI-NEXT: slt a2, a1, a3 ; RV32IXQCI-NEXT: qc.mveq a2, a1, a3, a0 -; RV32IXQCI-NEXT: qc.mvnei a6, a2, 0, a4 -; RV32IXQCI-NEXT: qc.mvnei a7, a2, 0, a5 -; RV32IXQCI-NEXT: mv a0, a6 -; RV32IXQCI-NEXT: mv a1, a7 +; RV32IXQCI-NEXT: qc.mveqi a4, a2, 0, a6 +; RV32IXQCI-NEXT: qc.mveqi a5, a2, 0, a7 +; RV32IXQCI-NEXT: mv a0, a4 +; RV32IXQCI-NEXT: mv a1, a5 ; RV32IXQCI-NEXT: ret ; ; RV64-LABEL: select_i64_slt: @@ -1627,10 +1627,10 @@ define i64 @select_i64_sle(i64 %a, i64 %b, i64 %x, i64 %y) nounwind { ; RV32-XQCICM-NEXT: sltu a0, a2, a0 ; RV32-XQCICM-NEXT: slt a2, a3, a1 ; RV32-XQCICM-NEXT: qc.mveq a2, a1, a3, a0 -; RV32-XQCICM-NEXT: qc.mveqi a6, a2, 0, a4 -; RV32-XQCICM-NEXT: qc.mveqi a7, a2, 0, a5 -; RV32-XQCICM-NEXT: mv a0, a6 -; RV32-XQCICM-NEXT: mv a1, a7 +; RV32-XQCICM-NEXT: qc.mvnei a4, a2, 0, a6 +; RV32-XQCICM-NEXT: qc.mvnei a5, a2, 0, a7 +; RV32-XQCICM-NEXT: mv a0, a4 +; RV32-XQCICM-NEXT: mv a1, a5 ; RV32-XQCICM-NEXT: ret ; ; RV32-XQCICS-LABEL: select_i64_sle: @@ -1652,10 +1652,10 @@ define i64 @select_i64_sle(i64 %a, i64 %b, i64 %x, i64 %y) nounwind { ; RV32IXQCI-NEXT: sltu a0, a2, a0 ; RV32IXQCI-NEXT: slt a2, a3, a1 ; RV32IXQCI-NEXT: qc.mveq a2, a1, a3, a0 -; RV32IXQCI-NEXT: qc.mveqi a6, a2, 0, a4 -; RV32IXQCI-NEXT: qc.mveqi a7, a2, 0, a5 -; RV32IXQCI-NEXT: mv a0, a6 -; RV32IXQCI-NEXT: mv a1, a7 +; RV32IXQCI-NEXT: qc.mvnei a4, a2, 0, a6 +; RV32IXQCI-NEXT: qc.mvnei a5, a2, 0, a7 +; RV32IXQCI-NEXT: mv a0, a4 +; RV32IXQCI-NEXT: mv a1, a5 ; RV32IXQCI-NEXT: ret ; ; RV64-LABEL: select_i64_sle: diff --git a/llvm/test/CodeGen/RISCV/select.ll b/llvm/test/CodeGen/RISCV/select.ll index 19fade6..8273c65 100644 --- a/llvm/test/CodeGen/RISCV/select.ll +++ b/llvm/test/CodeGen/RISCV/select.ll @@ -1153,8 +1153,8 @@ define i32 @select_sub_1(i1 zeroext %cond, i32 %a, i32 %b) { ; RV32IXQCI-LABEL: select_sub_1: ; RV32IXQCI: # %bb.0: # %entry ; RV32IXQCI-NEXT: sub a1, a1, a2 -; RV32IXQCI-NEXT: qc.mvnei a2, a0, 0, a1 -; RV32IXQCI-NEXT: mv a0, a2 +; RV32IXQCI-NEXT: qc.mveqi a1, a0, 0, a2 +; RV32IXQCI-NEXT: mv a0, a1 ; RV32IXQCI-NEXT: ret entry: %c = sub i32 %a, %b @@ -1301,9 +1301,9 @@ define i32 @select_sub_4(i1 zeroext %cond, i32 %x) { ; ; RV32IXQCI-LABEL: 
select_sub_4: ; RV32IXQCI: # %bb.0: -; RV32IXQCI-NEXT: addi a1, a1, -128 -; RV32IXQCI-NEXT: li a2, 128 -; RV32IXQCI-NEXT: qc.mvnei a1, a0, 0, a2 +; RV32IXQCI-NEXT: addi a2, a1, -128 +; RV32IXQCI-NEXT: li a1, 128 +; RV32IXQCI-NEXT: qc.mveqi a1, a0, 0, a2 ; RV32IXQCI-NEXT: mv a0, a1 ; RV32IXQCI-NEXT: ret %add = sub i32 %x, 128 @@ -1348,8 +1348,8 @@ define i32 @select_and_1(i1 zeroext %cond, i32 %a, i32 %b) { ; RV32IXQCI-LABEL: select_and_1: ; RV32IXQCI: # %bb.0: # %entry ; RV32IXQCI-NEXT: and a1, a1, a2 -; RV32IXQCI-NEXT: qc.mvnei a2, a0, 0, a1 -; RV32IXQCI-NEXT: mv a0, a2 +; RV32IXQCI-NEXT: qc.mveqi a1, a0, 0, a2 +; RV32IXQCI-NEXT: mv a0, a1 ; RV32IXQCI-NEXT: ret entry: %c = and i32 %a, %b @@ -1493,8 +1493,8 @@ define i32 @select_udiv_1(i1 zeroext %cond, i32 %a, i32 %b) { ; RV32IXQCI-LABEL: select_udiv_1: ; RV32IXQCI: # %bb.0: # %entry ; RV32IXQCI-NEXT: divu a1, a1, a2 -; RV32IXQCI-NEXT: qc.mvnei a2, a0, 0, a1 -; RV32IXQCI-NEXT: mv a0, a2 +; RV32IXQCI-NEXT: qc.mveqi a1, a0, 0, a2 +; RV32IXQCI-NEXT: mv a0, a1 ; RV32IXQCI-NEXT: ret entry: %c = udiv i32 %a, %b @@ -1682,8 +1682,8 @@ define i32 @select_shl_1(i1 zeroext %cond, i32 %a, i32 %b) { ; RV32IXQCI-LABEL: select_shl_1: ; RV32IXQCI: # %bb.0: # %entry ; RV32IXQCI-NEXT: sll a1, a1, a2 -; RV32IXQCI-NEXT: qc.mvnei a2, a0, 0, a1 -; RV32IXQCI-NEXT: mv a0, a2 +; RV32IXQCI-NEXT: qc.mveqi a1, a0, 0, a2 +; RV32IXQCI-NEXT: mv a0, a1 ; RV32IXQCI-NEXT: ret entry: %c = shl i32 %a, %b @@ -1798,8 +1798,8 @@ define i32 @select_ashr_1(i1 zeroext %cond, i32 %a, i32 %b) { ; RV32IXQCI-LABEL: select_ashr_1: ; RV32IXQCI: # %bb.0: # %entry ; RV32IXQCI-NEXT: sra a1, a1, a2 -; RV32IXQCI-NEXT: qc.mvnei a2, a0, 0, a1 -; RV32IXQCI-NEXT: mv a0, a2 +; RV32IXQCI-NEXT: qc.mveqi a1, a0, 0, a2 +; RV32IXQCI-NEXT: mv a0, a1 ; RV32IXQCI-NEXT: ret entry: %c = ashr i32 %a, %b @@ -1914,8 +1914,8 @@ define i32 @select_lshr_1(i1 zeroext %cond, i32 %a, i32 %b) { ; RV32IXQCI-LABEL: select_lshr_1: ; RV32IXQCI: # %bb.0: # %entry ; RV32IXQCI-NEXT: srl a1, a1, a2 -; RV32IXQCI-NEXT: qc.mvnei a2, a0, 0, a1 -; RV32IXQCI-NEXT: mv a0, a2 +; RV32IXQCI-NEXT: qc.mveqi a1, a0, 0, a2 +; RV32IXQCI-NEXT: mv a0, a1 ; RV32IXQCI-NEXT: ret entry: %c = lshr i32 %a, %b @@ -2371,9 +2371,9 @@ define i32 @select_cst5(i1 zeroext %cond) { ; RV32IXQCI-LABEL: select_cst5: ; RV32IXQCI: # %bb.0: ; RV32IXQCI-NEXT: lui a1, 1 -; RV32IXQCI-NEXT: addi a1, a1, -2047 -; RV32IXQCI-NEXT: li a2, 2047 -; RV32IXQCI-NEXT: qc.mvnei a1, a0, 0, a2 +; RV32IXQCI-NEXT: addi a2, a1, -2047 +; RV32IXQCI-NEXT: li a1, 2047 +; RV32IXQCI-NEXT: qc.mveqi a1, a0, 0, a2 ; RV32IXQCI-NEXT: mv a0, a1 ; RV32IXQCI-NEXT: ret %ret = select i1 %cond, i32 2047, i32 2049 @@ -2870,8 +2870,8 @@ define void @select_redundant_czero_eqz1(ptr %0, ptr %1) { ; RV32IXQCI: # %bb.0: # %entry ; RV32IXQCI-NEXT: lui a2, %hi(select_redundant_czero_eqz_data) ; RV32IXQCI-NEXT: addi a2, a2, %lo(select_redundant_czero_eqz_data) -; RV32IXQCI-NEXT: qc.mveqi a0, a0, 0, a2 -; RV32IXQCI-NEXT: sw a0, 0(a1) +; RV32IXQCI-NEXT: qc.mvnei a2, a0, 0, a0 +; RV32IXQCI-NEXT: sw a2, 0(a1) ; RV32IXQCI-NEXT: ret entry: %3 = icmp eq ptr %0, null diff --git a/llvm/test/CodeGen/RISCV/xqcicm.ll b/llvm/test/CodeGen/RISCV/xqcicm.ll index 1741be7..fb48301 100644 --- a/llvm/test/CodeGen/RISCV/xqcicm.ll +++ b/llvm/test/CodeGen/RISCV/xqcicm.ll @@ -23,15 +23,15 @@ define i32 @select_example(i32 %cond, i32 %x, i32 %y) { ; RV32IXQCICM-LABEL: select_example: ; RV32IXQCICM: # %bb.0: # %entry ; RV32IXQCICM-NEXT: andi a0, a0, 1 -; RV32IXQCICM-NEXT: qc.mvnei a2, a0, 0, a1 -; RV32IXQCICM-NEXT: mv a0, a2 +; 
RV32IXQCICM-NEXT: qc.mveqi a1, a0, 0, a2 +; RV32IXQCICM-NEXT: mv a0, a1 ; RV32IXQCICM-NEXT: ret ; ; RV32IXQCI-LABEL: select_example: ; RV32IXQCI: # %bb.0: # %entry ; RV32IXQCI-NEXT: andi a0, a0, 1 -; RV32IXQCI-NEXT: qc.mvnei a2, a0, 0, a1 -; RV32IXQCI-NEXT: mv a0, a2 +; RV32IXQCI-NEXT: qc.mveqi a1, a0, 0, a2 +; RV32IXQCI-NEXT: mv a0, a1 ; RV32IXQCI-NEXT: ret entry: %cond_trunc = trunc i32 %cond to i1 @@ -52,14 +52,14 @@ define i32 @select_cc_example_eq(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCICM-LABEL: select_cc_example_eq: ; RV32IXQCICM: # %bb.0: # %entry -; RV32IXQCICM-NEXT: qc.mveqi a3, a0, 11, a2 -; RV32IXQCICM-NEXT: mv a0, a3 +; RV32IXQCICM-NEXT: qc.mvnei a2, a0, 11, a3 +; RV32IXQCICM-NEXT: mv a0, a2 ; RV32IXQCICM-NEXT: ret ; ; RV32IXQCI-LABEL: select_cc_example_eq: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.mveqi a3, a0, 11, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvnei a2, a0, 11, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp eq i32 %a, 11 @@ -80,14 +80,14 @@ define i32 @select_cc_example_eq1(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCICM-LABEL: select_cc_example_eq1: ; RV32IXQCICM: # %bb.0: # %entry -; RV32IXQCICM-NEXT: qc.mveqi a3, a0, 11, a2 -; RV32IXQCICM-NEXT: mv a0, a3 +; RV32IXQCICM-NEXT: qc.mvnei a2, a0, 11, a3 +; RV32IXQCICM-NEXT: mv a0, a2 ; RV32IXQCICM-NEXT: ret ; ; RV32IXQCI-LABEL: select_cc_example_eq1: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.mveqi a3, a0, 11, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvnei a2, a0, 11, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp eq i32 11, %a @@ -108,14 +108,14 @@ define i32 @select_cc_example_ne(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCICM-LABEL: select_cc_example_ne: ; RV32IXQCICM: # %bb.0: # %entry -; RV32IXQCICM-NEXT: qc.mvnei a3, a0, 11, a2 -; RV32IXQCICM-NEXT: mv a0, a3 +; RV32IXQCICM-NEXT: qc.mveqi a2, a0, 11, a3 +; RV32IXQCICM-NEXT: mv a0, a2 ; RV32IXQCICM-NEXT: ret ; ; RV32IXQCI-LABEL: select_cc_example_ne: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.mvnei a3, a0, 11, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mveqi a2, a0, 11, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp ne i32 %a, 11 @@ -136,14 +136,14 @@ define i32 @select_cc_example_ne1(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCICM-LABEL: select_cc_example_ne1: ; RV32IXQCICM: # %bb.0: # %entry -; RV32IXQCICM-NEXT: qc.mvnei a3, a0, 11, a2 -; RV32IXQCICM-NEXT: mv a0, a3 +; RV32IXQCICM-NEXT: qc.mveqi a2, a0, 11, a3 +; RV32IXQCICM-NEXT: mv a0, a2 ; RV32IXQCICM-NEXT: ret ; ; RV32IXQCI-LABEL: select_cc_example_ne1: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.mvnei a3, a0, 11, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mveqi a2, a0, 11, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp ne i32 11, %a @@ -164,14 +164,14 @@ define i32 @select_cc_example_slt(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCICM-LABEL: select_cc_example_slt: ; RV32IXQCICM: # %bb.0: # %entry -; RV32IXQCICM-NEXT: qc.mvlti a3, a0, 11, a2 -; RV32IXQCICM-NEXT: mv a0, a3 +; RV32IXQCICM-NEXT: qc.mvgei a2, a0, 11, a3 +; RV32IXQCICM-NEXT: mv a0, a2 ; RV32IXQCICM-NEXT: ret ; ; RV32IXQCI-LABEL: select_cc_example_slt: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.mvlti a3, a0, 11, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvgei a2, a0, 11, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp slt i32 %a, 11 @@ -192,14 +192,14 @@ define i32 @select_cc_example_slt1(i32 %a, 
i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCICM-LABEL: select_cc_example_slt1: ; RV32IXQCICM: # %bb.0: # %entry -; RV32IXQCICM-NEXT: qc.mvgei a3, a0, 12, a2 -; RV32IXQCICM-NEXT: mv a0, a3 +; RV32IXQCICM-NEXT: qc.mvlti a2, a0, 12, a3 +; RV32IXQCICM-NEXT: mv a0, a2 ; RV32IXQCICM-NEXT: ret ; ; RV32IXQCI-LABEL: select_cc_example_slt1: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.mvgei a3, a0, 12, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvlti a2, a0, 12, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp slt i32 11, %a @@ -220,14 +220,14 @@ define i32 @select_cc_example_sle(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCICM-LABEL: select_cc_example_sle: ; RV32IXQCICM: # %bb.0: # %entry -; RV32IXQCICM-NEXT: qc.mvlti a3, a0, 12, a2 -; RV32IXQCICM-NEXT: mv a0, a3 +; RV32IXQCICM-NEXT: qc.mvgei a2, a0, 12, a3 +; RV32IXQCICM-NEXT: mv a0, a2 ; RV32IXQCICM-NEXT: ret ; ; RV32IXQCI-LABEL: select_cc_example_sle: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.mvlti a3, a0, 12, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvgei a2, a0, 12, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp sle i32 %a, 11 @@ -248,14 +248,14 @@ define i32 @select_cc_example_sle1(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCICM-LABEL: select_cc_example_sle1: ; RV32IXQCICM: # %bb.0: # %entry -; RV32IXQCICM-NEXT: qc.mvgei a3, a0, 11, a2 -; RV32IXQCICM-NEXT: mv a0, a3 +; RV32IXQCICM-NEXT: qc.mvlti a2, a0, 11, a3 +; RV32IXQCICM-NEXT: mv a0, a2 ; RV32IXQCICM-NEXT: ret ; ; RV32IXQCI-LABEL: select_cc_example_sle1: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.mvgei a3, a0, 11, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvlti a2, a0, 11, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp sle i32 11, %a @@ -276,14 +276,14 @@ define i32 @select_cc_example_sgt(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCICM-LABEL: select_cc_example_sgt: ; RV32IXQCICM: # %bb.0: # %entry -; RV32IXQCICM-NEXT: qc.mvgei a3, a0, 12, a2 -; RV32IXQCICM-NEXT: mv a0, a3 +; RV32IXQCICM-NEXT: qc.mvlti a2, a0, 12, a3 +; RV32IXQCICM-NEXT: mv a0, a2 ; RV32IXQCICM-NEXT: ret ; ; RV32IXQCI-LABEL: select_cc_example_sgt: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.mvgei a3, a0, 12, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvlti a2, a0, 12, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp sgt i32 %a, 11 @@ -304,14 +304,14 @@ define i32 @select_cc_example_sgt1(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCICM-LABEL: select_cc_example_sgt1: ; RV32IXQCICM: # %bb.0: # %entry -; RV32IXQCICM-NEXT: qc.mvlti a3, a0, 11, a2 -; RV32IXQCICM-NEXT: mv a0, a3 +; RV32IXQCICM-NEXT: qc.mvgei a2, a0, 11, a3 +; RV32IXQCICM-NEXT: mv a0, a2 ; RV32IXQCICM-NEXT: ret ; ; RV32IXQCI-LABEL: select_cc_example_sgt1: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.mvlti a3, a0, 11, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvgei a2, a0, 11, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp sgt i32 11, %a @@ -332,14 +332,14 @@ define i32 @select_cc_example_sge(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCICM-LABEL: select_cc_example_sge: ; RV32IXQCICM: # %bb.0: # %entry -; RV32IXQCICM-NEXT: qc.mvgei a3, a0, 11, a2 -; RV32IXQCICM-NEXT: mv a0, a3 +; RV32IXQCICM-NEXT: qc.mvlti a2, a0, 11, a3 +; RV32IXQCICM-NEXT: mv a0, a2 ; RV32IXQCICM-NEXT: ret ; ; RV32IXQCI-LABEL: select_cc_example_sge: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.mvgei a3, a0, 11, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: 
qc.mvlti a2, a0, 11, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp sge i32 %a, 11 @@ -360,14 +360,14 @@ define i32 @select_cc_example_sge1(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCICM-LABEL: select_cc_example_sge1: ; RV32IXQCICM: # %bb.0: # %entry -; RV32IXQCICM-NEXT: qc.mvlti a3, a0, 12, a2 -; RV32IXQCICM-NEXT: mv a0, a3 +; RV32IXQCICM-NEXT: qc.mvgei a2, a0, 12, a3 +; RV32IXQCICM-NEXT: mv a0, a2 ; RV32IXQCICM-NEXT: ret ; ; RV32IXQCI-LABEL: select_cc_example_sge1: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.mvlti a3, a0, 12, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvgei a2, a0, 12, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp sge i32 11, %a @@ -388,14 +388,14 @@ define i32 @select_cc_example_ule(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCICM-LABEL: select_cc_example_ule: ; RV32IXQCICM: # %bb.0: # %entry -; RV32IXQCICM-NEXT: qc.mvltui a3, a0, 12, a2 -; RV32IXQCICM-NEXT: mv a0, a3 +; RV32IXQCICM-NEXT: qc.mvgeui a2, a0, 12, a3 +; RV32IXQCICM-NEXT: mv a0, a2 ; RV32IXQCICM-NEXT: ret ; ; RV32IXQCI-LABEL: select_cc_example_ule: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.mvltui a3, a0, 12, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvgeui a2, a0, 12, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp ule i32 %a, 11 @@ -416,14 +416,14 @@ define i32 @select_cc_example_ule1(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCICM-LABEL: select_cc_example_ule1: ; RV32IXQCICM: # %bb.0: # %entry -; RV32IXQCICM-NEXT: qc.mvgeui a3, a0, 11, a2 -; RV32IXQCICM-NEXT: mv a0, a3 +; RV32IXQCICM-NEXT: qc.mvltui a2, a0, 11, a3 +; RV32IXQCICM-NEXT: mv a0, a2 ; RV32IXQCICM-NEXT: ret ; ; RV32IXQCI-LABEL: select_cc_example_ule1: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.mvgeui a3, a0, 11, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvltui a2, a0, 11, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp ule i32 11, %a @@ -444,14 +444,14 @@ define i32 @select_cc_example_ugt(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCICM-LABEL: select_cc_example_ugt: ; RV32IXQCICM: # %bb.0: # %entry -; RV32IXQCICM-NEXT: qc.mvgeui a3, a0, 12, a2 -; RV32IXQCICM-NEXT: mv a0, a3 +; RV32IXQCICM-NEXT: qc.mvltui a2, a0, 12, a3 +; RV32IXQCICM-NEXT: mv a0, a2 ; RV32IXQCICM-NEXT: ret ; ; RV32IXQCI-LABEL: select_cc_example_ugt: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.mvgeui a3, a0, 12, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvltui a2, a0, 12, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp ugt i32 %a, 11 @@ -472,14 +472,14 @@ define i32 @select_cc_example_ugt1(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCICM-LABEL: select_cc_example_ugt1: ; RV32IXQCICM: # %bb.0: # %entry -; RV32IXQCICM-NEXT: qc.mvltui a3, a0, 11, a2 -; RV32IXQCICM-NEXT: mv a0, a3 +; RV32IXQCICM-NEXT: qc.mvgeui a2, a0, 11, a3 +; RV32IXQCICM-NEXT: mv a0, a2 ; RV32IXQCICM-NEXT: ret ; ; RV32IXQCI-LABEL: select_cc_example_ugt1: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.mvltui a3, a0, 11, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvgeui a2, a0, 11, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp ugt i32 11, %a @@ -500,14 +500,14 @@ define i32 @select_cc_example_ult(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCICM-LABEL: select_cc_example_ult: ; RV32IXQCICM: # %bb.0: # %entry -; RV32IXQCICM-NEXT: qc.mvltui a3, a0, 11, a2 -; RV32IXQCICM-NEXT: mv a0, a3 +; RV32IXQCICM-NEXT: qc.mvgeui a2, a0, 11, a3 +; RV32IXQCICM-NEXT: mv a0, 
a2 ; RV32IXQCICM-NEXT: ret ; ; RV32IXQCI-LABEL: select_cc_example_ult: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.mvltui a3, a0, 11, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvgeui a2, a0, 11, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp ult i32 %a, 11 @@ -528,14 +528,14 @@ define i32 @select_cc_example_ult1(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCICM-LABEL: select_cc_example_ult1: ; RV32IXQCICM: # %bb.0: # %entry -; RV32IXQCICM-NEXT: qc.mvgeui a3, a0, 12, a2 -; RV32IXQCICM-NEXT: mv a0, a3 +; RV32IXQCICM-NEXT: qc.mvltui a2, a0, 12, a3 +; RV32IXQCICM-NEXT: mv a0, a2 ; RV32IXQCICM-NEXT: ret ; ; RV32IXQCI-LABEL: select_cc_example_ult1: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.mvgeui a3, a0, 12, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvltui a2, a0, 12, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp ult i32 11, %a @@ -556,14 +556,14 @@ define i32 @select_cc_example_uge(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCICM-LABEL: select_cc_example_uge: ; RV32IXQCICM: # %bb.0: # %entry -; RV32IXQCICM-NEXT: qc.mvgeui a3, a0, 11, a2 -; RV32IXQCICM-NEXT: mv a0, a3 +; RV32IXQCICM-NEXT: qc.mvltui a2, a0, 11, a3 +; RV32IXQCICM-NEXT: mv a0, a2 ; RV32IXQCICM-NEXT: ret ; ; RV32IXQCI-LABEL: select_cc_example_uge: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.mvgeui a3, a0, 11, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvltui a2, a0, 11, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp uge i32 %a, 11 @@ -584,14 +584,14 @@ define i32 @select_cc_example_uge1(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCICM-LABEL: select_cc_example_uge1: ; RV32IXQCICM: # %bb.0: # %entry -; RV32IXQCICM-NEXT: qc.mvltui a3, a0, 12, a2 -; RV32IXQCICM-NEXT: mv a0, a3 +; RV32IXQCICM-NEXT: qc.mvgeui a2, a0, 12, a3 +; RV32IXQCICM-NEXT: mv a0, a2 ; RV32IXQCICM-NEXT: ret ; ; RV32IXQCI-LABEL: select_cc_example_uge1: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.mvltui a3, a0, 12, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvgeui a2, a0, 12, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp uge i32 11, %a @@ -611,14 +611,14 @@ define i32 @select_cc_example_eq_reg(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCICM-LABEL: select_cc_example_eq_reg: ; RV32IXQCICM: # %bb.0: # %entry -; RV32IXQCICM-NEXT: qc.mveq a3, a0, a1, a2 -; RV32IXQCICM-NEXT: mv a0, a3 +; RV32IXQCICM-NEXT: qc.mvne a2, a0, a1, a3 +; RV32IXQCICM-NEXT: mv a0, a2 ; RV32IXQCICM-NEXT: ret ; ; RV32IXQCI-LABEL: select_cc_example_eq_reg: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.mveq a3, a0, a1, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvne a2, a0, a1, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp eq i32 %a, %b @@ -638,14 +638,14 @@ define i32 @select_cc_example_ne_reg(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCICM-LABEL: select_cc_example_ne_reg: ; RV32IXQCICM: # %bb.0: # %entry -; RV32IXQCICM-NEXT: qc.mvne a3, a0, a1, a2 -; RV32IXQCICM-NEXT: mv a0, a3 +; RV32IXQCICM-NEXT: qc.mveq a2, a0, a1, a3 +; RV32IXQCICM-NEXT: mv a0, a2 ; RV32IXQCICM-NEXT: ret ; ; RV32IXQCI-LABEL: select_cc_example_ne_reg: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.mvne a3, a0, a1, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mveq a2, a0, a1, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp ne i32 %a, %b @@ -665,14 +665,14 @@ define i32 @select_cc_example_slt_reg(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCICM-LABEL: 
select_cc_example_slt_reg: ; RV32IXQCICM: # %bb.0: # %entry -; RV32IXQCICM-NEXT: qc.mvlt a3, a0, a1, a2 -; RV32IXQCICM-NEXT: mv a0, a3 +; RV32IXQCICM-NEXT: qc.mvge a2, a0, a1, a3 +; RV32IXQCICM-NEXT: mv a0, a2 ; RV32IXQCICM-NEXT: ret ; ; RV32IXQCI-LABEL: select_cc_example_slt_reg: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.mvlt a3, a0, a1, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvge a2, a0, a1, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp slt i32 %a, %b @@ -692,14 +692,14 @@ define i32 @select_cc_example_sge_reg(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCICM-LABEL: select_cc_example_sge_reg: ; RV32IXQCICM: # %bb.0: # %entry -; RV32IXQCICM-NEXT: qc.mvge a3, a0, a1, a2 -; RV32IXQCICM-NEXT: mv a0, a3 +; RV32IXQCICM-NEXT: qc.mvlt a2, a0, a1, a3 +; RV32IXQCICM-NEXT: mv a0, a2 ; RV32IXQCICM-NEXT: ret ; ; RV32IXQCI-LABEL: select_cc_example_sge_reg: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.mvge a3, a0, a1, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvlt a2, a0, a1, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp sge i32 %a, %b @@ -719,14 +719,14 @@ define i32 @select_cc_example_sgt_reg(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCICM-LABEL: select_cc_example_sgt_reg: ; RV32IXQCICM: # %bb.0: # %entry -; RV32IXQCICM-NEXT: qc.mvlt a3, a1, a0, a2 -; RV32IXQCICM-NEXT: mv a0, a3 +; RV32IXQCICM-NEXT: qc.mvge a2, a1, a0, a3 +; RV32IXQCICM-NEXT: mv a0, a2 ; RV32IXQCICM-NEXT: ret ; ; RV32IXQCI-LABEL: select_cc_example_sgt_reg: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.mvlt a3, a1, a0, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvge a2, a1, a0, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp sgt i32 %a, %b @@ -746,14 +746,14 @@ define i32 @select_cc_example_sle_reg(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCICM-LABEL: select_cc_example_sle_reg: ; RV32IXQCICM: # %bb.0: # %entry -; RV32IXQCICM-NEXT: qc.mvge a3, a1, a0, a2 -; RV32IXQCICM-NEXT: mv a0, a3 +; RV32IXQCICM-NEXT: qc.mvlt a2, a1, a0, a3 +; RV32IXQCICM-NEXT: mv a0, a2 ; RV32IXQCICM-NEXT: ret ; ; RV32IXQCI-LABEL: select_cc_example_sle_reg: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.mvge a3, a1, a0, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvlt a2, a1, a0, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp sle i32 %a, %b @@ -773,14 +773,14 @@ define i32 @select_cc_example_ugt_reg(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCICM-LABEL: select_cc_example_ugt_reg: ; RV32IXQCICM: # %bb.0: # %entry -; RV32IXQCICM-NEXT: qc.mvltu a3, a1, a0, a2 -; RV32IXQCICM-NEXT: mv a0, a3 +; RV32IXQCICM-NEXT: qc.mvgeu a2, a1, a0, a3 +; RV32IXQCICM-NEXT: mv a0, a2 ; RV32IXQCICM-NEXT: ret ; ; RV32IXQCI-LABEL: select_cc_example_ugt_reg: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.mvltu a3, a1, a0, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvgeu a2, a1, a0, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp ugt i32 %a, %b @@ -800,14 +800,14 @@ define i32 @select_cc_example_ult_reg(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCICM-LABEL: select_cc_example_ult_reg: ; RV32IXQCICM: # %bb.0: # %entry -; RV32IXQCICM-NEXT: qc.mvltu a3, a0, a1, a2 -; RV32IXQCICM-NEXT: mv a0, a3 +; RV32IXQCICM-NEXT: qc.mvgeu a2, a0, a1, a3 +; RV32IXQCICM-NEXT: mv a0, a2 ; RV32IXQCICM-NEXT: ret ; ; RV32IXQCI-LABEL: select_cc_example_ult_reg: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.mvltu a3, a0, a1, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: 
qc.mvgeu a2, a0, a1, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp ult i32 %a, %b @@ -827,14 +827,14 @@ define i32 @select_cc_example_uge_reg(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCICM-LABEL: select_cc_example_uge_reg: ; RV32IXQCICM: # %bb.0: # %entry -; RV32IXQCICM-NEXT: qc.mvgeu a3, a0, a1, a2 -; RV32IXQCICM-NEXT: mv a0, a3 +; RV32IXQCICM-NEXT: qc.mvltu a2, a0, a1, a3 +; RV32IXQCICM-NEXT: mv a0, a2 ; RV32IXQCICM-NEXT: ret ; ; RV32IXQCI-LABEL: select_cc_example_uge_reg: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.mvgeu a3, a0, a1, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvltu a2, a0, a1, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp uge i32 %a, %b @@ -854,14 +854,14 @@ define i32 @select_cc_example_ule_reg(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCICM-LABEL: select_cc_example_ule_reg: ; RV32IXQCICM: # %bb.0: # %entry -; RV32IXQCICM-NEXT: qc.mvgeu a3, a1, a0, a2 -; RV32IXQCICM-NEXT: mv a0, a3 +; RV32IXQCICM-NEXT: qc.mvltu a2, a1, a0, a3 +; RV32IXQCICM-NEXT: mv a0, a2 ; RV32IXQCICM-NEXT: ret ; ; RV32IXQCI-LABEL: select_cc_example_ule_reg: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.mvgeu a3, a1, a0, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvltu a2, a1, a0, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp ule i32 %a, %b @@ -883,18 +883,263 @@ define i32 @select_cc_example_ule_neg(i32 %a, i32 %b, i32 %x, i32 %y) { ; RV32IXQCICM-LABEL: select_cc_example_ule_neg: ; RV32IXQCICM: # %bb.0: # %entry ; RV32IXQCICM-NEXT: li a1, -10 -; RV32IXQCICM-NEXT: qc.mvltu a3, a0, a1, a2 -; RV32IXQCICM-NEXT: mv a0, a3 +; RV32IXQCICM-NEXT: qc.mvgeu a2, a0, a1, a3 +; RV32IXQCICM-NEXT: mv a0, a2 ; RV32IXQCICM-NEXT: ret ; ; RV32IXQCI-LABEL: select_cc_example_ule_neg: ; RV32IXQCI: # %bb.0: # %entry ; RV32IXQCI-NEXT: li a1, -10 -; RV32IXQCI-NEXT: qc.mvltu a3, a0, a1, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvgeu a2, a0, a1, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp ule i32 %a, -11 %sel = select i1 %cmp, i32 %x, i32 %y ret i32 %sel } + +define i32 @select_cc_example_eq_mv(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_eq_mv: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: beq a2, a1, .LBB32_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: mv a0, a3 +; RV32I-NEXT: .LBB32_2: # %entry +; RV32I-NEXT: ret +; +; RV32IXQCICM-LABEL: select_cc_example_eq_mv: +; RV32IXQCICM: # %bb.0: # %entry +; RV32IXQCICM-NEXT: qc.mvne a0, a2, a1, a3 +; RV32IXQCICM-NEXT: ret +; +; RV32IXQCI-LABEL: select_cc_example_eq_mv: +; RV32IXQCI: # %bb.0: # %entry +; RV32IXQCI-NEXT: qc.mvne a0, a2, a1, a3 +; RV32IXQCI-NEXT: ret +entry: + %cmp = icmp eq i32 %x, %b + %sel = select i1 %cmp, i32 %a, i32 %y + ret i32 %sel +} + +define i32 @select_cc_example_lt_mv(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_lt_mv: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: blt a2, a1, .LBB33_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: mv a0, a3 +; RV32I-NEXT: .LBB33_2: # %entry +; RV32I-NEXT: ret +; +; RV32IXQCICM-LABEL: select_cc_example_lt_mv: +; RV32IXQCICM: # %bb.0: # %entry +; RV32IXQCICM-NEXT: qc.mvge a0, a2, a1, a3 +; RV32IXQCICM-NEXT: ret +; +; RV32IXQCI-LABEL: select_cc_example_lt_mv: +; RV32IXQCI: # %bb.0: # %entry +; RV32IXQCI-NEXT: qc.mvge a0, a2, a1, a3 +; RV32IXQCI-NEXT: ret +entry: + %cmp = icmp slt i32 %x, %b + %sel = select i1 %cmp, i32 %a, i32 %y + ret i32 %sel +} + +define i32 @select_cc_example_ge_mv(i32 %a, i32 %b, i32 %x, i32 %y) { 
+; RV32I-LABEL: select_cc_example_ge_mv: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: bge a2, a1, .LBB34_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: mv a0, a3 +; RV32I-NEXT: .LBB34_2: # %entry +; RV32I-NEXT: ret +; +; RV32IXQCICM-LABEL: select_cc_example_ge_mv: +; RV32IXQCICM: # %bb.0: # %entry +; RV32IXQCICM-NEXT: qc.mvlt a0, a2, a1, a3 +; RV32IXQCICM-NEXT: ret +; +; RV32IXQCI-LABEL: select_cc_example_ge_mv: +; RV32IXQCI: # %bb.0: # %entry +; RV32IXQCI-NEXT: qc.mvlt a0, a2, a1, a3 +; RV32IXQCI-NEXT: ret +entry: + %cmp = icmp sge i32 %x, %b + %sel = select i1 %cmp, i32 %a, i32 %y + ret i32 %sel +} + +define i32 @select_cc_example_ult_mv(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_ult_mv: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: bltu a2, a1, .LBB35_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: mv a0, a3 +; RV32I-NEXT: .LBB35_2: # %entry +; RV32I-NEXT: ret +; +; RV32IXQCICM-LABEL: select_cc_example_ult_mv: +; RV32IXQCICM: # %bb.0: # %entry +; RV32IXQCICM-NEXT: qc.mvgeu a0, a2, a1, a3 +; RV32IXQCICM-NEXT: ret +; +; RV32IXQCI-LABEL: select_cc_example_ult_mv: +; RV32IXQCI: # %bb.0: # %entry +; RV32IXQCI-NEXT: qc.mvgeu a0, a2, a1, a3 +; RV32IXQCI-NEXT: ret +entry: + %cmp = icmp ult i32 %x, %b + %sel = select i1 %cmp, i32 %a, i32 %y + ret i32 %sel +} + +define i32 @select_cc_example_uge_mv(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_uge_mv: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: bgeu a2, a1, .LBB36_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: mv a0, a3 +; RV32I-NEXT: .LBB36_2: # %entry +; RV32I-NEXT: ret +; +; RV32IXQCICM-LABEL: select_cc_example_uge_mv: +; RV32IXQCICM: # %bb.0: # %entry +; RV32IXQCICM-NEXT: qc.mvltu a0, a2, a1, a3 +; RV32IXQCICM-NEXT: ret +; +; RV32IXQCI-LABEL: select_cc_example_uge_mv: +; RV32IXQCI: # %bb.0: # %entry +; RV32IXQCI-NEXT: qc.mvltu a0, a2, a1, a3 +; RV32IXQCI-NEXT: ret +entry: + %cmp = icmp uge i32 %x, %b + %sel = select i1 %cmp, i32 %a, i32 %y + ret i32 %sel +} + +define i32 @select_cc_example_eq_imm_mv(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_eq_imm_mv: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: li a1, 11 +; RV32I-NEXT: beq a2, a1, .LBB37_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: mv a0, a3 +; RV32I-NEXT: .LBB37_2: # %entry +; RV32I-NEXT: ret +; +; RV32IXQCICM-LABEL: select_cc_example_eq_imm_mv: +; RV32IXQCICM: # %bb.0: # %entry +; RV32IXQCICM-NEXT: qc.mvnei a0, a2, 11, a3 +; RV32IXQCICM-NEXT: ret +; +; RV32IXQCI-LABEL: select_cc_example_eq_imm_mv: +; RV32IXQCI: # %bb.0: # %entry +; RV32IXQCI-NEXT: qc.mvnei a0, a2, 11, a3 +; RV32IXQCI-NEXT: ret +entry: + %cmp = icmp eq i32 %x, 11 + %sel = select i1 %cmp, i32 %a, i32 %y + ret i32 %sel +} + +define i32 @select_cc_example_lt_imm_mv(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_lt_imm_mv: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: li a1, 11 +; RV32I-NEXT: blt a2, a1, .LBB38_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: mv a0, a3 +; RV32I-NEXT: .LBB38_2: # %entry +; RV32I-NEXT: ret +; +; RV32IXQCICM-LABEL: select_cc_example_lt_imm_mv: +; RV32IXQCICM: # %bb.0: # %entry +; RV32IXQCICM-NEXT: qc.mvgei a0, a2, 11, a3 +; RV32IXQCICM-NEXT: ret +; +; RV32IXQCI-LABEL: select_cc_example_lt_imm_mv: +; RV32IXQCI: # %bb.0: # %entry +; RV32IXQCI-NEXT: qc.mvgei a0, a2, 11, a3 +; RV32IXQCI-NEXT: ret +entry: + %cmp = icmp slt i32 %x, 11 + %sel = select i1 %cmp, i32 %a, i32 %y + ret i32 %sel +} + +define i32 @select_cc_example_ge_imm_mv(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: 
select_cc_example_ge_imm_mv: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: li a1, 10 +; RV32I-NEXT: blt a1, a2, .LBB39_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: mv a0, a3 +; RV32I-NEXT: .LBB39_2: # %entry +; RV32I-NEXT: ret +; +; RV32IXQCICM-LABEL: select_cc_example_ge_imm_mv: +; RV32IXQCICM: # %bb.0: # %entry +; RV32IXQCICM-NEXT: qc.mvlti a0, a2, 11, a3 +; RV32IXQCICM-NEXT: ret +; +; RV32IXQCI-LABEL: select_cc_example_ge_imm_mv: +; RV32IXQCI: # %bb.0: # %entry +; RV32IXQCI-NEXT: qc.mvlti a0, a2, 11, a3 +; RV32IXQCI-NEXT: ret +entry: + %cmp = icmp sge i32 %x, 11 + %sel = select i1 %cmp, i32 %a, i32 %y + ret i32 %sel +} + +define i32 @select_cc_example_ult_imm_mv(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_ult_imm_mv: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: li a1, 11 +; RV32I-NEXT: bltu a2, a1, .LBB40_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: mv a0, a3 +; RV32I-NEXT: .LBB40_2: # %entry +; RV32I-NEXT: ret +; +; RV32IXQCICM-LABEL: select_cc_example_ult_imm_mv: +; RV32IXQCICM: # %bb.0: # %entry +; RV32IXQCICM-NEXT: qc.mvgeui a0, a2, 11, a3 +; RV32IXQCICM-NEXT: ret +; +; RV32IXQCI-LABEL: select_cc_example_ult_imm_mv: +; RV32IXQCI: # %bb.0: # %entry +; RV32IXQCI-NEXT: qc.mvgeui a0, a2, 11, a3 +; RV32IXQCI-NEXT: ret +entry: + %cmp = icmp ult i32 %x, 11 + %sel = select i1 %cmp, i32 %a, i32 %y + ret i32 %sel +} + +define i32 @select_cc_example_uge_imm_mv(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_uge_imm_mv: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: li a1, 10 +; RV32I-NEXT: bltu a1, a2, .LBB41_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: mv a0, a3 +; RV32I-NEXT: .LBB41_2: # %entry +; RV32I-NEXT: ret +; +; RV32IXQCICM-LABEL: select_cc_example_uge_imm_mv: +; RV32IXQCICM: # %bb.0: # %entry +; RV32IXQCICM-NEXT: qc.mvltui a0, a2, 11, a3 +; RV32IXQCICM-NEXT: ret +; +; RV32IXQCI-LABEL: select_cc_example_uge_imm_mv: +; RV32IXQCI: # %bb.0: # %entry +; RV32IXQCI-NEXT: qc.mvltui a0, a2, 11, a3 +; RV32IXQCI-NEXT: ret +entry: + %cmp = icmp uge i32 %x, 11 + %sel = select i1 %cmp, i32 %a, i32 %y + ret i32 %sel +} diff --git a/llvm/test/CodeGen/RISCV/xqcics.ll b/llvm/test/CodeGen/RISCV/xqcics.ll index 38de8fb..5b7ca9e7 100644 --- a/llvm/test/CodeGen/RISCV/xqcics.ll +++ b/llvm/test/CodeGen/RISCV/xqcics.ll @@ -134,14 +134,14 @@ define i32 @select_cc_example_eq(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCICM-LABEL: select_cc_example_eq: ; RV32IXQCICM: # %bb.0: # %entry -; RV32IXQCICM-NEXT: qc.mveqi a3, a0, 11, a2 -; RV32IXQCICM-NEXT: mv a0, a3 +; RV32IXQCICM-NEXT: qc.mvnei a2, a0, 11, a3 +; RV32IXQCICM-NEXT: mv a0, a2 ; RV32IXQCICM-NEXT: ret ; ; RV32IXQCI-LABEL: select_cc_example_eq: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.mveqi a3, a0, 11, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvnei a2, a0, 11, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp eq i32 %a, 11 @@ -167,14 +167,14 @@ define i32 @select_cc_example_eq_c(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCICM-LABEL: select_cc_example_eq_c: ; RV32IXQCICM: # %bb.0: # %entry -; RV32IXQCICM-NEXT: qc.mveqi a3, a0, 11, a2 -; RV32IXQCICM-NEXT: mv a0, a3 +; RV32IXQCICM-NEXT: qc.mvnei a2, a0, 11, a3 +; RV32IXQCICM-NEXT: mv a0, a2 ; RV32IXQCICM-NEXT: ret ; ; RV32IXQCI-LABEL: select_cc_example_eq_c: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.mveqi a3, a0, 11, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvnei a2, a0, 11, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp eq i32 11, %a @@ -200,14 +200,14 @@ 
define i32 @select_cc_example_ne(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCICM-LABEL: select_cc_example_ne: ; RV32IXQCICM: # %bb.0: # %entry -; RV32IXQCICM-NEXT: qc.mvnei a3, a0, 11, a2 -; RV32IXQCICM-NEXT: mv a0, a3 +; RV32IXQCICM-NEXT: qc.mveqi a2, a0, 11, a3 +; RV32IXQCICM-NEXT: mv a0, a2 ; RV32IXQCICM-NEXT: ret ; ; RV32IXQCI-LABEL: select_cc_example_ne: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.mvnei a3, a0, 11, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mveqi a2, a0, 11, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp ne i32 %a, 11 @@ -233,14 +233,14 @@ define i32 @select_cc_example_ne_c(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCICM-LABEL: select_cc_example_ne_c: ; RV32IXQCICM: # %bb.0: # %entry -; RV32IXQCICM-NEXT: qc.mvnei a3, a0, 11, a2 -; RV32IXQCICM-NEXT: mv a0, a3 +; RV32IXQCICM-NEXT: qc.mveqi a2, a0, 11, a3 +; RV32IXQCICM-NEXT: mv a0, a2 ; RV32IXQCICM-NEXT: ret ; ; RV32IXQCI-LABEL: select_cc_example_ne_c: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.mvnei a3, a0, 11, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mveqi a2, a0, 11, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp ne i32 11, %a diff --git a/llvm/test/CodeGen/SPIRV/capability-FloatControl2.ll b/llvm/test/CodeGen/SPIRV/capability-FloatControl2.ll index aa60e13..b4e283e 100644 --- a/llvm/test/CodeGen/SPIRV/capability-FloatControl2.ll +++ b/llvm/test/CodeGen/SPIRV/capability-FloatControl2.ll @@ -8,7 +8,7 @@ ; CHECK-EXT: OpCapability FloatControls2 ; CHECK-EXT: OpExtension "SPV_KHR_float_controls2" -; CHECK-EXT: OpDecorate {{%[0-9]+}} FPFastMathMode NotNaN|NotInf|NSZ|AllowRecip|Fast +; CHECK-EXT: OpDecorate {{%[0-9]+}} FPFastMathMode NotNaN|NotInf|NSZ|AllowRecip|AllowContract|AllowReassoc|AllowTransform define hidden spir_func float @foo(float %0) local_unnamed_addr { %2 = fmul reassoc nnan ninf nsz arcp afn float %0, 2.000000e+00 diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_float_controls2/decoration.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_float_controls2/decoration.ll new file mode 100644 index 0000000..d3fe9e4 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_float_controls2/decoration.ll @@ -0,0 +1,148 @@ +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_KHR_float_controls2 %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_KHR_float_controls2 %s -o - -filetype=obj | spirv-val %} + +; CHECK-DAG: Capability FloatControls2 +; CHECK: Extension "SPV_KHR_float_controls2" + +; CHECK: OpName %[[#addRes:]] "addRes" +; CHECK: OpName %[[#subRes:]] "subRes" +; CHECK: OpName %[[#mulRes:]] "mulRes" +; CHECK: OpName %[[#divRes:]] "divRes" +; CHECK: OpName %[[#remRes:]] "remRes" +; CHECK: OpName %[[#negRes:]] "negRes" +; CHECK: OpName %[[#oeqRes:]] "oeqRes" +; CHECK: OpName %[[#oneRes:]] "oneRes" +; CHECK: OpName %[[#oltRes:]] "oltRes" +; CHECK: OpName %[[#ogtRes:]] "ogtRes" +; CHECK: OpName %[[#oleRes:]] "oleRes" +; CHECK: OpName %[[#ogeRes:]] "ogeRes" +; CHECK: OpName %[[#ordRes:]] "ordRes" +; CHECK: OpName %[[#ueqRes:]] "ueqRes" +; CHECK: OpName %[[#uneRes:]] "uneRes" +; CHECK: OpName %[[#ultRes:]] "ultRes" +; CHECK: OpName %[[#ugtRes:]] "ugtRes" +; CHECK: OpName %[[#uleRes:]] "uleRes" +; CHECK: OpName %[[#ugeRes:]] "ugeRes" +; CHECK: OpName %[[#unoRes:]] "unoRes" +; CHECK: OpName %[[#modRes:]] "modRes" +; CHECK: OpName %[[#maxRes:]] "maxRes" +; CHECK: OpName %[[#maxCommonRes:]] "maxCommonRes" +; CHECK: 
OpName %[[#addResV:]] "addResV" +; CHECK: OpName %[[#subResV:]] "subResV" +; CHECK: OpName %[[#mulResV:]] "mulResV" +; CHECK: OpName %[[#divResV:]] "divResV" +; CHECK: OpName %[[#remResV:]] "remResV" +; CHECK: OpName %[[#negResV:]] "negResV" +; CHECK: OpName %[[#oeqResV:]] "oeqResV" +; CHECK: OpName %[[#oneResV:]] "oneResV" +; CHECK: OpName %[[#oltResV:]] "oltResV" +; CHECK: OpName %[[#ogtResV:]] "ogtResV" +; CHECK: OpName %[[#oleResV:]] "oleResV" +; CHECK: OpName %[[#ogeResV:]] "ogeResV" +; CHECK: OpName %[[#ordResV:]] "ordResV" +; CHECK: OpName %[[#ueqResV:]] "ueqResV" +; CHECK: OpName %[[#uneResV:]] "uneResV" +; CHECK: OpName %[[#ultResV:]] "ultResV" +; CHECK: OpName %[[#ugtResV:]] "ugtResV" +; CHECK: OpName %[[#uleResV:]] "uleResV" +; CHECK: OpName %[[#ugeResV:]] "ugeResV" +; CHECK: OpName %[[#unoResV:]] "unoResV" +; CHECK: OpName %[[#modResV:]] "modResV" +; CHECK: OpName %[[#maxResV:]] "maxResV" +; CHECK: OpName %[[#maxCommonResV:]] "maxCommonResV" +; CHECK: OpDecorate %[[#subRes]] FPFastMathMode NotNaN +; CHECK: OpDecorate %[[#mulRes]] FPFastMathMode NotInf +; CHECK: OpDecorate %[[#divRes]] FPFastMathMode NSZ +; CHECK: OpDecorate %[[#remRes]] FPFastMathMode AllowRecip +; CHECK: OpDecorate %[[#negRes]] FPFastMathMode NotNaN|NotInf|NSZ|AllowRecip|AllowContract|AllowReassoc|AllowTransform +; CHECK: OpDecorate %[[#oeqRes]] FPFastMathMode NotNaN|NotInf +; CHECK: OpDecorate %[[#oltRes]] FPFastMathMode NotNaN +; CHECK: OpDecorate %[[#ogtRes]] FPFastMathMode NotInf +; CHECK: OpDecorate %[[#oleRes]] FPFastMathMode NSZ +; CHECK: OpDecorate %[[#ogeRes]] FPFastMathMode AllowRecip +; CHECK: OpDecorate %[[#ordRes]] FPFastMathMode NotNaN|NotInf|NSZ|AllowRecip|AllowContract|AllowReassoc|AllowTransform +; CHECK: OpDecorate %[[#ueqRes]] FPFastMathMode NotNaN|NotInf +; CHECK: OpDecorate %[[#maxRes]] FPFastMathMode NotNaN|NotInf|NSZ|AllowRecip|AllowContract|AllowReassoc|AllowTransform +; CHECK: OpDecorate %[[#maxCommonRes]] FPFastMathMode NotNaN|NotInf +; CHECK: OpDecorate %[[#subResV]] FPFastMathMode NotNaN +; CHECK: OpDecorate %[[#mulResV]] FPFastMathMode NotInf +; CHECK: OpDecorate %[[#divResV]] FPFastMathMode NSZ +; CHECK: OpDecorate %[[#remResV]] FPFastMathMode AllowRecip +; CHECK: OpDecorate %[[#negResV]] FPFastMathMode NotNaN|NotInf|NSZ|AllowRecip|AllowContract|AllowReassoc|AllowTransform +; CHECK: OpDecorate %[[#oeqResV]] FPFastMathMode NotNaN|NotInf +; CHECK: OpDecorate %[[#oltResV]] FPFastMathMode NotNaN +; CHECK: OpDecorate %[[#ogtResV]] FPFastMathMode NotInf +; CHECK: OpDecorate %[[#oleResV]] FPFastMathMode NSZ +; CHECK: OpDecorate %[[#ogeResV]] FPFastMathMode AllowRecip +; CHECK: OpDecorate %[[#ordResV]] FPFastMathMode NotNaN|NotInf|NSZ|AllowRecip|AllowContract|AllowReassoc|AllowTransform +; CHECK: OpDecorate %[[#ueqResV]] FPFastMathMode NotNaN|NotInf +; CHECK: OpDecorate %[[#maxResV]] FPFastMathMode NotNaN|NotInf|NSZ|AllowRecip|AllowContract|AllowReassoc|AllowTransform +; CHECK: OpDecorate %[[#maxCommonResV]] FPFastMathMode NotNaN|NotInf + +; Function Attrs: convergent mustprogress nofree nounwind willreturn memory(none) +declare spir_func float @_Z4fmodff(float, float) +declare dso_local spir_func noundef nofpclass(nan inf) float @_Z16__spirv_ocl_fmaxff(float noundef nofpclass(nan inf), float noundef nofpclass(nan inf)) local_unnamed_addr #1 +declare dso_local spir_func noundef nofpclass(nan inf) float @_Z23__spirv_ocl_fmax_commonff(float noundef nofpclass(nan inf), float noundef nofpclass(nan inf)) local_unnamed_addr #1 +declare spir_func <2 x float> @_Z4fmodDv2_fDv2_f(<2 x float>, <2 x 
float>) +declare dso_local spir_func noundef nofpclass(nan inf) <2 x float> @_Z16__spirv_ocl_fmaxDv2_fDv2_f(<2 x float> noundef nofpclass(nan inf), <2 x float> noundef nofpclass(nan inf)) local_unnamed_addr #1 +declare dso_local spir_func noundef nofpclass(nan inf) <2 x float> @_Z23__spirv_ocl_fmax_commonDv2_fDv2_f(<2 x float> noundef nofpclass(nan inf), <2 x float> noundef nofpclass(nan inf)) local_unnamed_addr #1 + +; Function Attrs: convergent mustprogress norecurse nounwind +define weak_odr dso_local spir_kernel void @foo(float %1, float %2) { +entry: + %addRes = fadd float %1, %2 + %subRes = fsub nnan float %1, %2 + %mulRes = fmul ninf float %1, %2 + %divRes = fdiv nsz float %1, %2 + %remRes = frem arcp float %1, %2 + %negRes = fneg fast float %1 + %oeqRes = fcmp nnan ninf oeq float %1, %2 + %oneRes = fcmp one float %1, %2, !spirv.Decorations !3 + %oltRes = fcmp nnan olt float %1, %2, !spirv.Decorations !3 + %ogtRes = fcmp ninf ogt float %1, %2, !spirv.Decorations !3 + %oleRes = fcmp nsz ole float %1, %2, !spirv.Decorations !3 + %ogeRes = fcmp arcp oge float %1, %2, !spirv.Decorations !3 + %ordRes = fcmp fast ord float %1, %2, !spirv.Decorations !3 + %ueqRes = fcmp nnan ninf ueq float %1, %2, !spirv.Decorations !3 + %uneRes = fcmp une float %1, %2, !spirv.Decorations !3 + %ultRes = fcmp ult float %1, %2, !spirv.Decorations !3 + %ugtRes = fcmp ugt float %1, %2, !spirv.Decorations !3 + %uleRes = fcmp ule float %1, %2, !spirv.Decorations !3 + %ugeRes = fcmp uge float %1, %2, !spirv.Decorations !3 + %unoRes = fcmp uno float %1, %2, !spirv.Decorations !3 + %modRes = call spir_func float @_Z4fmodff(float %1, float %2) + %maxRes = tail call fast spir_func noundef nofpclass(nan inf) float @_Z16__spirv_ocl_fmaxff(float noundef nofpclass(nan inf) %1, float noundef nofpclass(nan inf) %2) + %maxCommonRes = tail call spir_func noundef float @_Z23__spirv_ocl_fmax_commonff(float noundef nofpclass(nan inf) %1, float noundef nofpclass(nan inf) %2) + ret void +} + +define weak_odr dso_local spir_kernel void @fooV(<2 x float> %v1, <2 x float> %v2) { + %addResV = fadd <2 x float> %v1, %v2 + %subResV = fsub nnan <2 x float> %v1, %v2 + %mulResV = fmul ninf <2 x float> %v1, %v2 + %divResV = fdiv nsz <2 x float> %v1, %v2 + %remResV = frem arcp <2 x float> %v1, %v2 + %negResV = fneg fast <2 x float> %v1 + %oeqResV = fcmp nnan ninf oeq <2 x float> %v1, %v2 + %oneResV = fcmp one <2 x float> %v1, %v2, !spirv.Decorations !3 + %oltResV = fcmp nnan olt <2 x float> %v1, %v2, !spirv.Decorations !3 + %ogtResV = fcmp ninf ogt <2 x float> %v1, %v2, !spirv.Decorations !3 + %oleResV = fcmp nsz ole <2 x float> %v1, %v2, !spirv.Decorations !3 + %ogeResV = fcmp arcp oge <2 x float> %v1, %v2, !spirv.Decorations !3 + %ordResV = fcmp fast ord <2 x float> %v1, %v2, !spirv.Decorations !3 + %ueqResV = fcmp nnan ninf ueq <2 x float> %v1, %v2, !spirv.Decorations !3 + %uneResV = fcmp une <2 x float> %v1, %v2, !spirv.Decorations !3 + %ultResV = fcmp ult <2 x float> %v1, %v2, !spirv.Decorations !3 + %ugtResV = fcmp ugt <2 x float> %v1, %v2, !spirv.Decorations !3 + %uleResV = fcmp ule <2 x float> %v1, %v2, !spirv.Decorations !3 + %ugeResV = fcmp uge <2 x float> %v1, %v2, !spirv.Decorations !3 + %unoResV = fcmp uno <2 x float> %v1, %v2, !spirv.Decorations !3 + %modResV = call spir_func <2 x float> @_Z4fmodDv2_fDv2_f(<2 x float> %v1, <2 x float> %v2) + %maxResV = tail call fast spir_func noundef nofpclass(nan inf) <2 x float> @_Z16__spirv_ocl_fmaxDv2_fDv2_f(<2 x float> noundef nofpclass(nan inf) %v1, <2 x float> noundef nofpclass(nan inf) 
%v2) + %maxCommonResV = tail call spir_func noundef <2 x float> @_Z23__spirv_ocl_fmax_commonDv2_fDv2_f(<2 x float> noundef nofpclass(nan inf) %v1, <2 x float> noundef nofpclass(nan inf) %v2) + ret void +} + +!3 = !{!5, !4} +!4 = !{i32 42} ; 42 is NoContraction decoration +!5 = !{i32 40, i32 393216} ; 40 is FPFastMathMode diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_float_controls2/exec_mode.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_float_controls2/exec_mode.ll new file mode 100644 index 0000000..4b3c13c2 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_float_controls2/exec_mode.ll @@ -0,0 +1,81 @@ +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_KHR_float_controls2,+SPV_KHR_bfloat16 %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_KHR_float_controls2,+SPV_KHR_bfloat16 %s -o - -filetype=obj | spirv-val %} + +; CHECK-DAG: Capability FloatControls2 +; CHECK: Extension "SPV_KHR_float_controls2" + +define dso_local dllexport spir_kernel void @k_float_controls_half(half %h) { +entry: + ret void +} + +define dso_local dllexport spir_kernel void @k_float_controls_bfloat(bfloat %b) { +entry: + ret void +} + +define dso_local dllexport spir_kernel void @k_float_controls_float(float %f) { +entry: + ret void +} + +define dso_local dllexport spir_kernel void @k_float_controls_double(double %d) { +entry: + ret void +} + +define dso_local dllexport spir_kernel void @k_float_controls_all(half %h, bfloat %b, float %f, double %d) { +entry: + ret void +} + +!spirv.ExecutionMode = !{!17, !18, !19, !20, !22, !23, !24, !25} + +; CHECK: OpEntryPoint Kernel %[[#KERNEL_HALF:]] "k_float_controls_half" +!0 = !{ptr @k_float_controls_half, !"k_float_controls_half", !6, i32 0, !6, !7, !8, i32 0, i32 0} + +; CHECK: OpEntryPoint Kernel %[[#KERNEL_BFLOAT:]] "k_float_controls_bfloat" +!1 = !{ptr @k_float_controls_bfloat, !"k_float_controls_bfloat", !6, i32 0, !6, !7, !8, i32 0, i32 0} + +; CHECK: OpEntryPoint Kernel %[[#KERNEL_FLOAT:]] "k_float_controls_float" +!2 = !{ptr @k_float_controls_float, !"k_float_controls_float", !6, i32 0, !6, !7, !8, i32 0, i32 0} + +; CHECK: OpEntryPoint Kernel %[[#KERNEL_DOUBLE:]] "k_float_controls_double" +!3 = !{ptr @k_float_controls_double, !"k_float_controls_double", !6, i32 0, !6, !7, !8, i32 0, i32 0} + +; CHECK: OpEntryPoint Kernel %[[#KERNEL_ALL:]] "k_float_controls_all" +!5 = !{ptr @k_float_controls_all, !"k_float_controls_all", !6, i32 0, !6, !7, !8, i32 0, i32 0} +!6 = !{i32 2, i32 2} +!7 = !{i32 32, i32 36} +!8 = !{i32 0, i32 0} + +; CHECK-DAG: OpExecutionModeId %[[#KERNEL_HALF]] FPFastMathDefault %[[#HALF_TYPE:]] %[[#CONST1:]] +!17 = !{ptr @k_float_controls_half, i32 6028, half poison, i32 1} + +; CHECK-DAG: OpExecutionModeId %[[#KERNEL_BFLOAT]] FPFastMathDefault %[[#BFLOAT_TYPE:]] %[[#CONST2:]] +!18 = !{ptr @k_float_controls_bfloat, i32 6028, bfloat poison, i32 2} + +; CHECK-DAG: OpExecutionModeId %[[#KERNEL_FLOAT]] FPFastMathDefault %[[#FLOAT_TYPE:]] %[[#CONST4:]] +!19 = !{ptr @k_float_controls_float, i32 6028, float poison, i32 4} + +; CHECK-DAG: OpExecutionModeId %[[#KERNEL_DOUBLE]] FPFastMathDefault %[[#DOUBLE_TYPE:]] %[[#CONST7:]] +!20 = !{ptr @k_float_controls_double, i32 6028, double poison, i32 7} + +; CHECK-DAG: OpExecutionModeId %[[#KERNEL_ALL]] FPFastMathDefault %[[#HALF_TYPE]] %[[#CONST131072:]] +; CHECK-DAG: OpExecutionModeId %[[#KERNEL_ALL]] FPFastMathDefault %[[#FLOAT_TYPE]] %[[#CONST458752:]] +; CHECK-DAG: OpExecutionModeId 
%[[#KERNEL_ALL]] FPFastMathDefault %[[#DOUBLE_TYPE]] %[[#CONST458752:]] +!22 = !{ptr @k_float_controls_all, i32 6028, half poison, i32 131072} +!23 = !{ptr @k_float_controls_all, i32 6028, bfloat poison, i32 131072} +!24 = !{ptr @k_float_controls_all, i32 6028, float poison, i32 458752} +!25 = !{ptr @k_float_controls_all, i32 6028, double poison, i32 458752} + +; CHECK-DAG: %[[#INT32_TYPE:]] = OpTypeInt 32 0 +; CHECK-DAG: %[[#HALF_TYPE]] = OpTypeFloat 16 +; CHECK-DAG: %[[#FLOAT_TYPE]] = OpTypeFloat 32 +; CHECK-DAG: %[[#DOUBLE_TYPE]] = OpTypeFloat 64 +; CHECK-DAG: %[[#CONST1]] = OpConstant %[[#INT32_TYPE]] 1 +; CHECK-DAG: %[[#CONST2]] = OpConstant %[[#INT32_TYPE]] 2 +; CHECK-DAG: %[[#CONST4]] = OpConstant %[[#INT32_TYPE]] 4 +; CHECK-DAG: %[[#CONST7]] = OpConstant %[[#INT32_TYPE]] 7 +; CHECK-DAG: %[[#CONST131072]] = OpConstant %[[#INT32_TYPE]] 131072 +; CHECK-DAG: %[[#CONST458752]] = OpConstant %[[#INT32_TYPE]] 458752 diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_float_controls2/exec_mode2.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_float_controls2/exec_mode2.ll new file mode 100644 index 0000000..c063272 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_float_controls2/exec_mode2.ll @@ -0,0 +1,73 @@ +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_KHR_float_controls2 %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_KHR_float_controls2 %s -o - -filetype=obj | spirv-val %} + +; CHECK-DAG: Capability FloatControls2 +; CHECK: Extension "SPV_KHR_float_controls2" + +; CHECK: OpEntryPoint Kernel %[[#KERNEL_FLOAT:]] "k_float_controls_float" +; CHECK: OpEntryPoint Kernel %[[#KERNEL_ALL:]] "k_float_controls_all" +; CHECK: OpEntryPoint Kernel %[[#KERNEL_FLOAT_V:]] "k_float_controls_float_v" +; CHECK: OpEntryPoint Kernel %[[#KERNEL_ALL_V:]] "k_float_controls_all_v" + +define dso_local dllexport spir_kernel void @k_float_controls_float(float %f) { +entry: + ret void +} + +define dso_local dllexport spir_kernel void @k_float_controls_all(half %h, float %f, double %d) { +entry: + ret void +} + +define dso_local dllexport spir_kernel void @k_float_controls_float_v(<2 x float> %f) { +entry: + ret void +} + +define dso_local dllexport spir_kernel void @k_float_controls_all_v(<2 x half> %h, <2 x float> %f, <2 x double> %d) { +entry: + ret void +} + +!spirv.ExecutionMode = !{!19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34} + +; CHECK-DAG: OpExecutionModeId %[[#KERNEL_FLOAT]] FPFastMathDefault %[[#FLOAT_TYPE:]] %[[#CONST131079:]] +!19 = !{ptr @k_float_controls_float, i32 6028, float poison, i32 131079} +; We expect 131079 for float type. +; CHECK-DAG: OpExecutionModeId %[[#KERNEL_ALL]] FPFastMathDefault %[[#FLOAT_TYPE:]] %[[#CONST131079]] +; We expect 0 for the rest of types because it's SignedZeroInfNanPreserve. +; CHECK-DAG: OpExecutionModeId %[[#KERNEL_ALL]] FPFastMathDefault %[[#HALF_TYPE:]] %[[#CONST0:]] +; CHECK-DAG: OpExecutionModeId %[[#KERNEL_ALL]] FPFastMathDefault %[[#DOUBLE_TYPE:]] %[[#CONST0]] +!20 = !{ptr @k_float_controls_all, i32 6028, float poison, i32 131079} +; ContractionOff is now replaced with FPFastMathDefault with AllowContract bit set to false. +!21 = !{ptr @k_float_controls_float, i32 31} +!22 = !{ptr @k_float_controls_all, i32 31} +; SignedZeroInfNanPreserve is now replaced with FPFastMathDefault with flags 0. 
+!23 = !{ptr @k_float_controls_float, i32 4461, i32 32} +!24 = !{ptr @k_float_controls_all, i32 4461, i32 16} +!25 = !{ptr @k_float_controls_all, i32 4461, i32 32} +!26 = !{ptr @k_float_controls_all, i32 4461, i32 64} + +; CHECK-DAG: OpExecutionModeId %[[#KERNEL_FLOAT_V]] FPFastMathDefault %[[#FLOAT_TYPE:]] %[[#CONST131079]] +!27 = !{ptr @k_float_controls_float_v, i32 6028, float poison, i32 131079} +; We expect 131079 for float type. +; CHECK-DAG: OpExecutionModeId %[[#KERNEL_ALL_V]] FPFastMathDefault %[[#FLOAT_TYPE:]] %[[#CONST131079]] +; We expect 0 for the rest of types because it's SignedZeroInfNanPreserve. +; CHECK-DAG: OpExecutionModeId %[[#KERNEL_ALL_V]] FPFastMathDefault %[[#HALF_TYPE:]] %[[#CONST0]] +; CHECK-DAG: OpExecutionModeId %[[#KERNEL_ALL_V]] FPFastMathDefault %[[#DOUBLE_TYPE:]] %[[#CONST0]] +!28 = !{ptr @k_float_controls_all_v, i32 6028, float poison, i32 131079} +; ContractionOff is now replaced with FPFastMathDefault with AllowContract bit set to false. +!29 = !{ptr @k_float_controls_float_v, i32 31} +!30 = !{ptr @k_float_controls_all_v, i32 31} +; SignedZeroInfNanPreserve is now replaced with FPFastMathDefault with flags 0. +!31 = !{ptr @k_float_controls_float_v, i32 4461, i32 32} +!32 = !{ptr @k_float_controls_all_v, i32 4461, i32 16} +!33 = !{ptr @k_float_controls_all_v, i32 4461, i32 32} +!34 = !{ptr @k_float_controls_all_v, i32 4461, i32 64} + +; CHECK-DAG: %[[#INT32_TYPE:]] = OpTypeInt 32 0 +; CHECK-DAG: %[[#HALF_TYPE]] = OpTypeFloat 16 +; CHECK-DAG: %[[#FLOAT_TYPE]] = OpTypeFloat 32 +; CHECK-DAG: %[[#DOUBLE_TYPE]] = OpTypeFloat 64 +; CHECK-DAG: %[[#CONST0]] = OpConstantNull %[[#INT32_TYPE]] +; CHECK-DAG: %[[#CONST131079]] = OpConstant %[[#INT32_TYPE]] 131079 diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_float_controls2/exec_mode3.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_float_controls2/exec_mode3.ll new file mode 100644 index 0000000..1d09187 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_float_controls2/exec_mode3.ll @@ -0,0 +1,103 @@ +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_KHR_float_controls2 %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_KHR_float_controls2 %s -o - -filetype=obj | spirv-val %} + +; CHECK-DAG: Capability FloatControls2 +; CHECK: Extension "SPV_KHR_float_controls2" +; CHECK: OpEntryPoint Kernel %[[#KERNEL_FLOAT:]] "k_float_controls_float" +; CHECK: OpEntryPoint Kernel %[[#KERNEL_ALL:]] "k_float_controls_all" +; CHECK: OpEntryPoint Kernel %[[#KERNEL_FLOAT_V:]] "k_float_controls_float_v" +; CHECK: OpEntryPoint Kernel %[[#KERNEL_ALL_V:]] "k_float_controls_all_v" + +; We expect 131079 for float type. +; CHECK-DAG: OpExecutionModeId %[[#KERNEL_FLOAT]] FPFastMathDefault %[[#FLOAT_TYPE:]] %[[#CONST131079:]] +; CHECK-DAG: OpExecutionModeId %[[#KERNEL_ALL]] FPFastMathDefault %[[#FLOAT_TYPE:]] %[[#CONST131079]] +; We expect 0 for the rest of types because it's SignedZeroInfNanPreserve. +; CHECK-DAG: OpExecutionModeId %[[#KERNEL_ALL]] FPFastMathDefault %[[#HALF_TYPE:]] %[[#CONST0:]] +; CHECK-DAG: OpExecutionModeId %[[#KERNEL_ALL]] FPFastMathDefault %[[#DOUBLE_TYPE:]] %[[#CONST0]] + +; We expect 131079 for float type. +; CHECK-DAG: OpExecutionModeId %[[#KERNEL_FLOAT_V]] FPFastMathDefault %[[#FLOAT_TYPE:]] %[[#CONST131079]] +; CHECK-DAG: OpExecutionModeId %[[#KERNEL_ALL_V]] FPFastMathDefault %[[#FLOAT_TYPE:]] %[[#CONST131079]] +; We expect 0 for the rest of types because it's SignedZeroInfNanPreserve. 
+; CHECK-DAG: OpExecutionModeId %[[#KERNEL_ALL_V]] FPFastMathDefault %[[#HALF_TYPE:]] %[[#CONST0]] +; CHECK-DAG: OpExecutionModeId %[[#KERNEL_ALL_V]] FPFastMathDefault %[[#DOUBLE_TYPE:]] %[[#CONST0]] + +; CHECK-DAG: OpDecorate %[[#addRes:]] FPFastMathMode NotNaN|NotInf|NSZ|AllowReassoc +; CHECK-DAG: OpDecorate %[[#addResH:]] FPFastMathMode None +; CHECK-DAG: OpDecorate %[[#addResF:]] FPFastMathMode NotNaN|NotInf|NSZ|AllowReassoc +; CHECK-DAG: OpDecorate %[[#addResD:]] FPFastMathMode None +; CHECK-DAG: OpDecorate %[[#addRes_V:]] FPFastMathMode NotNaN|NotInf|NSZ|AllowReassoc +; CHECK-DAG: OpDecorate %[[#addResH_V:]] FPFastMathMode None +; CHECK-DAG: OpDecorate %[[#addResF_V:]] FPFastMathMode NotNaN|NotInf|NSZ|AllowReassoc +; CHECK-DAG: OpDecorate %[[#addResD_V:]] FPFastMathMode None + +; CHECK-DAG: %[[#INT32_TYPE:]] = OpTypeInt 32 0 +; CHECK-DAG: %[[#HALF_TYPE]] = OpTypeFloat 16 +; CHECK-DAG: %[[#FLOAT_TYPE]] = OpTypeFloat 32 +; CHECK-DAG: %[[#DOUBLE_TYPE]] = OpTypeFloat 64 +; CHECK-DAG: %[[#CONST0]] = OpConstantNull %[[#INT32_TYPE]] +; CHECK-DAG: %[[#CONST131079]] = OpConstant %[[#INT32_TYPE]] 131079 + +; CHECK-DAG: %[[#HALF_V_TYPE:]] = OpTypeVector %[[#HALF_TYPE]] +; CHECK-DAG: %[[#FLOAT_V_TYPE:]] = OpTypeVector %[[#FLOAT_TYPE]] +; CHECK-DAG: %[[#DOUBLE_V_TYPE:]] = OpTypeVector %[[#DOUBLE_TYPE]] + +define dso_local dllexport spir_kernel void @k_float_controls_float(float %f) { +entry: +; CHECK-DAG: %[[#addRes]] = OpFAdd %[[#FLOAT_TYPE]] + %addRes = fadd float %f, %f + ret void +} + +define dso_local dllexport spir_kernel void @k_float_controls_all(half %h, float %f, double %d) { +entry: +; CHECK-DAG: %[[#addResH]] = OpFAdd %[[#HALF_TYPE]] +; CHECK-DAG: %[[#addResF]] = OpFAdd %[[#FLOAT_TYPE]] +; CHECK-DAG: %[[#addResD]] = OpFAdd %[[#DOUBLE_TYPE]] + %addResH = fadd half %h, %h + %addResF = fadd float %f, %f + %addResD = fadd double %d, %d + ret void +} + +define dso_local dllexport spir_kernel void @k_float_controls_float_v(<2 x float> %f) { +entry: +; CHECK-DAG: %[[#addRes_V]] = OpFAdd %[[#FLOAT_V_TYPE]] + %addRes = fadd <2 x float> %f, %f + ret void +} + +define dso_local dllexport spir_kernel void @k_float_controls_all_v(<2 x half> %h, <2 x float> %f, <2 x double> %d) { +entry: +; CHECK-DAG: %[[#addResH_V]] = OpFAdd %[[#HALF_V_TYPE]] +; CHECK-DAG: %[[#addResF_V]] = OpFAdd %[[#FLOAT_V_TYPE]] +; CHECK-DAG: %[[#addResD_V]] = OpFAdd %[[#DOUBLE_V_TYPE]] + %addResH = fadd <2 x half> %h, %h + %addResF = fadd <2 x float> %f, %f + %addResD = fadd <2 x double> %d, %d + ret void +} + +!spirv.ExecutionMode = !{!19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34} + +!19 = !{ptr @k_float_controls_float, i32 6028, float poison, i32 131079} +!20 = !{ptr @k_float_controls_all, i32 6028, float poison, i32 131079} +; ContractionOff is now replaced with FPFastMathDefault with AllowContract bit set to false. +!21 = !{ptr @k_float_controls_float, i32 31} +!22 = !{ptr @k_float_controls_all, i32 31} +; SignedZeroInfNanPreserve is now replaced with FPFastMathDefault with flags 0. +!23 = !{ptr @k_float_controls_float, i32 4461, i32 32} +!24 = !{ptr @k_float_controls_all, i32 4461, i32 16} +!25 = !{ptr @k_float_controls_all, i32 4461, i32 32} +!26 = !{ptr @k_float_controls_all, i32 4461, i32 64} + +!27 = !{ptr @k_float_controls_float_v, i32 6028, float poison, i32 131079} +!28 = !{ptr @k_float_controls_all_v, i32 6028, float poison, i32 131079} +; ContractionOff is now replaced with FPFastMathDefault with AllowContract bit set to false. 
+!29 = !{ptr @k_float_controls_float_v, i32 31} +!30 = !{ptr @k_float_controls_all_v, i32 31} +; SignedZeroInfNanPreserve is now replaced with FPFastMathDefault with flags 0. +!31 = !{ptr @k_float_controls_float_v, i32 4461, i32 32} +!32 = !{ptr @k_float_controls_all_v, i32 4461, i32 16} +!33 = !{ptr @k_float_controls_all_v, i32 4461, i32 32} +!34 = !{ptr @k_float_controls_all_v, i32 4461, i32 64} diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_float_controls2/replacements.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_float_controls2/replacements.ll new file mode 100644 index 0000000..bba1c93 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_float_controls2/replacements.ll @@ -0,0 +1,61 @@ +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_KHR_float_controls2 %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_KHR_float_controls2 %s -o - -filetype=obj | spirv-val %} + +;; This test checks that the OpenCL.std instructions fmin_common, fmax_common are replaced with fmin, fmax with NInf and NNaN instead. + +; CHECK-DAG: Capability FloatControls2 +; CHECK: Extension "SPV_KHR_float_controls2" + +; CHECK: OpName %[[#maxRes:]] "maxRes" +; CHECK: OpName %[[#maxCommonRes:]] "maxCommonRes" +; CHECK: OpName %[[#minRes:]] "minRes" +; CHECK: OpName %[[#minCommonRes:]] "minCommonRes" +; CHECK: OpName %[[#maxResV:]] "maxResV" +; CHECK: OpName %[[#maxCommonResV:]] "maxCommonResV" +; CHECK: OpName %[[#minResV:]] "minResV" +; CHECK: OpName %[[#minCommonResV:]] "minCommonResV" +; CHECK: OpDecorate %[[#maxRes]] FPFastMathMode NotNaN|NotInf|NSZ|AllowRecip|AllowContract|AllowReassoc|AllowTransform +; CHECK: OpDecorate %[[#maxCommonRes]] FPFastMathMode NotNaN|NotInf +; CHECK: OpDecorate %[[#minRes]] FPFastMathMode NotNaN|NotInf|NSZ|AllowRecip|AllowContract|AllowReassoc|AllowTransform +; CHECK: OpDecorate %[[#minCommonRes]] FPFastMathMode NotNaN|NotInf +; CHECK: OpDecorate %[[#maxResV]] FPFastMathMode NotNaN|NotInf|NSZ|AllowRecip|AllowContract|AllowReassoc|AllowTransform +; CHECK: OpDecorate %[[#maxCommonResV]] FPFastMathMode NotNaN|NotInf +; CHECK: OpDecorate %[[#minResV]] FPFastMathMode NotNaN|NotInf|NSZ|AllowRecip|AllowContract|AllowReassoc|AllowTransform +; CHECK: OpDecorate %[[#minCommonResV]] FPFastMathMode NotNaN|NotInf +; CHECK: %[[#maxRes]] = OpExtInst {{.*}} fmax +; CHECK: %[[#maxCommonRes]] = OpExtInst {{.*}} fmax +; CHECK: %[[#minRes]] = OpExtInst {{.*}} fmin +; CHECK: %[[#minCommonRes]] = OpExtInst {{.*}} fmin +; CHECK: %[[#maxResV]] = OpExtInst {{.*}} fmax +; CHECK: %[[#maxCommonResV]] = OpExtInst {{.*}} fmax +; CHECK: %[[#minResV]] = OpExtInst {{.*}} fmin +; CHECK: %[[#minCommonResV]] = OpExtInst {{.*}} fmin + +; Function Attrs: convergent mustprogress nofree nounwind willreturn memory(none) +declare spir_func float @_Z4fmodff(float, float) +declare dso_local spir_func noundef nofpclass(nan inf) float @_Z16__spirv_ocl_fmaxff(float noundef nofpclass(nan inf), float noundef nofpclass(nan inf)) local_unnamed_addr #1 +declare dso_local spir_func noundef nofpclass(nan inf) float @_Z23__spirv_ocl_fmax_commonff(float noundef nofpclass(nan inf), float noundef nofpclass(nan inf)) local_unnamed_addr #1 +declare dso_local spir_func noundef nofpclass(nan inf) float @_Z16__spirv_ocl_fminff(float noundef nofpclass(nan inf), float noundef nofpclass(nan inf)) local_unnamed_addr #1 +declare dso_local spir_func noundef nofpclass(nan inf) float @_Z23__spirv_ocl_fmin_commonff(float noundef nofpclass(nan 
inf), float noundef nofpclass(nan inf)) local_unnamed_addr #1 +declare dso_local spir_func noundef nofpclass(nan inf) <2 x float> @_Z16__spirv_ocl_fmaxDv2_fDv2_f(<2 x float> noundef nofpclass(nan inf), <2 x float> noundef nofpclass(nan inf)) local_unnamed_addr #1 +declare dso_local spir_func noundef nofpclass(nan inf) <2 x float> @_Z23__spirv_ocl_fmax_commonDv2_fDv2_f(<2 x float> noundef nofpclass(nan inf), <2 x float> noundef nofpclass(nan inf)) local_unnamed_addr #1 +declare dso_local spir_func noundef nofpclass(nan inf) <2 x float> @_Z16__spirv_ocl_fminDv2_fDv2_f(<2 x float> noundef nofpclass(nan inf), <2 x float> noundef nofpclass(nan inf)) local_unnamed_addr #1 +declare dso_local spir_func noundef nofpclass(nan inf) <2 x float> @_Z23__spirv_ocl_fmin_commonDv2_fDv2_f(<2 x float> noundef nofpclass(nan inf), <2 x float> noundef nofpclass(nan inf)) local_unnamed_addr #1 + +; Function Attrs: convergent mustprogress norecurse nounwind +define weak_odr dso_local spir_kernel void @foo(float %1, float %2) { +entry: + %maxRes = tail call fast spir_func noundef nofpclass(nan inf) float @_Z16__spirv_ocl_fmaxff(float noundef nofpclass(nan inf) %1, float noundef nofpclass(nan inf) %2) + %maxCommonRes = tail call spir_func noundef float @_Z23__spirv_ocl_fmax_commonff(float noundef nofpclass(nan inf) %1, float noundef nofpclass(nan inf) %2) + %minRes = tail call fast spir_func noundef nofpclass(nan inf) float @_Z16__spirv_ocl_fminff(float noundef nofpclass(nan inf) %1, float noundef nofpclass(nan inf) %2) + %minCommonRes = tail call spir_func noundef float @_Z23__spirv_ocl_fmin_commonff(float noundef nofpclass(nan inf) %1, float noundef nofpclass(nan inf) %2) + ret void +} + +define weak_odr dso_local spir_kernel void @fooV(<2 x float> %v1, <2 x float> %v2) { + %maxResV = tail call fast spir_func noundef nofpclass(nan inf) <2 x float> @_Z16__spirv_ocl_fmaxDv2_fDv2_f(<2 x float> noundef nofpclass(nan inf) %v1, <2 x float> noundef nofpclass(nan inf) %v2) + %maxCommonResV = tail call spir_func noundef <2 x float> @_Z23__spirv_ocl_fmax_commonDv2_fDv2_f(<2 x float> noundef nofpclass(nan inf) %v1, <2 x float> noundef nofpclass(nan inf) %v2) + %minResV = tail call fast spir_func noundef nofpclass(nan inf) <2 x float> @_Z16__spirv_ocl_fminDv2_fDv2_f(<2 x float> noundef nofpclass(nan inf) %v1, <2 x float> noundef nofpclass(nan inf) %v2) + %minCommonResV = tail call spir_func noundef <2 x float> @_Z23__spirv_ocl_fmin_commonDv2_fDv2_f(<2 x float> noundef nofpclass(nan inf) %v1, <2 x float> noundef nofpclass(nan inf) %v2) + ret void +} diff --git a/llvm/test/CodeGen/SPIRV/instructions/integer-casts.ll b/llvm/test/CodeGen/SPIRV/instructions/integer-casts.ll index 6a4b4f5..5fe2cc8 100644 --- a/llvm/test/CodeGen/SPIRV/instructions/integer-casts.ll +++ b/llvm/test/CodeGen/SPIRV/instructions/integer-casts.ll @@ -14,11 +14,11 @@ ; CHECK-DAG: OpName [[ZEXT8_16:%.*]] "u8tou16" ; CHECK-DAG: OpName [[ZEXT16_32:%.*]] "u16tou32" +; CHECK-DAG: OpName %[[#R16:]] "r16" ; CHECK-DAG: OpName %[[#R17:]] "r17" ; CHECK-DAG: OpName %[[#R18:]] "r18" ; CHECK-DAG: OpName %[[#R19:]] "r19" ; CHECK-DAG: OpName %[[#R20:]] "r20" -; CHECK-DAG: OpName %[[#R21:]] "r21" ; CHECK-DAG: OpName [[TRUNC32_16v4:%.*]] "i32toi16v4" ; CHECK-DAG: OpName [[TRUNC32_8v4:%.*]] "i32toi8v4" @@ -30,11 +30,11 @@ ; CHECK-DAG: OpName [[ZEXT8_16v4:%.*]] "u8tou16v4" ; CHECK-DAG: OpName [[ZEXT16_32v4:%.*]] "u16tou32v4" -; CHECK-DAG: OpDecorate %[[#R17]] FPRoundingMode RTZ -; CHECK-DAG: OpDecorate %[[#R18]] FPRoundingMode RTE -; CHECK-DAG: OpDecorate %[[#R19]] 
FPRoundingMode RTP -; CHECK-DAG: OpDecorate %[[#R20]] FPRoundingMode RTN -; CHECK-DAG: OpDecorate %[[#R21]] SaturatedConversion +; CHECK-DAG: OpDecorate %[[#R16]] FPRoundingMode RTZ +; CHECK-DAG: OpDecorate %[[#R17]] FPRoundingMode RTE +; CHECK-DAG: OpDecorate %[[#R18]] FPRoundingMode RTP +; CHECK-DAG: OpDecorate %[[#R19]] FPRoundingMode RTN +; CHECK-DAG: OpDecorate %[[#R20]] SaturatedConversion ; CHECK-DAG: [[F32:%.*]] = OpTypeFloat 32 ; CHECK-DAG: [[F16:%.*]] = OpTypeFloat 16 @@ -258,7 +258,6 @@ define <4 x i32> @u16tou32v4(<4 x i16> %a) { ; CHECK: %[[#]] = OpUConvert [[U32]] %[[#]] ; CHECK: %[[#]] = OpSConvert [[U32]] %[[#]] ; CHECK: %[[#]] = OpFConvert [[F16]] %[[#]] -; CHECK: %[[#]] = OpQuantizeToF16 [[F32]] %[[#]] ; CHECK: %[[#]] = OpSatConvertSToU [[U64]] %[[#]] ; CHECK: %[[#]] = OpSatConvertUToS [[U64]] %[[#]] ; CHECK: %[[#]] = OpConvertPtrToU [[U64]] [[Arg1]] @@ -267,11 +266,11 @@ define <4 x i32> @u16tou32v4(<4 x i16> %a) { ; CHECK: %[[#]] = OpSConvert [[U32v4]] %[[#]] ; CHECK: %[[#]] = OpConvertUToF [[F32]] %[[#]] ; CHECK: %[[#]] = OpConvertUToF [[F32]] %[[#]] +; CHECK: %[[#R16]] = OpFConvert [[F32v2]] %[[#]] ; CHECK: %[[#R17]] = OpFConvert [[F32v2]] %[[#]] ; CHECK: %[[#R18]] = OpFConvert [[F32v2]] %[[#]] ; CHECK: %[[#R19]] = OpFConvert [[F32v2]] %[[#]] -; CHECK: %[[#R20]] = OpFConvert [[F32v2]] %[[#]] -; CHECK: %[[#R21]] = OpConvertFToU [[U8]] %[[#]] +; CHECK: %[[#R20]] = OpConvertFToU [[U8]] %[[#]] ; CHECK: OpFunctionEnd define dso_local spir_kernel void @test_wrappers(ptr addrspace(4) %arg, i64 %arg_ptr, <4 x i8> %arg_v2) { %r1 = call spir_func i32 @__spirv_ConvertFToU(float 0.000000e+00) @@ -281,20 +280,19 @@ define dso_local spir_kernel void @test_wrappers(ptr addrspace(4) %arg, i64 %arg %r5 = call spir_func i32 @__spirv_UConvert(i64 1) %r6 = call spir_func i32 @__spirv_SConvert(i64 1) %r7 = call spir_func half @__spirv_FConvert(float 0.000000e+00) - %r8 = call spir_func float @__spirv_QuantizeToF16(float 0.000000e+00) - %r9 = call spir_func i64 @__spirv_SatConvertSToU(i64 1) - %r10 = call spir_func i64 @__spirv_SatConvertUToS(i64 1) - %r11 = call spir_func i64 @__spirv_ConvertPtrToU(ptr addrspace(4) %arg) - %r12 = call spir_func ptr addrspace(4) @__spirv_ConvertUToPtr(i64 %arg_ptr) - %r13 = call spir_func <4 x i32> @_Z22__spirv_UConvert_Rint2Dv2_a(<4 x i8> %arg_v2) - %r14 = call spir_func <4 x i32> @_Z22__spirv_SConvert_Rint2Dv2_a(<4 x i8> %arg_v2) - %r15 = call spir_func float @_Z30__spirv_ConvertUToF_Rfloat_rtz(i64 %arg_ptr) - %r16 = call spir_func float @__spirv_ConvertUToF_Rfloat_rtz(i64 %arg_ptr) - %r17 = call spir_func <2 x float> @_Z28__spirv_FConvert_Rfloat2_rtzDv2_DF16_(<2 x half> noundef <half 0xH409A, half 0xH439A>) - %r18 = call spir_func <2 x float> @_Z28__spirv_FConvert_Rfloat2_rteDv2_DF16_(<2 x half> noundef <half 0xH409A, half 0xH439A>) - %r19 = call spir_func <2 x float> @_Z28__spirv_FConvert_Rfloat2_rtpDv2_DF16_(<2 x half> noundef <half 0xH409A, half 0xH439A>) - %r20 = call spir_func <2 x float> @_Z28__spirv_FConvert_Rfloat2_rtnDv2_DF16_(<2 x half> noundef <half 0xH409A, half 0xH439A>) - %r21 = call spir_func i8 @_Z30__spirv_ConvertFToU_Ruchar_satf(float noundef 42.0) + %r8 = call spir_func i64 @__spirv_SatConvertSToU(i64 1) + %r9 = call spir_func i64 @__spirv_SatConvertUToS(i64 1) + %r10 = call spir_func i64 @__spirv_ConvertPtrToU(ptr addrspace(4) %arg) + %r11 = call spir_func ptr addrspace(4) @__spirv_ConvertUToPtr(i64 %arg_ptr) + %r12 = call spir_func <4 x i32> @_Z22__spirv_UConvert_Rint2Dv2_a(<4 x i8> %arg_v2) + %r13 = call spir_func <4 x i32> 
@_Z22__spirv_SConvert_Rint2Dv2_a(<4 x i8> %arg_v2) + %r14 = call spir_func float @_Z30__spirv_ConvertUToF_Rfloat_rtz(i64 %arg_ptr) + %r15 = call spir_func float @__spirv_ConvertUToF_Rfloat_rtz(i64 %arg_ptr) + %r16 = call spir_func <2 x float> @_Z28__spirv_FConvert_Rfloat2_rtzDv2_DF16_(<2 x half> noundef <half 0xH409A, half 0xH439A>) + %r17 = call spir_func <2 x float> @_Z28__spirv_FConvert_Rfloat2_rteDv2_DF16_(<2 x half> noundef <half 0xH409A, half 0xH439A>) + %r18 = call spir_func <2 x float> @_Z28__spirv_FConvert_Rfloat2_rtpDv2_DF16_(<2 x half> noundef <half 0xH409A, half 0xH439A>) + %r19 = call spir_func <2 x float> @_Z28__spirv_FConvert_Rfloat2_rtnDv2_DF16_(<2 x half> noundef <half 0xH409A, half 0xH439A>) + %r20 = call spir_func i8 @_Z30__spirv_ConvertFToU_Ruchar_satf(float noundef 42.0) ret void } @@ -305,7 +303,6 @@ declare dso_local spir_func float @__spirv_ConvertUToF(i32) declare dso_local spir_func i32 @__spirv_UConvert(i64) declare dso_local spir_func i32 @__spirv_SConvert(i64) declare dso_local spir_func half @__spirv_FConvert(float) -declare dso_local spir_func float @__spirv_QuantizeToF16(float) declare dso_local spir_func i64 @__spirv_SatConvertSToU(i64) declare dso_local spir_func i64 @__spirv_SatConvertUToS(i64) declare dso_local spir_func i64 @__spirv_ConvertPtrToU(ptr addrspace(4)) diff --git a/llvm/test/CodeGen/SPIRV/instructions/quantizeto16.ll b/llvm/test/CodeGen/SPIRV/instructions/quantizeto16.ll new file mode 100644 index 0000000..0b12ba4 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/instructions/quantizeto16.ll @@ -0,0 +1,15 @@ +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv-unknown-vulkan-compute %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-vulkan-compute %s -o - -filetype=obj | spirv-val %} + +; TODO: Implement support for the SPIR-V QuantizeToF16 operation +; XFAIL: * + +; CHECK-DAG: [[F32:%.*]] = OpTypeFloat 32 +; CHECK: %[[#]] = OpQuantizeToF16 [[F32]] %[[#]] +define spir_func void @test_wrappers() { + entry: + %r8 = call spir_func float @__spirv_QuantizeToF16(float 0.000000e+00) + ret void +} + +declare dso_local spir_func float @__spirv_QuantizeToF16(float) diff --git a/llvm/test/CodeGen/VE/Scalar/max.ll b/llvm/test/CodeGen/VE/Scalar/max.ll index 51da557..7950842 100644 --- a/llvm/test/CodeGen/VE/Scalar/max.ll +++ b/llvm/test/CodeGen/VE/Scalar/max.ll @@ -1,7 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=ve-unknown-unknown | FileCheck %s -; RUN: llc < %s -mtriple=ve-unknown-unknown -enable-no-signed-zeros-fp-math \ -; RUN: -enable-no-nans-fp-math | FileCheck %s -check-prefix=OPT define double @maxf64(double, double) { ; CHECK-LABEL: maxf64: @@ -10,16 +8,21 @@ define double @maxf64(double, double) { ; CHECK-NEXT: cmov.d.gt %s1, %s0, %s2 ; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: b.l.t (, %s10) -; -; OPT-LABEL: maxf64: -; OPT: # %bb.0: -; OPT-NEXT: fmax.d %s0, %s0, %s1 -; OPT-NEXT: b.l.t (, %s10) %3 = fcmp ogt double %0, %1 %4 = select i1 %3, double %0, double %1 ret double %4 } +define double @maxf64_fast(double, double) { +; CHECK-LABEL: maxf64_fast: +; CHECK: # %bb.0: +; CHECK-NEXT: fmax.d %s0, %s0, %s1 +; CHECK-NEXT: b.l.t (, %s10) + %3 = fcmp ogt double %0, %1 + %4 = select nnan nsz i1 %3, double %0, double %1 + ret double %4 +} + define double @max2f64(double, double) { ; CHECK-LABEL: max2f64: ; CHECK: # %bb.0: @@ -27,16 +30,21 @@ define double @max2f64(double, double) { ; CHECK-NEXT: cmov.d.ge %s1, %s0, %s2 ; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: 
b.l.t (, %s10) -; -; OPT-LABEL: max2f64: -; OPT: # %bb.0: -; OPT-NEXT: fmax.d %s0, %s0, %s1 -; OPT-NEXT: b.l.t (, %s10) %3 = fcmp oge double %0, %1 %4 = select i1 %3, double %0, double %1 ret double %4 } +define double @max2f64_fast(double, double) { +; CHECK-LABEL: max2f64_fast: +; CHECK: # %bb.0: +; CHECK-NEXT: fmax.d %s0, %s0, %s1 +; CHECK-NEXT: b.l.t (, %s10) + %3 = fcmp oge double %0, %1 + %4 = select nnan nsz i1 %3, double %0, double %1 + ret double %4 +} + ; VE has no max for unordered comparison define double @maxuf64(double, double) { ; CHECK-LABEL: maxuf64: @@ -45,16 +53,21 @@ define double @maxuf64(double, double) { ; CHECK-NEXT: cmov.d.gtnan %s1, %s0, %s2 ; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: b.l.t (, %s10) -; -; OPT-LABEL: maxuf64: -; OPT: # %bb.0: -; OPT-NEXT: fmax.d %s0, %s0, %s1 -; OPT-NEXT: b.l.t (, %s10) %3 = fcmp ugt double %0, %1 %4 = select i1 %3, double %0, double %1 ret double %4 } +define double @maxuf64_fast(double, double) { +; CHECK-LABEL: maxuf64_fast: +; CHECK: # %bb.0: +; CHECK-NEXT: fmax.d %s0, %s0, %s1 +; CHECK-NEXT: b.l.t (, %s10) + %3 = fcmp ugt double %0, %1 + %4 = select nnan nsz i1 %3, double %0, double %1 + ret double %4 +} + ; VE has no max for unordered comparison define double @max2uf64(double, double) { ; CHECK-LABEL: max2uf64: @@ -63,16 +76,21 @@ define double @max2uf64(double, double) { ; CHECK-NEXT: cmov.d.genan %s1, %s0, %s2 ; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: b.l.t (, %s10) -; -; OPT-LABEL: max2uf64: -; OPT: # %bb.0: -; OPT-NEXT: fmax.d %s0, %s0, %s1 -; OPT-NEXT: b.l.t (, %s10) %3 = fcmp uge double %0, %1 %4 = select i1 %3, double %0, double %1 ret double %4 } +define double @max2uf64_fast(double, double) { +; CHECK-LABEL: max2uf64_fast: +; CHECK: # %bb.0: +; CHECK-NEXT: fmax.d %s0, %s0, %s1 +; CHECK-NEXT: b.l.t (, %s10) + %3 = fcmp uge double %0, %1 + %4 = select nnan nsz i1 %3, double %0, double %1 + ret double %4 +} + define float @maxf32(float, float) { ; CHECK-LABEL: maxf32: ; CHECK: # %bb.0: @@ -80,16 +98,21 @@ define float @maxf32(float, float) { ; CHECK-NEXT: cmov.s.gt %s1, %s0, %s2 ; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: b.l.t (, %s10) -; -; OPT-LABEL: maxf32: -; OPT: # %bb.0: -; OPT-NEXT: fmax.s %s0, %s0, %s1 -; OPT-NEXT: b.l.t (, %s10) %3 = fcmp ogt float %0, %1 %4 = select i1 %3, float %0, float %1 ret float %4 } +define float @maxf32_fast(float, float) { +; CHECK-LABEL: maxf32_fast: +; CHECK: # %bb.0: +; CHECK-NEXT: fmax.s %s0, %s0, %s1 +; CHECK-NEXT: b.l.t (, %s10) + %3 = fcmp ogt float %0, %1 + %4 = select nnan nsz i1 %3, float %0, float %1 + ret float %4 +} + define float @max2f32(float, float) { ; CHECK-LABEL: max2f32: ; CHECK: # %bb.0: @@ -97,16 +120,21 @@ define float @max2f32(float, float) { ; CHECK-NEXT: cmov.s.ge %s1, %s0, %s2 ; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: b.l.t (, %s10) -; -; OPT-LABEL: max2f32: -; OPT: # %bb.0: -; OPT-NEXT: fmax.s %s0, %s0, %s1 -; OPT-NEXT: b.l.t (, %s10) %3 = fcmp oge float %0, %1 %4 = select i1 %3, float %0, float %1 ret float %4 } +define float @max2f32_fast(float, float) { +; CHECK-LABEL: max2f32_fast: +; CHECK: # %bb.0: +; CHECK-NEXT: fmax.s %s0, %s0, %s1 +; CHECK-NEXT: b.l.t (, %s10) + %3 = fcmp oge float %0, %1 + %4 = select nnan nsz i1 %3, float %0, float %1 + ret float %4 +} + define float @maxuf32(float, float) { ; CHECK-LABEL: maxuf32: ; CHECK: # %bb.0: @@ -114,16 +142,21 @@ define float @maxuf32(float, float) { ; CHECK-NEXT: cmov.s.gtnan %s1, %s0, %s2 ; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: b.l.t (, %s10) -; -; OPT-LABEL: maxuf32: -; OPT: # %bb.0: -; OPT-NEXT: 
fmax.s %s0, %s0, %s1 -; OPT-NEXT: b.l.t (, %s10) %3 = fcmp ugt float %0, %1 %4 = select i1 %3, float %0, float %1 ret float %4 } +define float @maxuf32_fast(float, float) { +; CHECK-LABEL: maxuf32_fast: +; CHECK: # %bb.0: +; CHECK-NEXT: fmax.s %s0, %s0, %s1 +; CHECK-NEXT: b.l.t (, %s10) + %3 = fcmp ugt float %0, %1 + %4 = select nnan nsz i1 %3, float %0, float %1 + ret float %4 +} + define float @max2uf32(float, float) { ; CHECK-LABEL: max2uf32: ; CHECK: # %bb.0: @@ -131,26 +164,26 @@ define float @max2uf32(float, float) { ; CHECK-NEXT: cmov.s.genan %s1, %s0, %s2 ; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: b.l.t (, %s10) -; -; OPT-LABEL: max2uf32: -; OPT: # %bb.0: -; OPT-NEXT: fmax.s %s0, %s0, %s1 -; OPT-NEXT: b.l.t (, %s10) %3 = fcmp uge float %0, %1 %4 = select i1 %3, float %0, float %1 ret float %4 } +define float @max2uf32_fast(float, float) { +; CHECK-LABEL: max2uf32_fast: +; CHECK: # %bb.0: +; CHECK-NEXT: fmax.s %s0, %s0, %s1 +; CHECK-NEXT: b.l.t (, %s10) + %3 = fcmp uge float %0, %1 + %4 = select nnan nsz i1 %3, float %0, float %1 + ret float %4 +} + define i64 @maxi64(i64, i64) { ; CHECK-LABEL: maxi64: ; CHECK: # %bb.0: ; CHECK-NEXT: maxs.l %s0, %s0, %s1 ; CHECK-NEXT: b.l.t (, %s10) -; -; OPT-LABEL: maxi64: -; OPT: # %bb.0: -; OPT-NEXT: maxs.l %s0, %s0, %s1 -; OPT-NEXT: b.l.t (, %s10) %3 = icmp sgt i64 %0, %1 %4 = select i1 %3, i64 %0, i64 %1 ret i64 %4 @@ -161,11 +194,6 @@ define i64 @max2i64(i64, i64) { ; CHECK: # %bb.0: ; CHECK-NEXT: maxs.l %s0, %s0, %s1 ; CHECK-NEXT: b.l.t (, %s10) -; -; OPT-LABEL: max2i64: -; OPT: # %bb.0: -; OPT-NEXT: maxs.l %s0, %s0, %s1 -; OPT-NEXT: b.l.t (, %s10) %3 = icmp sge i64 %0, %1 %4 = select i1 %3, i64 %0, i64 %1 ret i64 %4 @@ -178,13 +206,6 @@ define i64 @maxu64(i64, i64) { ; CHECK-NEXT: cmov.l.gt %s1, %s0, %s2 ; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: b.l.t (, %s10) -; -; OPT-LABEL: maxu64: -; OPT: # %bb.0: -; OPT-NEXT: cmpu.l %s2, %s0, %s1 -; OPT-NEXT: cmov.l.gt %s1, %s0, %s2 -; OPT-NEXT: or %s0, 0, %s1 -; OPT-NEXT: b.l.t (, %s10) %3 = icmp ugt i64 %0, %1 %4 = select i1 %3, i64 %0, i64 %1 ret i64 %4 @@ -197,13 +218,6 @@ define i64 @max2u64(i64, i64) { ; CHECK-NEXT: cmov.l.ge %s1, %s0, %s2 ; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: b.l.t (, %s10) -; -; OPT-LABEL: max2u64: -; OPT: # %bb.0: -; OPT-NEXT: cmpu.l %s2, %s0, %s1 -; OPT-NEXT: cmov.l.ge %s1, %s0, %s2 -; OPT-NEXT: or %s0, 0, %s1 -; OPT-NEXT: b.l.t (, %s10) %3 = icmp uge i64 %0, %1 %4 = select i1 %3, i64 %0, i64 %1 ret i64 %4 @@ -214,11 +228,6 @@ define i32 @maxi32(i32, i32) { ; CHECK: # %bb.0: ; CHECK-NEXT: maxs.w.sx %s0, %s0, %s1 ; CHECK-NEXT: b.l.t (, %s10) -; -; OPT-LABEL: maxi32: -; OPT: # %bb.0: -; OPT-NEXT: maxs.w.sx %s0, %s0, %s1 -; OPT-NEXT: b.l.t (, %s10) %3 = icmp sgt i32 %0, %1 %4 = select i1 %3, i32 %0, i32 %1 ret i32 %4 @@ -229,11 +238,6 @@ define i32 @max2i32(i32, i32) { ; CHECK: # %bb.0: ; CHECK-NEXT: maxs.w.sx %s0, %s0, %s1 ; CHECK-NEXT: b.l.t (, %s10) -; -; OPT-LABEL: max2i32: -; OPT: # %bb.0: -; OPT-NEXT: maxs.w.sx %s0, %s0, %s1 -; OPT-NEXT: b.l.t (, %s10) %3 = icmp sge i32 %0, %1 %4 = select i1 %3, i32 %0, i32 %1 ret i32 %4 @@ -246,13 +250,6 @@ define i32 @maxu32(i32, i32) { ; CHECK-NEXT: cmov.w.gt %s1, %s0, %s2 ; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: b.l.t (, %s10) -; -; OPT-LABEL: maxu32: -; OPT: # %bb.0: -; OPT-NEXT: cmpu.w %s2, %s0, %s1 -; OPT-NEXT: cmov.w.gt %s1, %s0, %s2 -; OPT-NEXT: or %s0, 0, %s1 -; OPT-NEXT: b.l.t (, %s10) %3 = icmp ugt i32 %0, %1 %4 = select i1 %3, i32 %0, i32 %1 ret i32 %4 @@ -265,13 +262,6 @@ define i32 @max2u32(i32, i32) { ; CHECK-NEXT: 
cmov.w.ge %s1, %s0, %s2 ; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: b.l.t (, %s10) -; -; OPT-LABEL: max2u32: -; OPT: # %bb.0: -; OPT-NEXT: cmpu.w %s2, %s0, %s1 -; OPT-NEXT: cmov.w.ge %s1, %s0, %s2 -; OPT-NEXT: or %s0, 0, %s1 -; OPT-NEXT: b.l.t (, %s10) %3 = icmp uge i32 %0, %1 %4 = select i1 %3, i32 %0, i32 %1 ret i32 %4 @@ -283,12 +273,6 @@ define zeroext i1 @maxi1(i1 zeroext, i1 zeroext) { ; CHECK-NEXT: or %s0, %s0, %s1 ; CHECK-NEXT: and %s0, 1, %s0 ; CHECK-NEXT: b.l.t (, %s10) -; -; OPT-LABEL: maxi1: -; OPT: # %bb.0: -; OPT-NEXT: or %s0, %s0, %s1 -; OPT-NEXT: and %s0, 1, %s0 -; OPT-NEXT: b.l.t (, %s10) %3 = xor i1 %1, true %4 = and i1 %3, %0 %5 = select i1 %4, i1 %0, i1 %1 diff --git a/llvm/test/CodeGen/VE/Scalar/min.ll b/llvm/test/CodeGen/VE/Scalar/min.ll index e8f4939..36a2e06 100644 --- a/llvm/test/CodeGen/VE/Scalar/min.ll +++ b/llvm/test/CodeGen/VE/Scalar/min.ll @@ -1,7 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=ve-unknown-unknown | FileCheck %s -; RUN: llc < %s -mtriple=ve-unknown-unknown -enable-no-signed-zeros-fp-math \ -; RUN: -enable-no-nans-fp-math | FileCheck %s -check-prefix=OPT define double @minf64(double, double) { ; CHECK-LABEL: minf64: @@ -10,16 +8,21 @@ define double @minf64(double, double) { ; CHECK-NEXT: cmov.d.lt %s1, %s0, %s2 ; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: b.l.t (, %s10) -; -; OPT-LABEL: minf64: -; OPT: # %bb.0: -; OPT-NEXT: fmin.d %s0, %s0, %s1 -; OPT-NEXT: b.l.t (, %s10) %3 = fcmp olt double %0, %1 %4 = select i1 %3, double %0, double %1 ret double %4 } +define double @minf64_fast(double, double) { +; CHECK-LABEL: minf64_fast: +; CHECK: # %bb.0: +; CHECK-NEXT: fmin.d %s0, %s0, %s1 +; CHECK-NEXT: b.l.t (, %s10) + %3 = fcmp olt double %0, %1 + %4 = select nnan nsz i1 %3, double %0, double %1 + ret double %4 +} + define double @min2f64(double, double) { ; CHECK-LABEL: min2f64: ; CHECK: # %bb.0: @@ -27,16 +30,21 @@ define double @min2f64(double, double) { ; CHECK-NEXT: cmov.d.le %s1, %s0, %s2 ; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: b.l.t (, %s10) -; -; OPT-LABEL: min2f64: -; OPT: # %bb.0: -; OPT-NEXT: fmin.d %s0, %s0, %s1 -; OPT-NEXT: b.l.t (, %s10) %3 = fcmp ole double %0, %1 %4 = select i1 %3, double %0, double %1 ret double %4 } +define double @min2f64_fast(double, double) { +; CHECK-LABEL: min2f64_fast: +; CHECK: # %bb.0: +; CHECK-NEXT: fmin.d %s0, %s0, %s1 +; CHECK-NEXT: b.l.t (, %s10) + %3 = fcmp ole double %0, %1 + %4 = select nnan nsz i1 %3, double %0, double %1 + ret double %4 +} + define double @minuf64(double, double) { ; CHECK-LABEL: minuf64: ; CHECK: # %bb.0: @@ -44,16 +52,21 @@ define double @minuf64(double, double) { ; CHECK-NEXT: cmov.d.ltnan %s1, %s0, %s2 ; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: b.l.t (, %s10) -; -; OPT-LABEL: minuf64: -; OPT: # %bb.0: -; OPT-NEXT: fmin.d %s0, %s0, %s1 -; OPT-NEXT: b.l.t (, %s10) %3 = fcmp ult double %0, %1 %4 = select i1 %3, double %0, double %1 ret double %4 } +define double @minuf64_fast(double, double) { +; CHECK-LABEL: minuf64_fast: +; CHECK: # %bb.0: +; CHECK-NEXT: fmin.d %s0, %s0, %s1 +; CHECK-NEXT: b.l.t (, %s10) + %3 = fcmp ult double %0, %1 + %4 = select nnan nsz i1 %3, double %0, double %1 + ret double %4 +} + define double @min2uf64(double, double) { ; CHECK-LABEL: min2uf64: ; CHECK: # %bb.0: @@ -61,16 +74,21 @@ define double @min2uf64(double, double) { ; CHECK-NEXT: cmov.d.lenan %s1, %s0, %s2 ; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: b.l.t (, %s10) -; -; OPT-LABEL: min2uf64: -; OPT: # %bb.0: -; OPT-NEXT: fmin.d %s0, %s0, 
%s1 -; OPT-NEXT: b.l.t (, %s10) %3 = fcmp ule double %0, %1 %4 = select i1 %3, double %0, double %1 ret double %4 } +define double @min2uf64_fast(double, double) { +; CHECK-LABEL: min2uf64_fast: +; CHECK: # %bb.0: +; CHECK-NEXT: fmin.d %s0, %s0, %s1 +; CHECK-NEXT: b.l.t (, %s10) + %3 = fcmp ule double %0, %1 + %4 = select nnan nsz i1 %3, double %0, double %1 + ret double %4 +} + define float @minf32(float, float) { ; CHECK-LABEL: minf32: ; CHECK: # %bb.0: @@ -78,16 +96,21 @@ define float @minf32(float, float) { ; CHECK-NEXT: cmov.s.lt %s1, %s0, %s2 ; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: b.l.t (, %s10) -; -; OPT-LABEL: minf32: -; OPT: # %bb.0: -; OPT-NEXT: fmin.s %s0, %s0, %s1 -; OPT-NEXT: b.l.t (, %s10) %3 = fcmp olt float %0, %1 %4 = select i1 %3, float %0, float %1 ret float %4 } +define float @minf32_fast(float, float) { +; CHECK-LABEL: minf32_fast: +; CHECK: # %bb.0: +; CHECK-NEXT: fmin.s %s0, %s0, %s1 +; CHECK-NEXT: b.l.t (, %s10) + %3 = fcmp olt float %0, %1 + %4 = select nnan nsz i1 %3, float %0, float %1 + ret float %4 +} + define float @min2f32(float, float) { ; CHECK-LABEL: min2f32: ; CHECK: # %bb.0: @@ -95,16 +118,21 @@ define float @min2f32(float, float) { ; CHECK-NEXT: cmov.s.le %s1, %s0, %s2 ; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: b.l.t (, %s10) -; -; OPT-LABEL: min2f32: -; OPT: # %bb.0: -; OPT-NEXT: fmin.s %s0, %s0, %s1 -; OPT-NEXT: b.l.t (, %s10) %3 = fcmp ole float %0, %1 %4 = select i1 %3, float %0, float %1 ret float %4 } +define float @min2f32_fast(float, float) { +; CHECK-LABEL: min2f32_fast: +; CHECK: # %bb.0: +; CHECK-NEXT: fmin.s %s0, %s0, %s1 +; CHECK-NEXT: b.l.t (, %s10) + %3 = fcmp ole float %0, %1 + %4 = select nnan nsz i1 %3, float %0, float %1 + ret float %4 +} + define float @minuf32(float, float) { ; CHECK-LABEL: minuf32: ; CHECK: # %bb.0: @@ -112,16 +140,21 @@ define float @minuf32(float, float) { ; CHECK-NEXT: cmov.s.ltnan %s1, %s0, %s2 ; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: b.l.t (, %s10) -; -; OPT-LABEL: minuf32: -; OPT: # %bb.0: -; OPT-NEXT: fmin.s %s0, %s0, %s1 -; OPT-NEXT: b.l.t (, %s10) %3 = fcmp ult float %0, %1 %4 = select i1 %3, float %0, float %1 ret float %4 } +define float @minuf32_fast(float, float) { +; CHECK-LABEL: minuf32_fast: +; CHECK: # %bb.0: +; CHECK-NEXT: fmin.s %s0, %s0, %s1 +; CHECK-NEXT: b.l.t (, %s10) + %3 = fcmp ult float %0, %1 + %4 = select nnan nsz i1 %3, float %0, float %1 + ret float %4 +} + define float @min2uf32(float, float) { ; CHECK-LABEL: min2uf32: ; CHECK: # %bb.0: @@ -129,26 +162,26 @@ define float @min2uf32(float, float) { ; CHECK-NEXT: cmov.s.lenan %s1, %s0, %s2 ; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: b.l.t (, %s10) -; -; OPT-LABEL: min2uf32: -; OPT: # %bb.0: -; OPT-NEXT: fmin.s %s0, %s0, %s1 -; OPT-NEXT: b.l.t (, %s10) %3 = fcmp ule float %0, %1 %4 = select i1 %3, float %0, float %1 ret float %4 } +define float @min2uf32_fast(float, float) { +; CHECK-LABEL: min2uf32_fast: +; CHECK: # %bb.0: +; CHECK-NEXT: fmin.s %s0, %s0, %s1 +; CHECK-NEXT: b.l.t (, %s10) + %3 = fcmp ule float %0, %1 + %4 = select nnan nsz i1 %3, float %0, float %1 + ret float %4 +} + define i64 @mini64(i64, i64) { ; CHECK-LABEL: mini64: ; CHECK: # %bb.0: ; CHECK-NEXT: mins.l %s0, %s0, %s1 ; CHECK-NEXT: b.l.t (, %s10) -; -; OPT-LABEL: mini64: -; OPT: # %bb.0: -; OPT-NEXT: mins.l %s0, %s0, %s1 -; OPT-NEXT: b.l.t (, %s10) %3 = icmp slt i64 %0, %1 %4 = select i1 %3, i64 %0, i64 %1 ret i64 %4 @@ -159,11 +192,6 @@ define i64 @min2i64(i64, i64) { ; CHECK: # %bb.0: ; CHECK-NEXT: mins.l %s0, %s0, %s1 ; CHECK-NEXT: b.l.t (, %s10) -; -; 
OPT-LABEL: min2i64: -; OPT: # %bb.0: -; OPT-NEXT: mins.l %s0, %s0, %s1 -; OPT-NEXT: b.l.t (, %s10) %3 = icmp sle i64 %0, %1 %4 = select i1 %3, i64 %0, i64 %1 ret i64 %4 @@ -176,13 +204,6 @@ define i64 @minu64(i64, i64) { ; CHECK-NEXT: cmov.l.lt %s1, %s0, %s2 ; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: b.l.t (, %s10) -; -; OPT-LABEL: minu64: -; OPT: # %bb.0: -; OPT-NEXT: cmpu.l %s2, %s0, %s1 -; OPT-NEXT: cmov.l.lt %s1, %s0, %s2 -; OPT-NEXT: or %s0, 0, %s1 -; OPT-NEXT: b.l.t (, %s10) %3 = icmp ult i64 %0, %1 %4 = select i1 %3, i64 %0, i64 %1 ret i64 %4 @@ -195,13 +216,6 @@ define i64 @min2u64(i64, i64) { ; CHECK-NEXT: cmov.l.le %s1, %s0, %s2 ; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: b.l.t (, %s10) -; -; OPT-LABEL: min2u64: -; OPT: # %bb.0: -; OPT-NEXT: cmpu.l %s2, %s0, %s1 -; OPT-NEXT: cmov.l.le %s1, %s0, %s2 -; OPT-NEXT: or %s0, 0, %s1 -; OPT-NEXT: b.l.t (, %s10) %3 = icmp ule i64 %0, %1 %4 = select i1 %3, i64 %0, i64 %1 ret i64 %4 @@ -212,11 +226,6 @@ define i32 @mini32(i32, i32) { ; CHECK: # %bb.0: ; CHECK-NEXT: mins.w.sx %s0, %s0, %s1 ; CHECK-NEXT: b.l.t (, %s10) -; -; OPT-LABEL: mini32: -; OPT: # %bb.0: -; OPT-NEXT: mins.w.sx %s0, %s0, %s1 -; OPT-NEXT: b.l.t (, %s10) %3 = icmp slt i32 %0, %1 %4 = select i1 %3, i32 %0, i32 %1 ret i32 %4 @@ -227,11 +236,6 @@ define i32 @min2i32(i32, i32) { ; CHECK: # %bb.0: ; CHECK-NEXT: mins.w.sx %s0, %s0, %s1 ; CHECK-NEXT: b.l.t (, %s10) -; -; OPT-LABEL: min2i32: -; OPT: # %bb.0: -; OPT-NEXT: mins.w.sx %s0, %s0, %s1 -; OPT-NEXT: b.l.t (, %s10) %3 = icmp sle i32 %0, %1 %4 = select i1 %3, i32 %0, i32 %1 ret i32 %4 @@ -244,13 +248,6 @@ define i32 @minu32(i32, i32) { ; CHECK-NEXT: cmov.w.lt %s1, %s0, %s2 ; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: b.l.t (, %s10) -; -; OPT-LABEL: minu32: -; OPT: # %bb.0: -; OPT-NEXT: cmpu.w %s2, %s0, %s1 -; OPT-NEXT: cmov.w.lt %s1, %s0, %s2 -; OPT-NEXT: or %s0, 0, %s1 -; OPT-NEXT: b.l.t (, %s10) %3 = icmp ult i32 %0, %1 %4 = select i1 %3, i32 %0, i32 %1 ret i32 %4 @@ -263,13 +260,6 @@ define i32 @min2u32(i32, i32) { ; CHECK-NEXT: cmov.w.le %s1, %s0, %s2 ; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: b.l.t (, %s10) -; -; OPT-LABEL: min2u32: -; OPT: # %bb.0: -; OPT-NEXT: cmpu.w %s2, %s0, %s1 -; OPT-NEXT: cmov.w.le %s1, %s0, %s2 -; OPT-NEXT: or %s0, 0, %s1 -; OPT-NEXT: b.l.t (, %s10) %3 = icmp ule i32 %0, %1 %4 = select i1 %3, i32 %0, i32 %1 ret i32 %4 @@ -283,14 +273,6 @@ define zeroext i1 @mini1(i1 zeroext, i1 zeroext) { ; CHECK-NEXT: cmov.w.ne %s0, %s1, %s2 ; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1 ; CHECK-NEXT: b.l.t (, %s10) -; -; OPT-LABEL: mini1: -; OPT: # %bb.0: -; OPT-NEXT: and %s2, 1, %s0 -; OPT-NEXT: and %s0, %s1, %s0 -; OPT-NEXT: cmov.w.ne %s0, %s1, %s2 -; OPT-NEXT: adds.w.zx %s0, %s0, (0)1 -; OPT-NEXT: b.l.t (, %s10) %3 = xor i1 %0, true %4 = and i1 %3, %1 %5 = select i1 %4, i1 %0, i1 %1 diff --git a/llvm/test/CodeGen/WebAssembly/partial-reduce-accumulate.ll b/llvm/test/CodeGen/WebAssembly/partial-reduce-accumulate.ll index 47ea762..a599f46 100644 --- a/llvm/test/CodeGen/WebAssembly/partial-reduce-accumulate.ll +++ b/llvm/test/CodeGen/WebAssembly/partial-reduce-accumulate.ll @@ -19,11 +19,11 @@ define hidden i32 @accumulate_add_u8_u8(ptr noundef readonly %a, ptr noundef re ; MAX-BANDWIDTH: v128.load ; MAX-BANDWIDTH: i16x8.extadd_pairwise_i8x16_u ; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_u -; MAX-BANDWIDTH: i32x4.add ; MAX-BANDWIDTH: v128.load ; MAX-BANDWIDTH: i16x8.extadd_pairwise_i8x16_u ; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_u ; MAX-BANDWIDTH: i32x4.add +; MAX-BANDWIDTH: i32x4.add entry: %cmp8.not = icmp eq i32 
%N, 0 @@ -65,11 +65,11 @@ define hidden i32 @accumulate_add_s8_s8(ptr noundef readonly %a, ptr noundef re ; MAX-BANDWIDTH: v128.load ; MAX-BANDWIDTH: i16x8.extadd_pairwise_i8x16_s ; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s -; MAX-BANDWIDTH: i32x4.add ; MAX-BANDWIDTH: v128.load ; MAX-BANDWIDTH: i16x8.extadd_pairwise_i8x16_s ; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s ; MAX-BANDWIDTH: i32x4.add +; MAX-BANDWIDTH: i32x4.add entry: %cmp8.not = icmp eq i32 %N, 0 br i1 %cmp8.not, label %for.cond.cleanup, label %for.body @@ -108,12 +108,11 @@ define hidden i32 @accumulate_add_s8_u8(ptr noundef readonly %a, ptr noundef re ; MAX-BANDWIDTH: loop ; MAX-BANDWIDTH: v128.load -; MAX-BANDWIDTH: i16x8.extadd_pairwise_i8x16_s -; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s -; MAX-BANDWIDTH: i32x4.add -; MAX-BANDWIDTH: v128.load ; MAX-BANDWIDTH: i16x8.extadd_pairwise_i8x16_u ; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_u +; MAX-BANDWIDTH: v128.load +; MAX-BANDWIDTH: i16x8.extadd_pairwise_i8x16_s +; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s ; MAX-BANDWIDTH: i32x4.add entry: %cmp8.not = icmp eq i32 %N, 0 @@ -363,10 +362,10 @@ define hidden i32 @accumulate_add_u16_u16(ptr noundef readonly %a, ptr noundef ; MAX-BANDWIDTH: loop ; MAX-BANDWIDTH: v128.load ; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_u -; MAX-BANDWIDTH: i32x4.add ; MAX-BANDWIDTH: v128.load ; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_u ; MAX-BANDWIDTH: i32x4.add +; MAX-BANDWIDTH: i32x4.add entry: %cmp8.not = icmp eq i32 %N, 0 br i1 %cmp8.not, label %for.cond.cleanup, label %for.body @@ -402,10 +401,10 @@ define hidden i32 @accumulate_add_s16_s16(ptr noundef readonly %a, ptr noundef ; MAX-BANDWIDTH: loop ; MAX-BANDWIDTH: v128.load ; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s -; MAX-BANDWIDTH: i32x4.add ; MAX-BANDWIDTH: v128.load ; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s ; MAX-BANDWIDTH: i32x4.add +; MAX-BANDWIDTH: i32x4.add entry: %cmp8.not = icmp eq i32 %N, 0 br i1 %cmp8.not, label %for.cond.cleanup, label %for.body diff --git a/llvm/test/CodeGen/X86/avx10.2-intrinsic-upgrade.ll b/llvm/test/CodeGen/X86/avx10.2-intrinsic-upgrade.ll new file mode 100644 index 0000000..76d84c1 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx10.2-intrinsic-upgrade.ll @@ -0,0 +1,99 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx10.2 --show-mc-encoding | FileCheck %s --check-prefixes=X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.2 --show-mc-encoding | FileCheck %s --check-prefixes=X64 + +declare <16 x i32> @llvm.x86.avx10.vpdpbssd.512(<16 x i32>, <16 x i32>, <16 x i32>) + +define <16 x i32>@test_int_x86_avx10_vpdpbssd_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { +; X86-LABEL: test_int_x86_avx10_vpdpbssd_512: +; X86: # %bb.0: +; X86-NEXT: vpdpbssd %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x77,0x48,0x50,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx10_vpdpbssd_512: +; X64: # %bb.0: +; X64-NEXT: vpdpbssd %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x77,0x48,0x50,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx10.vpdpbssd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) + ret <16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx10.vpdpbssds.512(<16 x i32>, <16 x i32>, <16 x i32>) + +define <16 x i32>@test_int_x86_avx10_vpdpbssds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { +; X86-LABEL: test_int_x86_avx10_vpdpbssds_512: +; X86: # %bb.0: +; 
X86-NEXT: vpdpbssds %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x77,0x48,0x51,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx10_vpdpbssds_512: +; X64: # %bb.0: +; X64-NEXT: vpdpbssds %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x77,0x48,0x51,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx10.vpdpbssds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) + ret <16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx10.vpdpbsud.512(<16 x i32>, <16 x i32>, <16 x i32>) + +define <16 x i32>@test_int_x86_avx10_vpdpbsud_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { +; X86-LABEL: test_int_x86_avx10_vpdpbsud_512: +; X86: # %bb.0: +; X86-NEXT: vpdpbsud %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x76,0x48,0x50,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx10_vpdpbsud_512: +; X64: # %bb.0: +; X64-NEXT: vpdpbsud %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x76,0x48,0x50,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx10.vpdpbsud.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) + ret <16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx10.vpdpbsuds.512(<16 x i32>, <16 x i32>, <16 x i32>) + +define <16 x i32>@test_int_x86_avx10_vpdpbsuds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { +; X86-LABEL: test_int_x86_avx10_vpdpbsuds_512: +; X86: # %bb.0: +; X86-NEXT: vpdpbsuds %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x76,0x48,0x51,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx10_vpdpbsuds_512: +; X64: # %bb.0: +; X64-NEXT: vpdpbsuds %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x76,0x48,0x51,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx10.vpdpbsuds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) + ret <16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx10.vpdpbuud.512(<16 x i32>, <16 x i32>, <16 x i32>) + +define <16 x i32>@test_int_x86_avx10_vpdpbuud_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { +; X86-LABEL: test_int_x86_avx10_vpdpbuud_512: +; X86: # %bb.0: +; X86-NEXT: vpdpbuud %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x74,0x48,0x50,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx10_vpdpbuud_512: +; X64: # %bb.0: +; X64-NEXT: vpdpbuud %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x74,0x48,0x50,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx10.vpdpbuud.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) + ret <16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx10.vpdpbuuds.512(<16 x i32>, <16 x i32>, <16 x i32>) + +define <16 x i32>@test_int_x86_avx10_vpdpbuuds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { +; X86-LABEL: test_int_x86_avx10_vpdpbuuds_512: +; X86: # %bb.0: +; X86-NEXT: vpdpbuuds %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x74,0x48,0x51,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx10_vpdpbuuds_512: +; X64: # %bb.0: +; X64-NEXT: vpdpbuuds %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x74,0x48,0x51,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx10.vpdpbuuds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) + ret <16 x i32> %res +} diff --git a/llvm/test/CodeGen/X86/avx10_2_512ni-intrinsics.ll b/llvm/test/CodeGen/X86/avx10_2_512ni-intrinsics.ll index 09eb53f..a2aad60 100644 --- a/llvm/test/CodeGen/X86/avx10_2_512ni-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx10_2_512ni-intrinsics.ll @@ -53,7 +53,7 @@ declare <16 x float> 
@llvm.x86.avx10.vdpphps.512(<16 x float>, <32 x half>, <32 ; VNNI INT8 -define <16 x i32> @test_mm512_dpbssd_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr %pB) { +define <16 x i32> @test_mm512_dpbssd_epi32(<16 x i32> %__W, <64 x i8> %__A, ptr %pB) { ; X86-LABEL: test_mm512_dpbssd_epi32: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] @@ -64,12 +64,12 @@ define <16 x i32> @test_mm512_dpbssd_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr ; X64: # %bb.0: ; X64-NEXT: vpdpbssd (%rdi), %zmm1, %zmm0 # encoding: [0x62,0xf2,0x77,0x48,0x50,0x07] ; X64-NEXT: retq # encoding: [0xc3] - %__B = load <16 x i32>, ptr %pB - %res = tail call <16 x i32> @llvm.x86.avx10.vpdpbssd.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %__B = load <64 x i8>, ptr %pB + %res = tail call <16 x i32> @llvm.x86.avx10.vpdpbssd.512(<16 x i32> %__W, <64 x i8> %__A, <64 x i8> %__B) ret <16 x i32> %res } -define <16 x i32> @test_mm512_mask_dpbssds_epi32(<16 x i32> %__W, i16 zeroext %__U, <16 x i32> %__A, <16 x i32> %__B) { +define <16 x i32> @test_mm512_mask_dpbssds_epi32(<16 x i32> %__W, i16 zeroext %__U, <64 x i8> %__A, <64 x i8> %__B) { ; X86-LABEL: test_mm512_mask_dpbssds_epi32: ; X86: # %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] @@ -81,13 +81,13 @@ define <16 x i32> @test_mm512_mask_dpbssds_epi32(<16 x i32> %__W, i16 zeroext %_ ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpdpbssds %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x77,0x49,0x51,0xc2] ; X64-NEXT: retq # encoding: [0xc3] - %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpbssds.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpbssds.512(<16 x i32> %__W, <64 x i8> %__A, <64 x i8> %__B) %bst = bitcast i16 %__U to <16 x i1> %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> %__W ret <16 x i32> %res } -define <16 x i32> @test_mm512_maskz_dpbssd_epi32(i16 zeroext %__U, <16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) { +define <16 x i32> @test_mm512_maskz_dpbssd_epi32(i16 zeroext %__U, <16 x i32> %__W, <64 x i8> %__A, <64 x i8> %__B) { ; X86-LABEL: test_mm512_maskz_dpbssd_epi32: ; X86: # %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] @@ -99,16 +99,16 @@ define <16 x i32> @test_mm512_maskz_dpbssd_epi32(i16 zeroext %__U, <16 x i32> %_ ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpdpbssd %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x77,0xc9,0x50,0xc2] ; X64-NEXT: retq # encoding: [0xc3] - %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpbssd.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpbssd.512(<16 x i32> %__W, <64 x i8> %__A, <64 x i8> %__B) %bst = bitcast i16 %__U to <16 x i1> %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> zeroinitializer ret <16 x i32> %res } -declare <16 x i32> @llvm.x86.avx10.vpdpbssd.512(<16 x i32>, <16 x i32>, <16 x i32>) -declare <16 x i32> @llvm.x86.avx10.vpdpbssds.512(<16 x i32>, <16 x i32>, <16 x i32>) +declare <16 x i32> @llvm.x86.avx10.vpdpbssd.512(<16 x i32>, <64 x i8>, <64 x i8>) +declare <16 x i32> @llvm.x86.avx10.vpdpbssds.512(<16 x i32>, <64 x i8>, <64 x i8>) -define <16 x i32> @test_mm512_dpbsud_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr %pB) { +define <16 x i32> @test_mm512_dpbsud_epi32(<16 x i32> %__W, <64 x i8> %__A, ptr %pB) { ; X86-LABEL: test_mm512_dpbsud_epi32: ; X86: # %bb.0: ; X86-NEXT: movl 
{{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] @@ -119,12 +119,12 @@ define <16 x i32> @test_mm512_dpbsud_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr ; X64: # %bb.0: ; X64-NEXT: vpdpbsud (%rdi), %zmm1, %zmm0 # encoding: [0x62,0xf2,0x76,0x48,0x50,0x07] ; X64-NEXT: retq # encoding: [0xc3] - %__B = load <16 x i32>, ptr %pB - %res = tail call <16 x i32> @llvm.x86.avx10.vpdpbsud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %__B = load <64 x i8>, ptr %pB + %res = tail call <16 x i32> @llvm.x86.avx10.vpdpbsud.512(<16 x i32> %__W, <64 x i8> %__A, <64 x i8> %__B) ret <16 x i32> %res } -define <16 x i32> @test_mm512_mask_dpbsuds_epi32(<16 x i32> %__W, i16 zeroext %__U, <16 x i32> %__A, <16 x i32> %__B) { +define <16 x i32> @test_mm512_mask_dpbsuds_epi32(<16 x i32> %__W, i16 zeroext %__U, <64 x i8> %__A, <64 x i8> %__B) { ; X86-LABEL: test_mm512_mask_dpbsuds_epi32: ; X86: # %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] @@ -136,13 +136,13 @@ define <16 x i32> @test_mm512_mask_dpbsuds_epi32(<16 x i32> %__W, i16 zeroext %_ ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpdpbsuds %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x76,0x49,0x51,0xc2] ; X64-NEXT: retq # encoding: [0xc3] - %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpbsuds.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpbsuds.512(<16 x i32> %__W, <64 x i8> %__A, <64 x i8> %__B) %bst = bitcast i16 %__U to <16 x i1> %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> %__W ret <16 x i32> %res } -define <16 x i32> @test_mm512_maskz_dpbsud_epi32(i16 zeroext %__U, <16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) { +define <16 x i32> @test_mm512_maskz_dpbsud_epi32(i16 zeroext %__U, <16 x i32> %__W, <64 x i8> %__A, <64 x i8> %__B) { ; X86-LABEL: test_mm512_maskz_dpbsud_epi32: ; X86: # %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] @@ -154,16 +154,16 @@ define <16 x i32> @test_mm512_maskz_dpbsud_epi32(i16 zeroext %__U, <16 x i32> %_ ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpdpbsud %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0xc9,0x50,0xc2] ; X64-NEXT: retq # encoding: [0xc3] - %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpbsud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpbsud.512(<16 x i32> %__W, <64 x i8> %__A, <64 x i8> %__B) %bst = bitcast i16 %__U to <16 x i1> %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> zeroinitializer ret <16 x i32> %res } -declare <16 x i32> @llvm.x86.avx10.vpdpbsud.512(<16 x i32>, <16 x i32>, <16 x i32>) -declare <16 x i32> @llvm.x86.avx10.vpdpbsuds.512(<16 x i32>, <16 x i32>, <16 x i32>) +declare <16 x i32> @llvm.x86.avx10.vpdpbsud.512(<16 x i32>, <64 x i8>, <64 x i8>) +declare <16 x i32> @llvm.x86.avx10.vpdpbsuds.512(<16 x i32>, <64 x i8>, <64 x i8>) -define <16 x i32> @test_mm512_dpbuud_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr %pB) { +define <16 x i32> @test_mm512_dpbuud_epi32(<16 x i32> %__W, <64 x i8> %__A, ptr %pB) { ; X86-LABEL: test_mm512_dpbuud_epi32: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] @@ -174,12 +174,12 @@ define <16 x i32> @test_mm512_dpbuud_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr ; X64: # %bb.0: ; X64-NEXT: vpdpbuud (%rdi), %zmm1, %zmm0 # encoding: [0x62,0xf2,0x74,0x48,0x50,0x07] ; X64-NEXT: retq # encoding: [0xc3] - %__B = load 
<16 x i32>, ptr %pB - %res = tail call <16 x i32> @llvm.x86.avx10.vpdpbuud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %__B = load <64 x i8>, ptr %pB + %res = tail call <16 x i32> @llvm.x86.avx10.vpdpbuud.512(<16 x i32> %__W, <64 x i8> %__A, <64 x i8> %__B) ret <16 x i32> %res } -define <16 x i32> @test_mm512_mask_dpbuuds_epi32(<16 x i32> %__W, i16 zeroext %__U, <16 x i32> %__A, <16 x i32> %__B) { +define <16 x i32> @test_mm512_mask_dpbuuds_epi32(<16 x i32> %__W, i16 zeroext %__U, <64 x i8> %__A, <64 x i8> %__B) { ; X86-LABEL: test_mm512_mask_dpbuuds_epi32: ; X86: # %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] @@ -191,13 +191,13 @@ define <16 x i32> @test_mm512_mask_dpbuuds_epi32(<16 x i32> %__W, i16 zeroext %_ ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpdpbuuds %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x74,0x49,0x51,0xc2] ; X64-NEXT: retq # encoding: [0xc3] - %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpbuuds.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpbuuds.512(<16 x i32> %__W, <64 x i8> %__A, <64 x i8> %__B) %bst = bitcast i16 %__U to <16 x i1> %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> %__W ret <16 x i32> %res } -define <16 x i32> @test_mm512_maskz_dpbuud_epi32(i16 zeroext %__U, <16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) { +define <16 x i32> @test_mm512_maskz_dpbuud_epi32(i16 zeroext %__U, <16 x i32> %__W, <64 x i8> %__A, <64 x i8> %__B) { ; X86-LABEL: test_mm512_maskz_dpbuud_epi32: ; X86: # %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] @@ -209,14 +209,14 @@ define <16 x i32> @test_mm512_maskz_dpbuud_epi32(i16 zeroext %__U, <16 x i32> %_ ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpdpbuud %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x74,0xc9,0x50,0xc2] ; X64-NEXT: retq # encoding: [0xc3] - %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpbuud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpbuud.512(<16 x i32> %__W, <64 x i8> %__A, <64 x i8> %__B) %bst = bitcast i16 %__U to <16 x i1> %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> zeroinitializer ret <16 x i32> %res } -declare <16 x i32> @llvm.x86.avx10.vpdpbuud.512(<16 x i32>, <16 x i32>, <16 x i32>) -declare <16 x i32> @llvm.x86.avx10.vpdpbuuds.512(<16 x i32>, <16 x i32>, <16 x i32>) +declare <16 x i32> @llvm.x86.avx10.vpdpbuud.512(<16 x i32>, <64 x i8>, <64 x i8>) +declare <16 x i32> @llvm.x86.avx10.vpdpbuuds.512(<16 x i32>, <64 x i8>, <64 x i8>) ; VNNI INT16 diff --git a/llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll b/llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll index 0c5fd3b..1f270d5 100644 --- a/llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll @@ -101,7 +101,7 @@ declare <8 x float> @llvm.x86.avx10.vdpphps.256(<8 x float>, <16 x half>, <16 x ; VNNI INT8 -define <4 x i32> @test_mm_mask_dpbssd_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 x i32> %__A, <4 x i32> %__B) { +define <4 x i32> @test_mm_mask_dpbssd_epi32(<4 x i32> %__W, i4 zeroext %__U, <16 x i8> %__A, <16 x i8> %__B) { ; X86-LABEL: test_mm_mask_dpbssd_epi32: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] @@ -113,13 +113,13 @@ define <4 x i32> @test_mm_mask_dpbssd_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 ; X64-NEXT: kmovd %edi, %k1 # encoding: 
[0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpdpbssd %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x77,0x09,0x50,0xc2] ; X64-NEXT: retq # encoding: [0xc3] - %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) + %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> %__W, <16 x i8> %__A, <16 x i8> %__B) %bst = bitcast i4 %__U to <4 x i1> %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> %__W ret <4 x i32> %res } -define <4 x i32> @test_mm_maskz_dpbssds_epi32(i4 zeroext %__U, <4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) { +define <4 x i32> @test_mm_maskz_dpbssds_epi32(i4 zeroext %__U, <4 x i32> %__W, <16 x i8> %__A, <16 x i8> %__B) { ; X86-LABEL: test_mm_maskz_dpbssds_epi32: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] @@ -131,13 +131,13 @@ define <4 x i32> @test_mm_maskz_dpbssds_epi32(i4 zeroext %__U, <4 x i32> %__W, < ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpdpbssds %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x77,0x89,0x51,0xc2] ; X64-NEXT: retq # encoding: [0xc3] - %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) + %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> %__W, <16 x i8> %__A, <16 x i8> %__B) %bst = bitcast i4 %__U to <4 x i1> %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> zeroinitializer ret <4 x i32> %res } -define <8 x i32> @test_mm256_maskz_dpbssds_epi32(<8 x i32> %__W, i8 zeroext %__U, <8 x i32> %__A, <8 x i32> %__B) { +define <8 x i32> @test_mm256_maskz_dpbssds_epi32(<8 x i32> %__W, i8 zeroext %__U, <32 x i8> %__A, <32 x i8> %__B) { ; X86-LABEL: test_mm256_maskz_dpbssds_epi32: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] @@ -149,13 +149,13 @@ define <8 x i32> @test_mm256_maskz_dpbssds_epi32(<8 x i32> %__W, i8 zeroext %__U ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpdpbssds %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x77,0x29,0x51,0xc2] ; X64-NEXT: retq # encoding: [0xc3] - %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) + %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> %__W, <32 x i8> %__A, <32 x i8> %__B) %bst = bitcast i8 %__U to <8 x i1> %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> %__W ret <8 x i32> %res } -define <8 x i32> @test_mm256_mask_dpbssd_epi32(i8 zeroext %__U, <8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) { +define <8 x i32> @test_mm256_mask_dpbssd_epi32(i8 zeroext %__U, <8 x i32> %__W, <32 x i8> %__A, <32 x i8> %__B) { ; X86-LABEL: test_mm256_mask_dpbssd_epi32: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] @@ -167,18 +167,18 @@ define <8 x i32> @test_mm256_mask_dpbssd_epi32(i8 zeroext %__U, <8 x i32> %__W, ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpdpbssd %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x77,0xa9,0x50,0xc2] ; X64-NEXT: retq # encoding: [0xc3] - %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) + %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> %__W, <32 x i8> %__A, <32 x i8> %__B) %bst = bitcast i8 %__U to <8 x i1> %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> zeroinitializer ret <8 x i32> %res } -declare <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32>, <4 x i32>, 
<4 x i32>) -declare <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32>, <4 x i32>, <4 x i32>) -declare <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32>, <8 x i32>, <8 x i32>) -declare <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32>, <16 x i8>, <16 x i8>) +declare <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32>, <16 x i8>, <16 x i8>) +declare <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32>, <32 x i8>, <32 x i8>) +declare <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32>, <32 x i8>, <32 x i8>) -define <4 x i32> @test_mm_mask_dpbsud_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 x i32> %__A, <4 x i32> %__B) { +define <4 x i32> @test_mm_mask_dpbsud_epi32(<4 x i32> %__W, i4 zeroext %__U, <16 x i8> %__A, <16 x i8> %__B) { ; X86-LABEL: test_mm_mask_dpbsud_epi32: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] @@ -190,13 +190,13 @@ define <4 x i32> @test_mm_mask_dpbsud_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpdpbsud %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x76,0x09,0x50,0xc2] ; X64-NEXT: retq # encoding: [0xc3] - %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) + %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> %__W, <16 x i8> %__A, <16 x i8> %__B) %bst = bitcast i4 %__U to <4 x i1> %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> %__W ret <4 x i32> %res } -define <4 x i32> @test_mm_maskz_dpbsuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) { +define <4 x i32> @test_mm_maskz_dpbsuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <16 x i8> %__A, <16 x i8> %__B) { ; X86-LABEL: test_mm_maskz_dpbsuds_epi32: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] @@ -208,13 +208,13 @@ define <4 x i32> @test_mm_maskz_dpbsuds_epi32(i4 zeroext %__U, <4 x i32> %__W, < ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpdpbsuds %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0x89,0x51,0xc2] ; X64-NEXT: retq # encoding: [0xc3] - %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) + %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> %__W, <16 x i8> %__A, <16 x i8> %__B) %bst = bitcast i4 %__U to <4 x i1> %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> zeroinitializer ret <4 x i32> %res } -define <8 x i32> @test_mm256_maskz_dpbsuds_epi32(<8 x i32> %__W, i8 zeroext %__U, <8 x i32> %__A, <8 x i32> %__B) { +define <8 x i32> @test_mm256_maskz_dpbsuds_epi32(<8 x i32> %__W, i8 zeroext %__U, <32 x i8> %__A, <32 x i8> %__B) { ; X86-LABEL: test_mm256_maskz_dpbsuds_epi32: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] @@ -226,13 +226,13 @@ define <8 x i32> @test_mm256_maskz_dpbsuds_epi32(<8 x i32> %__W, i8 zeroext %__U ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpdpbsuds %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x76,0x29,0x51,0xc2] ; X64-NEXT: retq # encoding: [0xc3] - %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) + %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> %__W, <32 x i8> %__A, <32 x i8> %__B) %bst = bitcast i8 %__U to <8 x i1> %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> %__W ret <8 x i32> 
%res } -define <8 x i32> @test_mm256_mask_dpbsud_epi32(i8 zeroext %__U, <8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) { +define <8 x i32> @test_mm256_mask_dpbsud_epi32(i8 zeroext %__U, <8 x i32> %__W, <32 x i8> %__A, <32 x i8> %__B) { ; X86-LABEL: test_mm256_mask_dpbsud_epi32: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] @@ -244,18 +244,18 @@ define <8 x i32> @test_mm256_mask_dpbsud_epi32(i8 zeroext %__U, <8 x i32> %__W, ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpdpbsud %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0xa9,0x50,0xc2] ; X64-NEXT: retq # encoding: [0xc3] - %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) + %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> %__W, <32 x i8> %__A, <32 x i8> %__B) %bst = bitcast i8 %__U to <8 x i1> %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> zeroinitializer ret <8 x i32> %res } -declare <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32>, <4 x i32>, <4 x i32>) -declare <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32>, <4 x i32>, <4 x i32>) -declare <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32>, <8 x i32>, <8 x i32>) -declare <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32>, <16 x i8>, <16 x i8>) +declare <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32>, <16 x i8>, <16 x i8>) +declare <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32>, <32 x i8>, <32 x i8>) +declare <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32>, <32 x i8>, <32 x i8>) -define <4 x i32> @test_mm_mask_dpbuud_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 x i32> %__A, <4 x i32> %__B) { +define <4 x i32> @test_mm_mask_dpbuud_epi32(<4 x i32> %__W, i4 zeroext %__U, <16 x i8> %__A, <16 x i8> %__B) { ; X86-LABEL: test_mm_mask_dpbuud_epi32: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] @@ -267,13 +267,13 @@ define <4 x i32> @test_mm_mask_dpbuud_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpdpbuud %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x74,0x09,0x50,0xc2] ; X64-NEXT: retq # encoding: [0xc3] - %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) + %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> %__W, <16 x i8> %__A, <16 x i8> %__B) %bst = bitcast i4 %__U to <4 x i1> %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> %__W ret <4 x i32> %res } -define <4 x i32> @test_mm_maskz_dpbuuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) { +define <4 x i32> @test_mm_maskz_dpbuuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <16 x i8> %__A, <16 x i8> %__B) { ; X86-LABEL: test_mm_maskz_dpbuuds_epi32: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] @@ -285,13 +285,13 @@ define <4 x i32> @test_mm_maskz_dpbuuds_epi32(i4 zeroext %__U, <4 x i32> %__W, < ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpdpbuuds %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x74,0x89,0x51,0xc2] ; X64-NEXT: retq # encoding: [0xc3] - %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) + %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> %__W, <16 x i8> %__A, <16 x i8> %__B) %bst = bitcast i4 %__U to <4 x i1> 
%res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> zeroinitializer ret <4 x i32> %res } -define <8 x i32> @test_mm256_maskz_dpbuuds_epi32(<8 x i32> %__W, i8 zeroext %__U, <8 x i32> %__A, <8 x i32> %__B) { +define <8 x i32> @test_mm256_maskz_dpbuuds_epi32(<8 x i32> %__W, i8 zeroext %__U, <32 x i8> %__A, <32 x i8> %__B) { ; X86-LABEL: test_mm256_maskz_dpbuuds_epi32: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] @@ -303,13 +303,13 @@ define <8 x i32> @test_mm256_maskz_dpbuuds_epi32(<8 x i32> %__W, i8 zeroext %__U ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpdpbuuds %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x74,0x29,0x51,0xc2] ; X64-NEXT: retq # encoding: [0xc3] - %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) + %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> %__W, <32 x i8> %__A, <32 x i8> %__B) %bst = bitcast i8 %__U to <8 x i1> %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> %__W ret <8 x i32> %res } -define <8 x i32> @test_mm256_mask_dpbuud_epi32(i8 zeroext %__U, <8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) { +define <8 x i32> @test_mm256_mask_dpbuud_epi32(i8 zeroext %__U, <8 x i32> %__W, <32 x i8> %__A, <32 x i8> %__B) { ; X86-LABEL: test_mm256_mask_dpbuud_epi32: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] @@ -321,16 +321,16 @@ define <8 x i32> @test_mm256_mask_dpbuud_epi32(i8 zeroext %__U, <8 x i32> %__W, ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpdpbuud %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x74,0xa9,0x50,0xc2] ; X64-NEXT: retq # encoding: [0xc3] - %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) + %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> %__W, <32 x i8> %__A, <32 x i8> %__B) %bst = bitcast i8 %__U to <8 x i1> %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> zeroinitializer ret <8 x i32> %res } -declare <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32>, <4 x i32>, <4 x i32>) -declare <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32>, <4 x i32>, <4 x i32>) -declare <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32>, <8 x i32>, <8 x i32>) -declare <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32>, <16 x i8>, <16 x i8>) +declare <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32>, <16 x i8>, <16 x i8>) +declare <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32>, <32 x i8>, <32 x i8>) +declare <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32>, <32 x i8>, <32 x i8>) ; VNNI INT16 diff --git a/llvm/test/CodeGen/X86/avxvnniint8-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avxvnniint8-intrinsics-upgrade.ll new file mode 100644 index 0000000..ce9a0fb --- /dev/null +++ b/llvm/test/CodeGen/X86/avxvnniint8-intrinsics-upgrade.ll @@ -0,0 +1,318 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avxvnniint8 --show-mc-encoding | FileCheck %s --check-prefixes=X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avxvnniint8 --show-mc-encoding | FileCheck %s --check-prefixes=X64 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx10.2 --show-mc-encoding | FileCheck %s --check-prefixes=AVX10-X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.2 
--show-mc-encoding | FileCheck %s --check-prefixes=AVX10-X64 + +declare <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32>, <4 x i32>, <4 x i32>) + +define <4 x i32>@test_int_x86_avx2_vpdpbssd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) { +; X86-LABEL: test_int_x86_avx2_vpdpbssd_128: +; X86: # %bb.0: +; X86-NEXT: vpdpbssd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x73,0x50,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx2_vpdpbssd_128: +; X64: # %bb.0: +; X64-NEXT: vpdpbssd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x73,0x50,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; AVX10-X86-LABEL: test_int_x86_avx2_vpdpbssd_128: +; AVX10-X86: # %bb.0: +; AVX10-X86-NEXT: vpdpbssd %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x77,0x08,0x50,0xc2] +; AVX10-X86-NEXT: retl # encoding: [0xc3] +; +; AVX10-X64-LABEL: test_int_x86_avx2_vpdpbssd_128: +; AVX10-X64: # %bb.0: +; AVX10-X64-NEXT: vpdpbssd %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x77,0x08,0x50,0xc2] +; AVX10-X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + ret <4 x i32> %res +} + +declare <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32>, <4 x i32>, <4 x i32>) + +define <4 x i32>@test_int_x86_avx2_vpdpbssds_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) { +; X86-LABEL: test_int_x86_avx2_vpdpbssds_128: +; X86: # %bb.0: +; X86-NEXT: vpdpbssds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x73,0x51,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx2_vpdpbssds_128: +; X64: # %bb.0: +; X64-NEXT: vpdpbssds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x73,0x51,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; AVX10-X86-LABEL: test_int_x86_avx2_vpdpbssds_128: +; AVX10-X86: # %bb.0: +; AVX10-X86-NEXT: vpdpbssds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x77,0x08,0x51,0xc2] +; AVX10-X86-NEXT: retl # encoding: [0xc3] +; +; AVX10-X64-LABEL: test_int_x86_avx2_vpdpbssds_128: +; AVX10-X64: # %bb.0: +; AVX10-X64-NEXT: vpdpbssds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x77,0x08,0x51,0xc2] +; AVX10-X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + ret <4 x i32> %res +} + +declare <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32>, <8 x i32>, <8 x i32>) + +define <8 x i32>@test_int_x86_avx2_vpdpbssd_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) { +; X86-LABEL: test_int_x86_avx2_vpdpbssd_256: +; X86: # %bb.0: +; X86-NEXT: vpdpbssd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x77,0x50,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx2_vpdpbssd_256: +; X64: # %bb.0: +; X64-NEXT: vpdpbssd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x77,0x50,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; AVX10-X86-LABEL: test_int_x86_avx2_vpdpbssd_256: +; AVX10-X86: # %bb.0: +; AVX10-X86-NEXT: vpdpbssd %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x77,0x28,0x50,0xc2] +; AVX10-X86-NEXT: retl # encoding: [0xc3] +; +; AVX10-X64-LABEL: test_int_x86_avx2_vpdpbssd_256: +; AVX10-X64: # %bb.0: +; AVX10-X64-NEXT: vpdpbssd %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x77,0x28,0x50,0xc2] +; AVX10-X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + ret <8 x i32> %res +} + +declare <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32>, <8 x i32>, <8 x i32>) + +define <8 x i32>@test_int_x86_avx2_vpdpbssds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) { +; X86-LABEL: 
test_int_x86_avx2_vpdpbssds_256: +; X86: # %bb.0: +; X86-NEXT: vpdpbssds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x77,0x51,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx2_vpdpbssds_256: +; X64: # %bb.0: +; X64-NEXT: vpdpbssds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x77,0x51,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; AVX10-X86-LABEL: test_int_x86_avx2_vpdpbssds_256: +; AVX10-X86: # %bb.0: +; AVX10-X86-NEXT: vpdpbssds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x77,0x28,0x51,0xc2] +; AVX10-X86-NEXT: retl # encoding: [0xc3] +; +; AVX10-X64-LABEL: test_int_x86_avx2_vpdpbssds_256: +; AVX10-X64: # %bb.0: +; AVX10-X64-NEXT: vpdpbssds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x77,0x28,0x51,0xc2] +; AVX10-X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + ret <8 x i32> %res +} + +declare <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32>, <4 x i32>, <4 x i32>) + +define <4 x i32>@test_int_x86_avx2_vpdpbsud_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) { +; X86-LABEL: test_int_x86_avx2_vpdpbsud_128: +; X86: # %bb.0: +; X86-NEXT: vpdpbsud %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x72,0x50,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx2_vpdpbsud_128: +; X64: # %bb.0: +; X64-NEXT: vpdpbsud %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x72,0x50,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; AVX10-X86-LABEL: test_int_x86_avx2_vpdpbsud_128: +; AVX10-X86: # %bb.0: +; AVX10-X86-NEXT: vpdpbsud %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x76,0x08,0x50,0xc2] +; AVX10-X86-NEXT: retl # encoding: [0xc3] +; +; AVX10-X64-LABEL: test_int_x86_avx2_vpdpbsud_128: +; AVX10-X64: # %bb.0: +; AVX10-X64-NEXT: vpdpbsud %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x76,0x08,0x50,0xc2] +; AVX10-X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + ret <4 x i32> %res +} + +declare <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32>, <4 x i32>, <4 x i32>) + +define <4 x i32>@test_int_x86_avx2_vpdpbsuds_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) { +; X86-LABEL: test_int_x86_avx2_vpdpbsuds_128: +; X86: # %bb.0: +; X86-NEXT: vpdpbsuds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x72,0x51,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx2_vpdpbsuds_128: +; X64: # %bb.0: +; X64-NEXT: vpdpbsuds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x72,0x51,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; AVX10-X86-LABEL: test_int_x86_avx2_vpdpbsuds_128: +; AVX10-X86: # %bb.0: +; AVX10-X86-NEXT: vpdpbsuds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x76,0x08,0x51,0xc2] +; AVX10-X86-NEXT: retl # encoding: [0xc3] +; +; AVX10-X64-LABEL: test_int_x86_avx2_vpdpbsuds_128: +; AVX10-X64: # %bb.0: +; AVX10-X64-NEXT: vpdpbsuds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x76,0x08,0x51,0xc2] +; AVX10-X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + ret <4 x i32> %res +} + +declare <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32>, <8 x i32>, <8 x i32>) + +define <8 x i32>@test_int_x86_avx2_vpdpbsud_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) { +; X86-LABEL: test_int_x86_avx2_vpdpbsud_256: +; X86: # %bb.0: +; X86-NEXT: vpdpbsud %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x76,0x50,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx2_vpdpbsud_256: +; X64: # %bb.0: +; X64-NEXT: vpdpbsud 
%ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x76,0x50,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; AVX10-X86-LABEL: test_int_x86_avx2_vpdpbsud_256: +; AVX10-X86: # %bb.0: +; AVX10-X86-NEXT: vpdpbsud %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x76,0x28,0x50,0xc2] +; AVX10-X86-NEXT: retl # encoding: [0xc3] +; +; AVX10-X64-LABEL: test_int_x86_avx2_vpdpbsud_256: +; AVX10-X64: # %bb.0: +; AVX10-X64-NEXT: vpdpbsud %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x76,0x28,0x50,0xc2] +; AVX10-X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + ret <8 x i32> %res +} + +declare <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32>, <8 x i32>, <8 x i32>) + +define <8 x i32>@test_int_x86_avx2_vpdpbsuds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) { +; X86-LABEL: test_int_x86_avx2_vpdpbsuds_256: +; X86: # %bb.0: +; X86-NEXT: vpdpbsuds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x76,0x51,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx2_vpdpbsuds_256: +; X64: # %bb.0: +; X64-NEXT: vpdpbsuds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x76,0x51,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; AVX10-X86-LABEL: test_int_x86_avx2_vpdpbsuds_256: +; AVX10-X86: # %bb.0: +; AVX10-X86-NEXT: vpdpbsuds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x76,0x28,0x51,0xc2] +; AVX10-X86-NEXT: retl # encoding: [0xc3] +; +; AVX10-X64-LABEL: test_int_x86_avx2_vpdpbsuds_256: +; AVX10-X64: # %bb.0: +; AVX10-X64-NEXT: vpdpbsuds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x76,0x28,0x51,0xc2] +; AVX10-X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + ret <8 x i32> %res +} + +declare <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32>, <4 x i32>, <4 x i32>) + +define <4 x i32>@test_int_x86_avx2_vpdpbuud(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) { +; X86-LABEL: test_int_x86_avx2_vpdpbuud: +; X86: # %bb.0: +; X86-NEXT: vpdpbuud %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x70,0x50,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx2_vpdpbuud: +; X64: # %bb.0: +; X64-NEXT: vpdpbuud %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x70,0x50,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; AVX10-X86-LABEL: test_int_x86_avx2_vpdpbuud: +; AVX10-X86: # %bb.0: +; AVX10-X86-NEXT: vpdpbuud %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x74,0x08,0x50,0xc2] +; AVX10-X86-NEXT: retl # encoding: [0xc3] +; +; AVX10-X64-LABEL: test_int_x86_avx2_vpdpbuud: +; AVX10-X64: # %bb.0: +; AVX10-X64-NEXT: vpdpbuud %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x74,0x08,0x50,0xc2] +; AVX10-X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + ret <4 x i32> %res +} + +declare <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32>, <4 x i32>, <4 x i32>) + +define <4 x i32>@test_int_x86_avx2_vpdpbuuds_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) { +; X86-LABEL: test_int_x86_avx2_vpdpbuuds_128: +; X86: # %bb.0: +; X86-NEXT: vpdpbuuds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x70,0x51,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx2_vpdpbuuds_128: +; X64: # %bb.0: +; X64-NEXT: vpdpbuuds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x70,0x51,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; AVX10-X86-LABEL: test_int_x86_avx2_vpdpbuuds_128: +; AVX10-X86: # %bb.0: +; AVX10-X86-NEXT: vpdpbuuds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x74,0x08,0x51,0xc2] +; 
AVX10-X86-NEXT: retl # encoding: [0xc3] +; +; AVX10-X64-LABEL: test_int_x86_avx2_vpdpbuuds_128: +; AVX10-X64: # %bb.0: +; AVX10-X64-NEXT: vpdpbuuds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x74,0x08,0x51,0xc2] +; AVX10-X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + ret <4 x i32> %res +} + +declare <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32>, <8 x i32>, <8 x i32>) + +define <8 x i32>@test_int_x86_avx2_vpdpbuud_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) { +; X86-LABEL: test_int_x86_avx2_vpdpbuud_256: +; X86: # %bb.0: +; X86-NEXT: vpdpbuud %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x74,0x50,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx2_vpdpbuud_256: +; X64: # %bb.0: +; X64-NEXT: vpdpbuud %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x74,0x50,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; AVX10-X86-LABEL: test_int_x86_avx2_vpdpbuud_256: +; AVX10-X86: # %bb.0: +; AVX10-X86-NEXT: vpdpbuud %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x74,0x28,0x50,0xc2] +; AVX10-X86-NEXT: retl # encoding: [0xc3] +; +; AVX10-X64-LABEL: test_int_x86_avx2_vpdpbuud_256: +; AVX10-X64: # %bb.0: +; AVX10-X64-NEXT: vpdpbuud %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x74,0x28,0x50,0xc2] +; AVX10-X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + ret <8 x i32> %res +} + +declare <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32>, <8 x i32>, <8 x i32>) + +define <8 x i32>@test_int_x86_avx2_vpdpbuuds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) { +; X86-LABEL: test_int_x86_avx2_vpdpbuuds_256: +; X86: # %bb.0: +; X86-NEXT: vpdpbuuds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x74,0x51,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx2_vpdpbuuds_256: +; X64: # %bb.0: +; X64-NEXT: vpdpbuuds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x74,0x51,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; AVX10-X86-LABEL: test_int_x86_avx2_vpdpbuuds_256: +; AVX10-X86: # %bb.0: +; AVX10-X86-NEXT: vpdpbuuds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x74,0x28,0x51,0xc2] +; AVX10-X86-NEXT: retl # encoding: [0xc3] +; +; AVX10-X64-LABEL: test_int_x86_avx2_vpdpbuuds_256: +; AVX10-X64: # %bb.0: +; AVX10-X64-NEXT: vpdpbuuds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x74,0x28,0x51,0xc2] +; AVX10-X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + ret <8 x i32> %res +} + diff --git a/llvm/test/CodeGen/X86/avxvnniint8-intrinsics.ll b/llvm/test/CodeGen/X86/avxvnniint8-intrinsics.ll index 0ddd017..6c3d90aa 100644 --- a/llvm/test/CodeGen/X86/avxvnniint8-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avxvnniint8-intrinsics.ll @@ -5,9 +5,9 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.2 --show-mc-encoding | FileCheck %s --check-prefixes=AVX10-X64 -declare <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32>, <16 x i8>, <16 x i8>) -define <4 x i32>@test_int_x86_avx2_vpdpbssd_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x i32> %x4) { +define <4 x i32>@test_int_x86_avx2_vpdpbssd_128(<4 x i32> %x0, <16 x i8> %x1, ptr %x2p, <16 x i8> %x4) { ; X86-LABEL: test_int_x86_avx2_vpdpbssd_128: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] @@ -41,16 +41,16 @@ define <4 x i32>@test_int_x86_avx2_vpdpbssd_128(<4 x i32> %x0, 
<4 x i32> %x1, pt ; AVX10-X64-NEXT: vpdpbssd %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x77,0x08,0x50,0xc2] ; AVX10-X64-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0] ; AVX10-X64-NEXT: retq # encoding: [0xc3] - %x2 = load <4 x i32>, ptr %x2p - %1 = call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) - %2 = call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4) + %x2 = load <16 x i8>, ptr %x2p + %1 = call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2) + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x4) %res = add <4 x i32> %1, %2 ret <4 x i32> %res } -declare <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32>, <16 x i8>, <16 x i8>) -define <4 x i32>@test_int_x86_avx2_vpdpbssds_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x i32> %x4) { +define <4 x i32>@test_int_x86_avx2_vpdpbssds_128(<4 x i32> %x0, <16 x i8> %x1, ptr %x2p, <16 x i8> %x4) { ; X86-LABEL: test_int_x86_avx2_vpdpbssds_128: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] @@ -84,16 +84,16 @@ define <4 x i32>@test_int_x86_avx2_vpdpbssds_128(<4 x i32> %x0, <4 x i32> %x1, p ; AVX10-X64-NEXT: vpdpbssds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x77,0x08,0x51,0xc2] ; AVX10-X64-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0] ; AVX10-X64-NEXT: retq # encoding: [0xc3] - %x2 = load <4 x i32>, ptr %x2p - %1 = call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) - %2 = call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4) + %x2 = load <16 x i8>, ptr %x2p + %1 = call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2) + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x4) %res = add <4 x i32> %1, %2 ret <4 x i32> %res } -declare <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32>, <32 x i8>, <32 x i8>) -define <8 x i32>@test_int_x86_avx2_vpdpbssd_256(<8 x i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4) { +define <8 x i32>@test_int_x86_avx2_vpdpbssd_256(<8 x i32> %x0, <32 x i8> %x1, ptr %x2p, <32 x i8> %x4) { ; X86-LABEL: test_int_x86_avx2_vpdpbssd_256: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] @@ -127,16 +127,16 @@ define <8 x i32>@test_int_x86_avx2_vpdpbssd_256(<8 x i32> %x0, <8 x i32> %x1, pt ; AVX10-X64-NEXT: vpdpbssd %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x77,0x28,0x50,0xc2] ; AVX10-X64-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0] ; AVX10-X64-NEXT: retq # encoding: [0xc3] - %x2 = load <8 x i32>, ptr %x2p - %1 = call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) - %2 = call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4) + %x2 = load <32 x i8>, ptr %x2p + %1 = call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2) + %2 = call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x4) %res = add <8 x i32> %1, %2 ret <8 x i32> %res } -declare <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <8 x i32> 
@llvm.x86.avx2.vpdpbssds.256(<8 x i32>, <32 x i8>, <32 x i8>) -define <8 x i32>@test_int_x86_avx2_vpdpbssds_256(<8 x i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4) { +define <8 x i32>@test_int_x86_avx2_vpdpbssds_256(<8 x i32> %x0, <32 x i8> %x1, ptr %x2p, <32 x i8> %x4) { ; X86-LABEL: test_int_x86_avx2_vpdpbssds_256: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] @@ -170,16 +170,16 @@ define <8 x i32>@test_int_x86_avx2_vpdpbssds_256(<8 x i32> %x0, <8 x i32> %x1, p ; AVX10-X64-NEXT: vpdpbssds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x77,0x28,0x51,0xc2] ; AVX10-X64-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0] ; AVX10-X64-NEXT: retq # encoding: [0xc3] - %x2 = load <8 x i32>, ptr %x2p - %1 = call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) - %2 = call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4) + %x2 = load <32 x i8>, ptr %x2p + %1 = call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2) + %2 = call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x4) %res = add <8 x i32> %1, %2 ret <8 x i32> %res } -declare <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32>, <16 x i8>, <16 x i8>) -define <4 x i32>@test_int_x86_avx2_vpdpbsud_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x i32> %x4) { +define <4 x i32>@test_int_x86_avx2_vpdpbsud_128(<4 x i32> %x0, <16 x i8> %x1, ptr %x2p, <16 x i8> %x4) { ; X86-LABEL: test_int_x86_avx2_vpdpbsud_128: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] @@ -213,16 +213,16 @@ define <4 x i32>@test_int_x86_avx2_vpdpbsud_128(<4 x i32> %x0, <4 x i32> %x1, pt ; AVX10-X64-NEXT: vpdpbsud %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x76,0x08,0x50,0xc2] ; AVX10-X64-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0] ; AVX10-X64-NEXT: retq # encoding: [0xc3] - %x2 = load <4 x i32>, ptr %x2p - %1 = call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) - %2 = call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4) + %x2 = load <16 x i8>, ptr %x2p + %1 = call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2) + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x4) %res = add <4 x i32> %1, %2 ret <4 x i32> %res } -declare <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32>, <16 x i8>, <16 x i8>) -define <4 x i32>@test_int_x86_avx2_vpdpbsuds_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x i32> %x4) { +define <4 x i32>@test_int_x86_avx2_vpdpbsuds_128(<4 x i32> %x0, <16 x i8> %x1, ptr %x2p, <16 x i8> %x4) { ; X86-LABEL: test_int_x86_avx2_vpdpbsuds_128: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] @@ -256,16 +256,16 @@ define <4 x i32>@test_int_x86_avx2_vpdpbsuds_128(<4 x i32> %x0, <4 x i32> %x1, p ; AVX10-X64-NEXT: vpdpbsuds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x76,0x08,0x51,0xc2] ; AVX10-X64-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0] ; AVX10-X64-NEXT: retq # encoding: [0xc3] - %x2 = load <4 x i32>, ptr %x2p - %1 = call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> %x0, <4 x i32> 
%x1, <4 x i32> %x2) - %2 = call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4) + %x2 = load <16 x i8>, ptr %x2p + %1 = call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2) + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x4) %res = add <4 x i32> %1, %2 ret <4 x i32> %res } -declare <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32>, <32 x i8>, <32 x i8>) -define <8 x i32>@test_int_x86_avx2_vpdpbsud_256(<8 x i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4) { +define <8 x i32>@test_int_x86_avx2_vpdpbsud_256(<8 x i32> %x0, <32 x i8> %x1, ptr %x2p, <32 x i8> %x4) { ; X86-LABEL: test_int_x86_avx2_vpdpbsud_256: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] @@ -299,16 +299,16 @@ define <8 x i32>@test_int_x86_avx2_vpdpbsud_256(<8 x i32> %x0, <8 x i32> %x1, pt ; AVX10-X64-NEXT: vpdpbsud %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x76,0x28,0x50,0xc2] ; AVX10-X64-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0] ; AVX10-X64-NEXT: retq # encoding: [0xc3] - %x2 = load <8 x i32>, ptr %x2p - %1 = call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) - %2 = call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4) + %x2 = load <32 x i8>, ptr %x2p + %1 = call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2) + %2 = call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x4) %res = add <8 x i32> %1, %2 ret <8 x i32> %res } -declare <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32>, <32 x i8>, <32 x i8>) -define <8 x i32>@test_int_x86_avx2_vpdpbsuds_256(<8 x i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4) { +define <8 x i32>@test_int_x86_avx2_vpdpbsuds_256(<8 x i32> %x0, <32 x i8> %x1, ptr %x2p, <32 x i8> %x4) { ; X86-LABEL: test_int_x86_avx2_vpdpbsuds_256: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] @@ -342,16 +342,16 @@ define <8 x i32>@test_int_x86_avx2_vpdpbsuds_256(<8 x i32> %x0, <8 x i32> %x1, p ; AVX10-X64-NEXT: vpdpbsuds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x76,0x28,0x51,0xc2] ; AVX10-X64-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0] ; AVX10-X64-NEXT: retq # encoding: [0xc3] - %x2 = load <8 x i32>, ptr %x2p - %1 = call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) - %2 = call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4) + %x2 = load <32 x i8>, ptr %x2p + %1 = call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2) + %2 = call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x4) %res = add <8 x i32> %1, %2 ret <8 x i32> %res } -declare <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32>, <16 x i8>, <16 x i8>) -define <4 x i32>@test_int_x86_avx2_vpdpbuud_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x i32> %x4) { +define <4 x i32>@test_int_x86_avx2_vpdpbuud_128(<4 x i32> %x0, <16 x i8> %x1, ptr %x2p, <16 x i8> %x4) { ; X86-LABEL: test_int_x86_avx2_vpdpbuud_128: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 
# encoding: [0x8b,0x44,0x24,0x04] @@ -385,16 +385,16 @@ define <4 x i32>@test_int_x86_avx2_vpdpbuud_128(<4 x i32> %x0, <4 x i32> %x1, pt ; AVX10-X64-NEXT: vpdpbuud %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x74,0x08,0x50,0xc2] ; AVX10-X64-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0] ; AVX10-X64-NEXT: retq # encoding: [0xc3] - %x2 = load <4 x i32>, ptr %x2p - %1 = call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) - %2 = call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4) + %x2 = load <16 x i8>, ptr %x2p + %1 = call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2) + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x4) %res = add <4 x i32> %1, %2 ret <4 x i32> %res } -declare <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32>, <16 x i8>, <16 x i8>) -define <4 x i32>@test_int_x86_avx2_vpdpbuuds_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x i32> %x4) { +define <4 x i32>@test_int_x86_avx2_vpdpbuuds_128(<4 x i32> %x0, <16 x i8> %x1, ptr %x2p, <16 x i8> %x4) { ; X86-LABEL: test_int_x86_avx2_vpdpbuuds_128: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] @@ -428,16 +428,16 @@ define <4 x i32>@test_int_x86_avx2_vpdpbuuds_128(<4 x i32> %x0, <4 x i32> %x1, p ; AVX10-X64-NEXT: vpdpbuuds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x74,0x08,0x51,0xc2] ; AVX10-X64-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0] ; AVX10-X64-NEXT: retq # encoding: [0xc3] - %x2 = load <4 x i32>, ptr %x2p - %1 = call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) - %2 = call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4) + %x2 = load <16 x i8>, ptr %x2p + %1 = call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2) + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x4) %res = add <4 x i32> %1, %2 ret <4 x i32> %res } -declare <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32>, <32 x i8>, <32 x i8>) -define <8 x i32>@test_int_x86_avx2_vpdpbuud_256(<8 x i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4) { +define <8 x i32>@test_int_x86_avx2_vpdpbuud_256(<8 x i32> %x0, <32 x i8> %x1, ptr %x2p, <32 x i8> %x4) { ; X86-LABEL: test_int_x86_avx2_vpdpbuud_256: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] @@ -471,16 +471,16 @@ define <8 x i32>@test_int_x86_avx2_vpdpbuud_256(<8 x i32> %x0, <8 x i32> %x1, pt ; AVX10-X64-NEXT: vpdpbuud %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x74,0x28,0x50,0xc2] ; AVX10-X64-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0] ; AVX10-X64-NEXT: retq # encoding: [0xc3] - %x2 = load <8 x i32>, ptr %x2p - %1 = call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) - %2 = call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4) + %x2 = load <32 x i8>, ptr %x2p + %1 = call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2) + %2 = call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x4) %res = add <8 x i32> %1, %2 ret <8 x i32> 
%res
 }
-declare <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32>, <32 x i8>, <32 x i8>)
-define <8 x i32>@test_int_x86_avx2_vpdpbuuds_256(<8 x i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4) {
+define <8 x i32>@test_int_x86_avx2_vpdpbuuds_256(<8 x i32> %x0, <32 x i8> %x1, ptr %x2p, <32 x i8> %x4) {
 ; X86-LABEL: test_int_x86_avx2_vpdpbuuds_256:
 ; X86: # %bb.0:
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
@@ -514,9 +514,9 @@ define <8 x i32>@test_int_x86_avx2_vpdpbuuds_256(<8 x i32> %x0, <8 x i32> %x1, p
 ; AVX10-X64-NEXT: vpdpbuuds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x74,0x28,0x51,0xc2]
 ; AVX10-X64-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0]
 ; AVX10-X64-NEXT: retq # encoding: [0xc3]
- %x2 = load <8 x i32>, ptr %x2p
- %1 = call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
- %2 = call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4)
+ %x2 = load <32 x i8>, ptr %x2p
+ %1 = call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2)
+ %2 = call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x4)
 %res = add <8 x i32> %1, %2
 ret <8 x i32> %res
 }
diff --git a/llvm/test/CodeGen/X86/stack-folding-int-avxvnniint8.ll b/llvm/test/CodeGen/X86/stack-folding-int-avxvnniint8.ll
index fd988f7..a49d3a5 100644
--- a/llvm/test/CodeGen/X86/stack-folding-int-avxvnniint8.ll
+++ b/llvm/test/CodeGen/X86/stack-folding-int-avxvnniint8.ll
@@ -1,20 +1,20 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avxvnniint8 < %s | FileCheck %s
-declare <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32>, <4 x i32>, <4 x i32>)
-declare <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32>, <4 x i32>, <4 x i32>)
-declare <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32>, <8 x i32>, <8 x i32>)
-declare <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32>, <8 x i32>, <8 x i32>)
-declare <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32>, <4 x i32>, <4 x i32>)
-declare <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32>, <4 x i32>, <4 x i32>)
-declare <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32>, <8 x i32>, <8 x i32>)
-declare <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32>, <8 x i32>, <8 x i32>)
-declare <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32>, <4 x i32>, <4 x i32>)
-declare <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32>, <4 x i32>, <4 x i32>)
-declare <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32>, <8 x i32>, <8 x i32>)
-declare <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32>, <16 x i8>, <16 x i8>)
+declare <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32>, <16 x i8>, <16 x i8>)
+declare <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32>, <32 x i8>, <32 x i8>)
+declare <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32>, <32 x i8>, <32 x i8>)
+declare <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32>, <16 x i8>, <16 x i8>)
+declare <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32>, <16 x i8>, <16 x i8>)
+declare <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32>, <32 x i8>, <32 x i8>)
+declare <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32>, <32 x i8>, <32 x i8>)
+declare <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32>, <16 x i8>, <16 x i8>)
+declare <4 x i32>
@llvm.x86.avx2.vpdpbuuds.128(<4 x i32>, <16 x i8>, <16 x i8>) +declare <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32>, <32 x i8>, <32 x i8>) +declare <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32>, <32 x i8>, <32 x i8>) -define <4 x i32> @stack_fold_vpdpbssd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { +define <4 x i32> @stack_fold_vpdpbssd(<4 x i32> %a0, <16 x i8> %a1, <16 x i8> %a2) { ; CHECK-LABEL: stack_fold_vpdpbssd: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -24,11 +24,11 @@ define <4 x i32> @stack_fold_vpdpbssd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a ; CHECK-NEXT: vpdpbssd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> %a0, <16 x i8> %a1, <16 x i8> %a2) ret <4 x i32> %2 } -define <4 x i32> @stack_fold_vpdpbssd_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { +define <4 x i32> @stack_fold_vpdpbssd_commuted(<4 x i32> %a0, <16 x i8> %a1, <16 x i8> %a2) { ; CHECK-LABEL: stack_fold_vpdpbssd_commuted: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -38,11 +38,11 @@ define <4 x i32> @stack_fold_vpdpbssd_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 ; CHECK-NEXT: vpdpbssd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1) + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> %a0, <16 x i8> %a2, <16 x i8> %a1) ret <4 x i32> %2 } -define <8 x i32> @stack_fold_vpdpbssd_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { +define <8 x i32> @stack_fold_vpdpbssd_256(<8 x i32> %a0, <32 x i8> %a1, <32 x i8> %a2) { ; CHECK-LABEL: stack_fold_vpdpbssd_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -52,11 +52,11 @@ define <8 x i32> @stack_fold_vpdpbssd_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32 ; CHECK-NEXT: vpdpbssd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) + %2 = call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> %a0, <32 x i8> %a1, <32 x i8> %a2) ret <8 x i32> %2 } -define <8 x i32> @stack_fold_vpdpbssd_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { +define <8 x i32> @stack_fold_vpdpbssd_256_commuted(<8 x i32> %a0, <32 x i8> %a1, <32 x i8> %a2) { ; CHECK-LABEL: stack_fold_vpdpbssd_256_commuted: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -66,11 +66,11 @@ define <8 x i32> @stack_fold_vpdpbssd_256_commuted(<8 x i32> %a0, <8 x i32> %a1, ; CHECK-NEXT: vpdpbssd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1) + %2 = call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> %a0, <32 x i8> %a2, <32 x i8> %a1) ret <8 x i32> %2 } -define <4 x i32> @stack_fold_vpdpbssds(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { +define <4 x i32> @stack_fold_vpdpbssds(<4 x i32> %a0, <16 x i8> %a1, <16 x i8> %a2) { ; CHECK-LABEL: stack_fold_vpdpbssds: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -80,11 +80,11 @@ define <4 x i32> @stack_fold_vpdpbssds(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> % ; CHECK-NEXT: vpdpbssds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> %a0, <16 x i8> %a1, <16 x i8> %a2) ret <4 x i32> %2 } -define <4 x i32> @stack_fold_vpdpbssds_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { +define <4 x i32> @stack_fold_vpdpbssds_commuted(<4 x i32> %a0, <16 x i8> %a1, <16 x i8> %a2) { ; CHECK-LABEL: stack_fold_vpdpbssds_commuted: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -94,11 +94,11 @@ define <4 x i32> @stack_fold_vpdpbssds_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 ; CHECK-NEXT: vpdpbssds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1) + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> %a0, <16 x i8> %a2, <16 x i8> %a1) ret <4 x i32> %2 } -define <8 x i32> @stack_fold_vpdpbssds_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { +define <8 x i32> @stack_fold_vpdpbssds_256(<8 x i32> %a0, <32 x i8> %a1, <32 x i8> %a2) { ; CHECK-LABEL: stack_fold_vpdpbssds_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -108,11 +108,11 @@ define <8 x i32> @stack_fold_vpdpbssds_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i3 ; CHECK-NEXT: vpdpbssds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) + %2 = call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> %a0, <32 x i8> %a1, <32 x i8> %a2) ret <8 x i32> %2 } -define <8 x i32> @stack_fold_vpdpbssds_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { +define <8 x i32> @stack_fold_vpdpbssds_256_commuted(<8 x i32> %a0, <32 x i8> %a1, <32 x i8> %a2) { ; CHECK-LABEL: stack_fold_vpdpbssds_256_commuted: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -122,11 +122,11 @@ define <8 x i32> @stack_fold_vpdpbssds_256_commuted(<8 x i32> %a0, <8 x i32> %a1 ; CHECK-NEXT: vpdpbssds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 
32-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1) + %2 = call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> %a0, <32 x i8> %a2, <32 x i8> %a1) ret <8 x i32> %2 } -define <4 x i32> @stack_fold_vpdpbsud(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { +define <4 x i32> @stack_fold_vpdpbsud(<4 x i32> %a0, <16 x i8> %a1, <16 x i8> %a2) { ; CHECK-LABEL: stack_fold_vpdpbsud: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -136,11 +136,11 @@ define <4 x i32> @stack_fold_vpdpbsud(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a ; CHECK-NEXT: vpdpbsud {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> %a0, <16 x i8> %a1, <16 x i8> %a2) ret <4 x i32> %2 } -define <4 x i32> @stack_fold_vpdpbsud_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { +define <4 x i32> @stack_fold_vpdpbsud_commuted(<4 x i32> %a0, <16 x i8> %a1, <16 x i8> %a2) { ; CHECK-LABEL: stack_fold_vpdpbsud_commuted: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -151,11 +151,11 @@ define <4 x i32> @stack_fold_vpdpbsud_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 ; CHECK-NEXT: vpdpbsud %xmm1, %xmm2, %xmm0 ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1) + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> %a0, <16 x i8> %a2, <16 x i8> %a1) ret <4 x i32> %2 } -define <8 x i32> @stack_fold_vpdpbsud_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { +define <8 x i32> @stack_fold_vpdpbsud_256(<8 x i32> %a0, <32 x i8> %a1, <32 x i8> %a2) { ; CHECK-LABEL: stack_fold_vpdpbsud_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -165,11 +165,11 @@ define <8 x i32> @stack_fold_vpdpbsud_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32 ; CHECK-NEXT: vpdpbsud {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) + %2 = call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> %a0, <32 x i8> %a1, <32 x i8> %a2) ret <8 x i32> %2 } -define <8 x i32> @stack_fold_vpdpbsud_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { +define <8 x i32> @stack_fold_vpdpbsud_256_commuted(<8 x i32> %a0, <32 x i8> %a1, <32 x i8> %a2) { ; CHECK-LABEL: stack_fold_vpdpbsud_256_commuted: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -180,11 +180,11 @@ define <8 x i32> @stack_fold_vpdpbsud_256_commuted(<8 x i32> %a0, <8 x i32> %a1, ; CHECK-NEXT: vpdpbsud %ymm1, %ymm2, 
%ymm0 ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1) + %2 = call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> %a0, <32 x i8> %a2, <32 x i8> %a1) ret <8 x i32> %2 } -define <4 x i32> @stack_fold_vpdpbsuds(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { +define <4 x i32> @stack_fold_vpdpbsuds(<4 x i32> %a0, <16 x i8> %a1, <16 x i8> %a2) { ; CHECK-LABEL: stack_fold_vpdpbsuds: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -194,11 +194,11 @@ define <4 x i32> @stack_fold_vpdpbsuds(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> % ; CHECK-NEXT: vpdpbsuds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> %a0, <16 x i8> %a1, <16 x i8> %a2) ret <4 x i32> %2 } -define <4 x i32> @stack_fold_vpdpbsuds_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { +define <4 x i32> @stack_fold_vpdpbsuds_commuted(<4 x i32> %a0, <16 x i8> %a1, <16 x i8> %a2) { ; CHECK-LABEL: stack_fold_vpdpbsuds_commuted: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -209,11 +209,11 @@ define <4 x i32> @stack_fold_vpdpbsuds_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 ; CHECK-NEXT: vpdpbsuds %xmm1, %xmm2, %xmm0 ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1) + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> %a0, <16 x i8> %a2, <16 x i8> %a1) ret <4 x i32> %2 } -define <8 x i32> @stack_fold_vpdpbsuds_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { +define <8 x i32> @stack_fold_vpdpbsuds_256(<8 x i32> %a0, <32 x i8> %a1, <32 x i8> %a2) { ; CHECK-LABEL: stack_fold_vpdpbsuds_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -223,11 +223,11 @@ define <8 x i32> @stack_fold_vpdpbsuds_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i3 ; CHECK-NEXT: vpdpbsuds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) + %2 = call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> %a0, <32 x i8> %a1, <32 x i8> %a2) ret <8 x i32> %2 } -define <8 x i32> @stack_fold_vpdpbsuds_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { +define <8 x i32> @stack_fold_vpdpbsuds_256_commuted(<8 x i32> %a0, <32 x i8> %a1, <32 x i8> %a2) { ; CHECK-LABEL: stack_fold_vpdpbsuds_256_commuted: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -238,11 +238,11 @@ define <8 x i32> @stack_fold_vpdpbsuds_256_commuted(<8 x i32> %a0, <8 x i32> %a1 ; CHECK-NEXT: vpdpbsuds %ymm1, 
%ymm2, %ymm0 ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1) + %2 = call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> %a0, <32 x i8> %a2, <32 x i8> %a1) ret <8 x i32> %2 } -define <4 x i32> @stack_fold_vpdpbuud(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { +define <4 x i32> @stack_fold_vpdpbuud(<4 x i32> %a0, <16 x i8> %a1, <16 x i8> %a2) { ; CHECK-LABEL: stack_fold_vpdpbuud: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -252,11 +252,11 @@ define <4 x i32> @stack_fold_vpdpbuud(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a ; CHECK-NEXT: vpdpbuud {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> %a0, <16 x i8> %a1, <16 x i8> %a2) ret <4 x i32> %2 } -define <4 x i32> @stack_fold_vpdpbuud_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { +define <4 x i32> @stack_fold_vpdpbuud_commuted(<4 x i32> %a0, <16 x i8> %a1, <16 x i8> %a2) { ; CHECK-LABEL: stack_fold_vpdpbuud_commuted: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -266,11 +266,11 @@ define <4 x i32> @stack_fold_vpdpbuud_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 ; CHECK-NEXT: vpdpbuud {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1) + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> %a0, <16 x i8> %a2, <16 x i8> %a1) ret <4 x i32> %2 } -define <8 x i32> @stack_fold_vpdpbuud_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { +define <8 x i32> @stack_fold_vpdpbuud_256(<8 x i32> %a0, <32 x i8> %a1, <32 x i8> %a2) { ; CHECK-LABEL: stack_fold_vpdpbuud_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -280,11 +280,11 @@ define <8 x i32> @stack_fold_vpdpbuud_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32 ; CHECK-NEXT: vpdpbuud {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) + %2 = call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> %a0, <32 x i8> %a1, <32 x i8> %a2) ret <8 x i32> %2 } -define <8 x i32> @stack_fold_vpdpbuud_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { +define <8 x i32> @stack_fold_vpdpbuud_256_commuted(<8 x i32> %a0, <32 x i8> %a1, <32 x i8> %a2) { ; CHECK-LABEL: stack_fold_vpdpbuud_256_commuted: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -294,11 +294,11 @@ define <8 x i32> @stack_fold_vpdpbuud_256_commuted(<8 x i32> %a0, <8 x i32> %a1, ; 
CHECK-NEXT: vpdpbuud {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1) + %2 = call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> %a0, <32 x i8> %a2, <32 x i8> %a1) ret <8 x i32> %2 } -define <4 x i32> @stack_fold_vpdpbuuds(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { +define <4 x i32> @stack_fold_vpdpbuuds(<4 x i32> %a0, <16 x i8> %a1, <16 x i8> %a2) { ; CHECK-LABEL: stack_fold_vpdpbuuds: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -308,11 +308,11 @@ define <4 x i32> @stack_fold_vpdpbuuds(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> % ; CHECK-NEXT: vpdpbuuds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> %a0, <16 x i8> %a1, <16 x i8> %a2) ret <4 x i32> %2 } -define <4 x i32> @stack_fold_vpdpbuuds_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { +define <4 x i32> @stack_fold_vpdpbuuds_commuted(<4 x i32> %a0, <16 x i8> %a1, <16 x i8> %a2) { ; CHECK-LABEL: stack_fold_vpdpbuuds_commuted: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -322,11 +322,11 @@ define <4 x i32> @stack_fold_vpdpbuuds_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 ; CHECK-NEXT: vpdpbuuds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1) + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> %a0, <16 x i8> %a2, <16 x i8> %a1) ret <4 x i32> %2 } -define <8 x i32> @stack_fold_vpdpbuuds_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { +define <8 x i32> @stack_fold_vpdpbuuds_256(<8 x i32> %a0, <32 x i8> %a1, <32 x i8> %a2) { ; CHECK-LABEL: stack_fold_vpdpbuuds_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -336,11 +336,11 @@ define <8 x i32> @stack_fold_vpdpbuuds_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i3 ; CHECK-NEXT: vpdpbuuds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) + %2 = call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> %a0, <32 x i8> %a1, <32 x i8> %a2) ret <8 x i32> %2 } -define <8 x i32> @stack_fold_vpdpbuuds_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { +define <8 x i32> @stack_fold_vpdpbuuds_256_commuted(<8 x i32> %a0, <32 x i8> %a1, <32 x i8> %a2) { ; CHECK-LABEL: stack_fold_vpdpbuuds_256_commuted: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -350,6 +350,6 
@@ define <8 x i32> @stack_fold_vpdpbuuds_256_commuted(<8 x i32> %a0, <8 x i32> %a1 ; CHECK-NEXT: vpdpbuuds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1) + %2 = call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> %a0, <32 x i8> %a2, <32 x i8> %a1) ret <8 x i32> %2 } diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2_512ni-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2_512ni-intrinsics.ll index 93006ae..991467e 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2_512ni-intrinsics.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2_512ni-intrinsics.ll @@ -124,11 +124,11 @@ define <16 x float> @test_mm512_maskz_dpph_ps(i16 zeroext %__U, <16 x float> %__ declare <16 x float> @llvm.x86.avx10.vdpphps.512(<16 x float>, <32 x half>, <32 x half>) -define <16 x i32> @test_mm512_dpbssd_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr %pB) sanitize_memory { +define <16 x i32> @test_mm512_dpbssd_epi32(<16 x i32> %__W, <64 x i8> %__A, ptr %pB) sanitize_memory { ; CHECK-LABEL: define <16 x i32> @test_mm512_dpbssd_epi32( -; CHECK-SAME: <16 x i32> [[__W:%.*]], <16 x i32> [[__A:%.*]], ptr [[PB:%.*]]) #[[ATTR0]] { +; CHECK-SAME: <16 x i32> [[__W:%.*]], <64 x i8> [[__A:%.*]], ptr [[PB:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP12:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 @@ -137,22 +137,18 @@ define <16 x i32> @test_mm512_dpbssd_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] ; CHECK-NEXT: unreachable ; CHECK: [[BB5]]: -; CHECK-NEXT: [[__B:%.*]] = load <16 x i32>, ptr [[PB]], align 64 +; CHECK-NEXT: [[TMP10:%.*]] = load <64 x i8>, ptr [[PB]], align 64 ; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PB]] to i64 ; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 ; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[__A]] to <64 x i8> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[__B]] to <64 x i8> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[TMP3]] to <64 x i8> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[_MSLD]] to <64 x i8> -; CHECK-NEXT: [[TMP13:%.*]] = icmp ne <64 x i8> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = load <64 x i8>, ptr [[TMP8]], align 64 ; CHECK-NEXT: [[TMP14:%.*]] = icmp ne <64 x i8> [[TMP12]], zeroinitializer ; CHECK-NEXT: [[TMP15:%.*]] = icmp ne <64 x i8> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <64 x i8> [[__A]], zeroinitializer ; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <64 x i8> [[TMP10]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = and <64 x i1> [[TMP13]], [[TMP14]] -; 
CHECK-NEXT: [[TMP18:%.*]] = and <64 x i1> [[TMP15]], [[TMP14]] -; CHECK-NEXT: [[TMP19:%.*]] = and <64 x i1> [[TMP13]], [[TMP16]] +; CHECK-NEXT: [[TMP17:%.*]] = and <64 x i1> [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = and <64 x i1> [[TMP11]], [[TMP15]] +; CHECK-NEXT: [[TMP19:%.*]] = and <64 x i1> [[TMP14]], [[TMP16]] ; CHECK-NEXT: [[TMP20:%.*]] = or <64 x i1> [[TMP17]], [[TMP18]] ; CHECK-NEXT: [[TMP21:%.*]] = or <64 x i1> [[TMP20]], [[TMP19]] ; CHECK-NEXT: [[TMP22:%.*]] = sext <64 x i1> [[TMP21]] to <64 x i8> @@ -160,34 +156,30 @@ define <16 x i32> @test_mm512_dpbssd_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr ; CHECK-NEXT: [[TMP24:%.*]] = icmp ne <16 x i32> [[TMP23]], zeroinitializer ; CHECK-NEXT: [[TMP27:%.*]] = sext <16 x i1> [[TMP24]] to <16 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP27]], [[TMP4]] -; CHECK-NEXT: [[RES:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpbssd.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]]) +; CHECK-NEXT: [[RES:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpbssd.512(<16 x i32> [[__W]], <64 x i8> [[__A]], <64 x i8> [[TMP10]]) ; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i32> [[RES]] ; - %__B = load <16 x i32>, ptr %pB - %res = tail call <16 x i32> @llvm.x86.avx10.vpdpbssd.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %__B = load <64 x i8>, ptr %pB + %res = tail call <16 x i32> @llvm.x86.avx10.vpdpbssd.512(<16 x i32> %__W, <64 x i8> %__A, <64 x i8> %__B) ret <16 x i32> %res } -define <16 x i32> @test_mm512_mask_dpbssds_epi32(<16 x i32> %__W, i16 zeroext %__U, <16 x i32> %__A, <16 x i32> %__B) sanitize_memory { +define <16 x i32> @test_mm512_mask_dpbssds_epi32(<16 x i32> %__W, i16 zeroext %__U, <64 x i8> %__A, <64 x i8> %__B) sanitize_memory { ; CHECK-LABEL: define <16 x i32> @test_mm512_mask_dpbssds_epi32( -; CHECK-SAME: <16 x i32> [[__W:%.*]], i16 zeroext [[__U:%.*]], <16 x i32> [[__A:%.*]], <16 x i32> [[__B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-SAME: <16 x i32> [[__W:%.*]], i16 zeroext [[__U:%.*]], <64 x i8> [[__A:%.*]], <64 x i8> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP24:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP25:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP24:%.*]] = bitcast <16 x i32> [[__A]] to <64 x i8> -; CHECK-NEXT: [[TMP25:%.*]] = bitcast <16 x i32> [[__B]] to <64 x i8> -; CHECK-NEXT: [[TMP26:%.*]] = bitcast <16 x i32> [[TMP2]] to <64 x i8> -; CHECK-NEXT: [[TMP27:%.*]] = bitcast <16 x i32> [[TMP3]] to <64 x i8> -; CHECK-NEXT: [[TMP28:%.*]] = icmp ne <64 x i8> [[TMP26]], zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <64 x i8> [[TMP27]], zeroinitializer ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <64 x i8> [[TMP24]], zeroinitializer ; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <64 x i8> [[TMP25]], zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] 
= and <64 x i1> [[TMP28]], [[TMP10]] -; CHECK-NEXT: [[TMP14:%.*]] = and <64 x i1> [[TMP11]], [[TMP10]] -; CHECK-NEXT: [[TMP15:%.*]] = and <64 x i1> [[TMP28]], [[TMP12]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <64 x i8> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <64 x i8> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = and <64 x i1> [[TMP11]], [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = and <64 x i1> [[TMP10]], [[TMP12]] +; CHECK-NEXT: [[TMP15:%.*]] = and <64 x i1> [[TMP11]], [[TMP21]] ; CHECK-NEXT: [[TMP16:%.*]] = or <64 x i1> [[TMP13]], [[TMP14]] ; CHECK-NEXT: [[TMP17:%.*]] = or <64 x i1> [[TMP16]], [[TMP15]] ; CHECK-NEXT: [[TMP18:%.*]] = sext <64 x i1> [[TMP17]] to <64 x i8> @@ -195,7 +187,7 @@ define <16 x i32> @test_mm512_mask_dpbssds_epi32(<16 x i32> %__W, i16 zeroext %_ ; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <16 x i32> [[TMP19]], zeroinitializer ; CHECK-NEXT: [[TMP23:%.*]] = sext <16 x i1> [[TMP20]] to <16 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP23]], [[TMP1]] -; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpbssds.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]]) +; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpbssds.512(<16 x i32> [[__W]], <64 x i8> [[__A]], <64 x i8> [[__B]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> ; CHECK-NEXT: [[BST:%.*]] = bitcast i16 [[__U]] to <16 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP1]] @@ -207,31 +199,27 @@ define <16 x i32> @test_mm512_mask_dpbssds_epi32(<16 x i32> %__W, i16 zeroext %_ ; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i32> [[RES]] ; - %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpbssds.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpbssds.512(<16 x i32> %__W, <64 x i8> %__A, <64 x i8> %__B) %bst = bitcast i16 %__U to <16 x i1> %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> %__W ret <16 x i32> %res } -define <16 x i32> @test_mm512_maskz_dpbssd_epi32(i16 zeroext %__U, <16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) sanitize_memory { +define <16 x i32> @test_mm512_maskz_dpbssd_epi32(i16 zeroext %__U, <16 x i32> %__W, <64 x i8> %__A, <64 x i8> %__B) sanitize_memory { ; CHECK-LABEL: define <16 x i32> @test_mm512_maskz_dpbssd_epi32( -; CHECK-SAME: i16 zeroext [[__U:%.*]], <16 x i32> [[__W:%.*]], <16 x i32> [[__A:%.*]], <16 x i32> [[__B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-SAME: i16 zeroext [[__U:%.*]], <16 x i32> [[__W:%.*]], <64 x i8> [[__A:%.*]], <64 x i8> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP25:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP26:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 ; CHECK-NEXT: [[TMP24:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP25:%.*]] = bitcast <16 x i32> [[__A]] to 
<64 x i8> -; CHECK-NEXT: [[TMP26:%.*]] = bitcast <16 x i32> [[__B]] to <64 x i8> -; CHECK-NEXT: [[TMP27:%.*]] = bitcast <16 x i32> [[TMP2]] to <64 x i8> -; CHECK-NEXT: [[TMP28:%.*]] = bitcast <16 x i32> [[TMP3]] to <64 x i8> -; CHECK-NEXT: [[TMP29:%.*]] = icmp ne <64 x i8> [[TMP27]], zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <64 x i8> [[TMP28]], zeroinitializer ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <64 x i8> [[TMP25]], zeroinitializer ; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <64 x i8> [[TMP26]], zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = and <64 x i1> [[TMP29]], [[TMP10]] -; CHECK-NEXT: [[TMP14:%.*]] = and <64 x i1> [[TMP11]], [[TMP10]] -; CHECK-NEXT: [[TMP15:%.*]] = and <64 x i1> [[TMP29]], [[TMP12]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <64 x i8> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <64 x i8> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = and <64 x i1> [[TMP11]], [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = and <64 x i1> [[TMP10]], [[TMP12]] +; CHECK-NEXT: [[TMP15:%.*]] = and <64 x i1> [[TMP11]], [[TMP21]] ; CHECK-NEXT: [[TMP16:%.*]] = or <64 x i1> [[TMP13]], [[TMP14]] ; CHECK-NEXT: [[TMP17:%.*]] = or <64 x i1> [[TMP16]], [[TMP15]] ; CHECK-NEXT: [[TMP18:%.*]] = sext <64 x i1> [[TMP17]] to <64 x i8> @@ -239,7 +227,7 @@ define <16 x i32> @test_mm512_maskz_dpbssd_epi32(i16 zeroext %__U, <16 x i32> %_ ; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <16 x i32> [[TMP19]], zeroinitializer ; CHECK-NEXT: [[TMP23:%.*]] = sext <16 x i1> [[TMP20]] to <16 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP23]], [[TMP24]] -; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpbssd.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]]) +; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpbssd.512(<16 x i32> [[__W]], <64 x i8> [[__A]], <64 x i8> [[__B]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> ; CHECK-NEXT: [[BST:%.*]] = bitcast i16 [[__U]] to <16 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer @@ -251,21 +239,21 @@ define <16 x i32> @test_mm512_maskz_dpbssd_epi32(i16 zeroext %__U, <16 x i32> %_ ; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i32> [[RES]] ; - %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpbssd.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpbssd.512(<16 x i32> %__W, <64 x i8> %__A, <64 x i8> %__B) %bst = bitcast i16 %__U to <16 x i1> %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> zeroinitializer ret <16 x i32> %res } -declare <16 x i32> @llvm.x86.avx10.vpdpbssd.512(<16 x i32>, <16 x i32>, <16 x i32>) -declare <16 x i32> @llvm.x86.avx10.vpdpbssds.512(<16 x i32>, <16 x i32>, <16 x i32>) +declare <16 x i32> @llvm.x86.avx10.vpdpbssd.512(<16 x i32>, <64 x i8>, <64 x i8>) +declare <16 x i32> @llvm.x86.avx10.vpdpbssds.512(<16 x i32>, <64 x i8>, <64 x i8>) -define <16 x i32> @test_mm512_dpbsud_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr %pB) sanitize_memory { +define <16 x i32> @test_mm512_dpbsud_epi32(<16 x i32> %__W, <64 x i8> %__A, ptr %pB) sanitize_memory { ; CHECK-LABEL: define <16 x i32> @test_mm512_dpbsud_epi32( -; CHECK-SAME: <16 x i32> [[__W:%.*]], <16 x i32> [[__A:%.*]], ptr [[PB:%.*]]) #[[ATTR0]] { +; CHECK-SAME: <16 x i32> [[__W:%.*]], <64 x i8> [[__A:%.*]], ptr [[PB:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to 
i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] @@ -273,87 +261,123 @@ define <16 x i32> @test_mm512_dpbsud_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] ; CHECK-NEXT: unreachable ; CHECK: [[BB5]]: -; CHECK-NEXT: [[__B:%.*]] = load <16 x i32>, ptr [[PB]], align 64 +; CHECK-NEXT: [[__B:%.*]] = load <64 x i8>, ptr [[PB]], align 64 ; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PB]] to i64 ; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 ; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]] -; CHECK-NEXT: [[RES:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpbsud.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]]) -; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: [[_MSLD:%.*]] = load <64 x i8>, ptr [[TMP8]], align 64 +; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <64 x i8> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <64 x i8> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <64 x i8> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <64 x i8> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = and <64 x i1> [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[TMP14:%.*]] = and <64 x i1> [[TMP11]], [[TMP10]] +; CHECK-NEXT: [[TMP15:%.*]] = and <64 x i1> [[TMP9]], [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = or <64 x i1> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP17:%.*]] = or <64 x i1> [[TMP16]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = sext <64 x i1> [[TMP17]] to <64 x i8> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <64 x i8> [[TMP18]] to <16 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <16 x i32> [[TMP19]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = sext <16 x i1> [[TMP20]] to <16 x i32> +; CHECK-NEXT: [[TMP22:%.*]] = or <16 x i32> [[TMP21]], [[TMP4]] +; CHECK-NEXT: [[RES:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpbsud.512(<16 x i32> [[__W]], <64 x i8> [[__A]], <64 x i8> [[__B]]) +; CHECK-NEXT: store <16 x i32> [[TMP22]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i32> [[RES]] ; - %__B = load <16 x i32>, ptr %pB - %res = tail call <16 x i32> @llvm.x86.avx10.vpdpbsud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %__B = load <64 x i8>, ptr %pB + %res = tail call <16 x i32> @llvm.x86.avx10.vpdpbsud.512(<16 x i32> %__W, <64 x i8> %__A, <64 x i8> %__B) ret <16 x i32> %res } -define <16 x i32> @test_mm512_mask_dpbsuds_epi32(<16 x i32> %__W, i16 zeroext %__U, <16 x i32> %__A, <16 x i32> %__B) sanitize_memory { +define <16 x i32> @test_mm512_mask_dpbsuds_epi32(<16 x i32> %__W, i16 zeroext %__U, <64 x i8> %__A, <64 x i8> %__B) sanitize_memory { ; CHECK-LABEL: define <16 x i32> @test_mm512_mask_dpbsuds_epi32( -; CHECK-SAME: 
<16 x i32> [[__W:%.*]], i16 zeroext [[__U:%.*]], <16 x i32> [[__A:%.*]], <16 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-SAME: <16 x i32> [[__W:%.*]], i16 zeroext [[__U:%.*]], <64 x i8> [[__A:%.*]], <64 x i8> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpbsuds.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]]) +; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <64 x i8> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <64 x i8> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <64 x i8> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = icmp ne <64 x i8> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = and <64 x i1> [[TMP19]], [[TMP20]] +; CHECK-NEXT: [[TMP10:%.*]] = and <64 x i1> [[TMP21]], [[TMP20]] +; CHECK-NEXT: [[TMP11:%.*]] = and <64 x i1> [[TMP19]], [[TMP22]] +; CHECK-NEXT: [[TMP12:%.*]] = or <64 x i1> [[TMP23]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = or <64 x i1> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = sext <64 x i1> [[TMP13]] to <64 x i8> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <64 x i8> [[TMP14]] to <16 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = sext <16 x i1> [[TMP16]] to <16 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = or <16 x i32> [[TMP17]], [[TMP1]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpbsuds.512(<16 x i32> [[__W]], <64 x i8> [[__A]], <64 x i8> [[__B]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> ; CHECK-NEXT: [[BST:%.*]] = bitcast i16 [[__U]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[TMP18]], <16 x i32> [[TMP1]] ; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i32> [[DPI]], [[__W]] -; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP18]] ; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP1]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP9]], <16 x i32> [[TMP6]] ; CHECK-NEXT: [[RES:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[DPI]], <16 x i32> [[__W]] ; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i32> [[RES]] ; - %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpbsuds.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpbsuds.512(<16 x i32> %__W, <64 x i8> %__A, <64 x i8> 
%__B) %bst = bitcast i16 %__U to <16 x i1> %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> %__W ret <16 x i32> %res } -define <16 x i32> @test_mm512_maskz_dpbsud_epi32(i16 zeroext %__U, <16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) sanitize_memory { +define <16 x i32> @test_mm512_maskz_dpbsud_epi32(i16 zeroext %__U, <16 x i32> %__W, <64 x i8> %__A, <64 x i8> %__B) sanitize_memory { ; CHECK-LABEL: define <16 x i32> @test_mm512_maskz_dpbsud_epi32( -; CHECK-SAME: i16 zeroext [[__U:%.*]], <16 x i32> [[__W:%.*]], <16 x i32> [[__A:%.*]], <16 x i32> [[__B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-SAME: i16 zeroext [[__U:%.*]], <16 x i32> [[__W:%.*]], <64 x i8> [[__A:%.*]], <64 x i8> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP19:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpbsud.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]]) +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <64 x i8> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <64 x i8> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = icmp ne <64 x i8> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = icmp ne <64 x i8> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = and <64 x i1> [[TMP20]], [[TMP21]] +; CHECK-NEXT: [[TMP10:%.*]] = and <64 x i1> [[TMP22]], [[TMP21]] +; CHECK-NEXT: [[TMP11:%.*]] = and <64 x i1> [[TMP20]], [[TMP23]] +; CHECK-NEXT: [[TMP12:%.*]] = or <64 x i1> [[TMP24]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = or <64 x i1> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = sext <64 x i1> [[TMP13]] to <64 x i8> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <64 x i8> [[TMP14]] to <16 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = sext <16 x i1> [[TMP16]] to <16 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = or <16 x i32> [[TMP17]], [[TMP19]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpbsud.512(<16 x i32> [[__W]], <64 x i8> [[__A]], <64 x i8> [[__B]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> ; CHECK-NEXT: [[BST:%.*]] = bitcast i16 [[__U]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[TMP18]], <16 x i32> zeroinitializer ; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i32> [[DPI]], zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[_MSPROP1]] +; CHECK-NEXT: 
[[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP18]] ; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], zeroinitializer ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP9]], <16 x i32> [[TMP6]] ; CHECK-NEXT: [[RES:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[DPI]], <16 x i32> zeroinitializer ; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i32> [[RES]] ; - %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpbsud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpbsud.512(<16 x i32> %__W, <64 x i8> %__A, <64 x i8> %__B) %bst = bitcast i16 %__U to <16 x i1> %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> zeroinitializer ret <16 x i32> %res } -declare <16 x i32> @llvm.x86.avx10.vpdpbsud.512(<16 x i32>, <16 x i32>, <16 x i32>) -declare <16 x i32> @llvm.x86.avx10.vpdpbsuds.512(<16 x i32>, <16 x i32>, <16 x i32>) +declare <16 x i32> @llvm.x86.avx10.vpdpbsud.512(<16 x i32>, <64 x i8>, <64 x i8>) +declare <16 x i32> @llvm.x86.avx10.vpdpbsuds.512(<16 x i32>, <64 x i8>, <64 x i8>) -define <16 x i32> @test_mm512_dpbuud_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr %pB) sanitize_memory { +define <16 x i32> @test_mm512_dpbuud_epi32(<16 x i32> %__W, <64 x i8> %__A, ptr %pB) sanitize_memory { ; CHECK-LABEL: define <16 x i32> @test_mm512_dpbuud_epi32( -; CHECK-SAME: <16 x i32> [[__W:%.*]], <16 x i32> [[__A:%.*]], ptr [[PB:%.*]]) #[[ATTR0]] { +; CHECK-SAME: <16 x i32> [[__W:%.*]], <64 x i8> [[__A:%.*]], ptr [[PB:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] @@ -361,80 +385,116 @@ define <16 x i32> @test_mm512_dpbuud_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] ; CHECK-NEXT: unreachable ; CHECK: [[BB5]]: -; CHECK-NEXT: [[__B:%.*]] = load <16 x i32>, ptr [[PB]], align 64 +; CHECK-NEXT: [[__B:%.*]] = load <64 x i8>, ptr [[PB]], align 64 ; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PB]] to i64 ; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 ; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]] -; CHECK-NEXT: [[RES:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpbuud.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]]) -; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: [[_MSLD:%.*]] = load <64 x i8>, ptr [[TMP8]], align 64 +; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <64 x i8> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <64 x i8> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <64 x i8> 
[[__A]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <64 x i8> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = and <64 x i1> [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[TMP14:%.*]] = and <64 x i1> [[TMP11]], [[TMP10]] +; CHECK-NEXT: [[TMP15:%.*]] = and <64 x i1> [[TMP9]], [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = or <64 x i1> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP17:%.*]] = or <64 x i1> [[TMP16]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = sext <64 x i1> [[TMP17]] to <64 x i8> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <64 x i8> [[TMP18]] to <16 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <16 x i32> [[TMP19]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = sext <16 x i1> [[TMP20]] to <16 x i32> +; CHECK-NEXT: [[TMP22:%.*]] = or <16 x i32> [[TMP21]], [[TMP4]] +; CHECK-NEXT: [[RES:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpbuud.512(<16 x i32> [[__W]], <64 x i8> [[__A]], <64 x i8> [[__B]]) +; CHECK-NEXT: store <16 x i32> [[TMP22]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i32> [[RES]] ; - %__B = load <16 x i32>, ptr %pB - %res = tail call <16 x i32> @llvm.x86.avx10.vpdpbuud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %__B = load <64 x i8>, ptr %pB + %res = tail call <16 x i32> @llvm.x86.avx10.vpdpbuud.512(<16 x i32> %__W, <64 x i8> %__A, <64 x i8> %__B) ret <16 x i32> %res } -define <16 x i32> @test_mm512_mask_dpbuuds_epi32(<16 x i32> %__W, i16 zeroext %__U, <16 x i32> %__A, <16 x i32> %__B) sanitize_memory { +define <16 x i32> @test_mm512_mask_dpbuuds_epi32(<16 x i32> %__W, i16 zeroext %__U, <64 x i8> %__A, <64 x i8> %__B) sanitize_memory { ; CHECK-LABEL: define <16 x i32> @test_mm512_mask_dpbuuds_epi32( -; CHECK-SAME: <16 x i32> [[__W:%.*]], i16 zeroext [[__U:%.*]], <16 x i32> [[__A:%.*]], <16 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-SAME: <16 x i32> [[__W:%.*]], i16 zeroext [[__U:%.*]], <64 x i8> [[__A:%.*]], <64 x i8> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpbuuds.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]]) +; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <64 x i8> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <64 x i8> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <64 x i8> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = icmp ne <64 x i8> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = and <64 x i1> [[TMP19]], [[TMP20]] +; CHECK-NEXT: [[TMP10:%.*]] = and <64 x i1> [[TMP21]], [[TMP20]] +; CHECK-NEXT: [[TMP11:%.*]] = and <64 x i1> [[TMP19]], [[TMP22]] +; CHECK-NEXT: [[TMP12:%.*]] = or <64 x i1> 
[[TMP23]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = or <64 x i1> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = sext <64 x i1> [[TMP13]] to <64 x i8> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <64 x i8> [[TMP14]] to <16 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = sext <16 x i1> [[TMP16]] to <16 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = or <16 x i32> [[TMP17]], [[TMP1]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpbuuds.512(<16 x i32> [[__W]], <64 x i8> [[__A]], <64 x i8> [[__B]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> ; CHECK-NEXT: [[BST:%.*]] = bitcast i16 [[__U]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[TMP18]], <16 x i32> [[TMP1]] ; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i32> [[DPI]], [[__W]] -; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP18]] ; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP1]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP9]], <16 x i32> [[TMP6]] ; CHECK-NEXT: [[RES:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[DPI]], <16 x i32> [[__W]] ; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i32> [[RES]] ; - %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpbuuds.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpbuuds.512(<16 x i32> %__W, <64 x i8> %__A, <64 x i8> %__B) %bst = bitcast i16 %__U to <16 x i1> %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> %__W ret <16 x i32> %res } -define <16 x i32> @test_mm512_maskz_dpbuud_epi32(i16 zeroext %__U, <16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) sanitize_memory { +define <16 x i32> @test_mm512_maskz_dpbuud_epi32(i16 zeroext %__U, <16 x i32> %__W, <64 x i8> %__A, <64 x i8> %__B) sanitize_memory { ; CHECK-LABEL: define <16 x i32> @test_mm512_maskz_dpbuud_epi32( -; CHECK-SAME: i16 zeroext [[__U:%.*]], <16 x i32> [[__W:%.*]], <16 x i32> [[__A:%.*]], <16 x i32> [[__B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-SAME: i16 zeroext [[__U:%.*]], <16 x i32> [[__W:%.*]], <64 x i8> [[__A:%.*]], <64 x i8> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP19:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> 
@llvm.x86.avx10.vpdpbuud.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]]) +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <64 x i8> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <64 x i8> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = icmp ne <64 x i8> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = icmp ne <64 x i8> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = and <64 x i1> [[TMP20]], [[TMP21]] +; CHECK-NEXT: [[TMP10:%.*]] = and <64 x i1> [[TMP22]], [[TMP21]] +; CHECK-NEXT: [[TMP11:%.*]] = and <64 x i1> [[TMP20]], [[TMP23]] +; CHECK-NEXT: [[TMP12:%.*]] = or <64 x i1> [[TMP24]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = or <64 x i1> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = sext <64 x i1> [[TMP13]] to <64 x i8> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <64 x i8> [[TMP14]] to <16 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = sext <16 x i1> [[TMP16]] to <16 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = or <16 x i32> [[TMP17]], [[TMP19]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpbuud.512(<16 x i32> [[__W]], <64 x i8> [[__A]], <64 x i8> [[__B]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> ; CHECK-NEXT: [[BST:%.*]] = bitcast i16 [[__U]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[TMP18]], <16 x i32> zeroinitializer ; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i32> [[DPI]], zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP18]] ; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], zeroinitializer ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP9]], <16 x i32> [[TMP6]] ; CHECK-NEXT: [[RES:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[DPI]], <16 x i32> zeroinitializer ; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i32> [[RES]] ; - %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpbuud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpbuud.512(<16 x i32> %__W, <64 x i8> %__A, <64 x i8> %__B) %bst = bitcast i16 %__U to <16 x i1> %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> zeroinitializer ret <16 x i32> %res } -declare <16 x i32> @llvm.x86.avx10.vpdpbuud.512(<16 x i32>, <16 x i32>, <16 x i32>) -declare <16 x i32> @llvm.x86.avx10.vpdpbuuds.512(<16 x i32>, <16 x i32>, <16 x i32>) +declare <16 x i32> @llvm.x86.avx10.vpdpbuud.512(<16 x i32>, <64 x i8>, <64 x i8>) +declare <16 x i32> @llvm.x86.avx10.vpdpbuuds.512(<16 x i32>, <64 x i8>, <64 x i8>) define <16 x i32> @test_mm512_dpwsud_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr %pB) sanitize_memory { diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2ni-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2ni-intrinsics.ll index e121c3b..373eff6 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2ni-intrinsics.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2ni-intrinsics.ll @@ -243,25 +243,21 @@ declare <4 x float> @llvm.x86.avx10.vdpphps.128(<4 x float>, <8 x half>, <8 x ha declare <8 x float> @llvm.x86.avx10.vdpphps.256(<8 x float>, <16 x half>, <16 x half>) -define <4 x i32> @test_mm_mask_dpbssd_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 x i32> 
%__A, <4 x i32> %__B) sanitize_memory { +define <4 x i32> @test_mm_mask_dpbssd_epi32(<4 x i32> %__W, i4 zeroext %__U, <16 x i8> %__A, <16 x i8> %__B) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_mm_mask_dpbssd_epi32( -; CHECK-SAME: <4 x i32> [[__W:%.*]], i4 zeroext [[__U:%.*]], <4 x i32> [[__A:%.*]], <4 x i32> [[__B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-SAME: <4 x i32> [[__W:%.*]], i4 zeroext [[__U:%.*]], <16 x i8> [[__A:%.*]], <16 x i8> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP24:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP25:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i4, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP24:%.*]] = bitcast <4 x i32> [[__A]] to <16 x i8> -; CHECK-NEXT: [[TMP25:%.*]] = bitcast <4 x i32> [[__B]] to <16 x i8> -; CHECK-NEXT: [[TMP26:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8> -; CHECK-NEXT: [[TMP27:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> -; CHECK-NEXT: [[TMP28:%.*]] = icmp ne <16 x i8> [[TMP26]], zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i8> [[TMP27]], zeroinitializer ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <16 x i8> [[TMP24]], zeroinitializer ; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <16 x i8> [[TMP25]], zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = and <16 x i1> [[TMP28]], [[TMP10]] -; CHECK-NEXT: [[TMP14:%.*]] = and <16 x i1> [[TMP11]], [[TMP10]] -; CHECK-NEXT: [[TMP15:%.*]] = and <16 x i1> [[TMP28]], [[TMP12]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i8> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <16 x i8> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = and <16 x i1> [[TMP11]], [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = and <16 x i1> [[TMP10]], [[TMP12]] +; CHECK-NEXT: [[TMP15:%.*]] = and <16 x i1> [[TMP11]], [[TMP21]] ; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i1> [[TMP13]], [[TMP14]] ; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i1> [[TMP16]], [[TMP15]] ; CHECK-NEXT: [[TMP18:%.*]] = sext <16 x i1> [[TMP17]] to <16 x i8> @@ -269,7 +265,7 @@ define <4 x i32> @test_mm_mask_dpbssd_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 ; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <4 x i32> [[TMP19]], zeroinitializer ; CHECK-NEXT: [[TMP23:%.*]] = sext <4 x i1> [[TMP20]] to <4 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP23]], [[TMP1]] -; CHECK-NEXT: [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> [[__W]], <4 x i32> [[__A]], <4 x i32> [[__B]]) +; CHECK-NEXT: [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> [[__W]], <16 x i8> [[__A]], <16 x i8> [[__B]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i4 [[TMP4]] to <4 x i1> ; CHECK-NEXT: [[BST:%.*]] = bitcast i4 [[__U]] to <4 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[BST]], <4 x i32> [[_MSPROP1]], <4 x i32> [[TMP1]] @@ -281,31 +277,27 @@ define <4 x i32> @test_mm_mask_dpbssd_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 ; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr 
@__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RES]] ; - %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) + %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> %__W, <16 x i8> %__A, <16 x i8> %__B) %bst = bitcast i4 %__U to <4 x i1> %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> %__W ret <4 x i32> %res } -define <4 x i32> @test_mm_maskz_dpbssds_epi32(i4 zeroext %__U, <4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) sanitize_memory { +define <4 x i32> @test_mm_maskz_dpbssds_epi32(i4 zeroext %__U, <4 x i32> %__W, <16 x i8> %__A, <16 x i8> %__B) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_mm_maskz_dpbssds_epi32( -; CHECK-SAME: i4 zeroext [[__U:%.*]], <4 x i32> [[__W:%.*]], <4 x i32> [[__A:%.*]], <4 x i32> [[__B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-SAME: i4 zeroext [[__U:%.*]], <4 x i32> [[__W:%.*]], <16 x i8> [[__A:%.*]], <16 x i8> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP25:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP26:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: [[TMP24:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i4, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP25:%.*]] = bitcast <4 x i32> [[__A]] to <16 x i8> -; CHECK-NEXT: [[TMP26:%.*]] = bitcast <4 x i32> [[__B]] to <16 x i8> -; CHECK-NEXT: [[TMP27:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8> -; CHECK-NEXT: [[TMP28:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> -; CHECK-NEXT: [[TMP29:%.*]] = icmp ne <16 x i8> [[TMP27]], zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i8> [[TMP28]], zeroinitializer ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <16 x i8> [[TMP25]], zeroinitializer ; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <16 x i8> [[TMP26]], zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = and <16 x i1> [[TMP29]], [[TMP10]] -; CHECK-NEXT: [[TMP14:%.*]] = and <16 x i1> [[TMP11]], [[TMP10]] -; CHECK-NEXT: [[TMP15:%.*]] = and <16 x i1> [[TMP29]], [[TMP12]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i8> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <16 x i8> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = and <16 x i1> [[TMP11]], [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = and <16 x i1> [[TMP10]], [[TMP12]] +; CHECK-NEXT: [[TMP15:%.*]] = and <16 x i1> [[TMP11]], [[TMP21]] ; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i1> [[TMP13]], [[TMP14]] ; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i1> [[TMP16]], [[TMP15]] ; CHECK-NEXT: [[TMP18:%.*]] = sext <16 x i1> [[TMP17]] to <16 x i8> @@ -313,7 +305,7 @@ define <4 x i32> @test_mm_maskz_dpbssds_epi32(i4 zeroext %__U, <4 x i32> %__W, < ; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <4 x i32> [[TMP19]], zeroinitializer ; CHECK-NEXT: [[TMP23:%.*]] = sext <4 x i1> [[TMP20]] to <4 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP23]], [[TMP24]] -; CHECK-NEXT: [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> [[__W]], <4 x i32> [[__A]], <4 x i32> [[__B]]) +; CHECK-NEXT: 
[[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> [[__W]], <16 x i8> [[__A]], <16 x i8> [[__B]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i4 [[TMP4]] to <4 x i1> ; CHECK-NEXT: [[BST:%.*]] = bitcast i4 [[__U]] to <4 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[BST]], <4 x i32> [[_MSPROP1]], <4 x i32> zeroinitializer @@ -325,31 +317,27 @@ define <4 x i32> @test_mm_maskz_dpbssds_epi32(i4 zeroext %__U, <4 x i32> %__W, < ; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RES]] ; - %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) + %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> %__W, <16 x i8> %__A, <16 x i8> %__B) %bst = bitcast i4 %__U to <4 x i1> %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> zeroinitializer ret <4 x i32> %res } -define <8 x i32> @test_mm256_maskz_dpbssds_epi32(<8 x i32> %__W, i8 zeroext %__U, <8 x i32> %__A, <8 x i32> %__B) sanitize_memory { +define <8 x i32> @test_mm256_maskz_dpbssds_epi32(<8 x i32> %__W, i8 zeroext %__U, <32 x i8> %__A, <32 x i8> %__B) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_mm256_maskz_dpbssds_epi32( -; CHECK-SAME: <8 x i32> [[__W:%.*]], i8 zeroext [[__U:%.*]], <8 x i32> [[__A:%.*]], <8 x i32> [[__B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-SAME: <8 x i32> [[__W:%.*]], i8 zeroext [[__U:%.*]], <32 x i8> [[__A:%.*]], <32 x i8> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP24:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP25:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP24:%.*]] = bitcast <8 x i32> [[__A]] to <32 x i8> -; CHECK-NEXT: [[TMP25:%.*]] = bitcast <8 x i32> [[__B]] to <32 x i8> -; CHECK-NEXT: [[TMP26:%.*]] = bitcast <8 x i32> [[TMP2]] to <32 x i8> -; CHECK-NEXT: [[TMP27:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8> -; CHECK-NEXT: [[TMP28:%.*]] = icmp ne <32 x i8> [[TMP26]], zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <32 x i8> [[TMP27]], zeroinitializer ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <32 x i8> [[TMP24]], zeroinitializer ; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <32 x i8> [[TMP25]], zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = and <32 x i1> [[TMP28]], [[TMP10]] -; CHECK-NEXT: [[TMP14:%.*]] = and <32 x i1> [[TMP11]], [[TMP10]] -; CHECK-NEXT: [[TMP15:%.*]] = and <32 x i1> [[TMP28]], [[TMP12]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <32 x i8> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <32 x i8> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = and <32 x i1> [[TMP11]], [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = and <32 x i1> [[TMP10]], [[TMP12]] +; CHECK-NEXT: [[TMP15:%.*]] = and <32 x i1> [[TMP11]], [[TMP21]] ; CHECK-NEXT: [[TMP16:%.*]] = or <32 x i1> [[TMP13]], [[TMP14]] ; CHECK-NEXT: [[TMP17:%.*]] = or <32 x i1> [[TMP16]], [[TMP15]] ; 
CHECK-NEXT: [[TMP18:%.*]] = sext <32 x i1> [[TMP17]] to <32 x i8> @@ -357,7 +345,7 @@ define <8 x i32> @test_mm256_maskz_dpbssds_epi32(<8 x i32> %__W, i8 zeroext %__U ; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <8 x i32> [[TMP19]], zeroinitializer ; CHECK-NEXT: [[TMP23:%.*]] = sext <8 x i1> [[TMP20]] to <8 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP23]], [[TMP1]] -; CHECK-NEXT: [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> [[__W]], <8 x i32> [[__A]], <8 x i32> [[__B]]) +; CHECK-NEXT: [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> [[__W]], <32 x i8> [[__A]], <32 x i8> [[__B]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[BST:%.*]] = bitcast i8 [[__U]] to <8 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[BST]], <8 x i32> [[_MSPROP1]], <8 x i32> [[TMP1]] @@ -369,31 +357,27 @@ define <8 x i32> @test_mm256_maskz_dpbssds_epi32(<8 x i32> %__W, i8 zeroext %__U ; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[RES]] ; - %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) + %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> %__W, <32 x i8> %__A, <32 x i8> %__B) %bst = bitcast i8 %__U to <8 x i1> %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> %__W ret <8 x i32> %res } -define <8 x i32> @test_mm256_mask_dpbssd_epi32(i8 zeroext %__U, <8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) sanitize_memory { +define <8 x i32> @test_mm256_mask_dpbssd_epi32(i8 zeroext %__U, <8 x i32> %__W, <32 x i8> %__A, <32 x i8> %__B) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_mm256_mask_dpbssd_epi32( -; CHECK-SAME: i8 zeroext [[__U:%.*]], <8 x i32> [[__W:%.*]], <8 x i32> [[__A:%.*]], <8 x i32> [[__B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-SAME: i8 zeroext [[__U:%.*]], <8 x i32> [[__W:%.*]], <32 x i8> [[__A:%.*]], <32 x i8> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP25:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP26:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 ; CHECK-NEXT: [[TMP24:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP25:%.*]] = bitcast <8 x i32> [[__A]] to <32 x i8> -; CHECK-NEXT: [[TMP26:%.*]] = bitcast <8 x i32> [[__B]] to <32 x i8> -; CHECK-NEXT: [[TMP27:%.*]] = bitcast <8 x i32> [[TMP2]] to <32 x i8> -; CHECK-NEXT: [[TMP28:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8> -; CHECK-NEXT: [[TMP29:%.*]] = icmp ne <32 x i8> [[TMP27]], zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <32 x i8> [[TMP28]], zeroinitializer ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <32 x i8> [[TMP25]], zeroinitializer ; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <32 x i8> [[TMP26]], zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = and <32 x i1> [[TMP29]], [[TMP10]] -; CHECK-NEXT: [[TMP14:%.*]] = and <32 x i1> [[TMP11]], [[TMP10]] -; CHECK-NEXT: [[TMP15:%.*]] = and <32 x i1> 
[[TMP29]], [[TMP12]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <32 x i8> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <32 x i8> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = and <32 x i1> [[TMP11]], [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = and <32 x i1> [[TMP10]], [[TMP12]] +; CHECK-NEXT: [[TMP15:%.*]] = and <32 x i1> [[TMP11]], [[TMP21]] ; CHECK-NEXT: [[TMP16:%.*]] = or <32 x i1> [[TMP13]], [[TMP14]] ; CHECK-NEXT: [[TMP17:%.*]] = or <32 x i1> [[TMP16]], [[TMP15]] ; CHECK-NEXT: [[TMP18:%.*]] = sext <32 x i1> [[TMP17]] to <32 x i8> @@ -401,7 +385,7 @@ define <8 x i32> @test_mm256_mask_dpbssd_epi32(i8 zeroext %__U, <8 x i32> %__W, ; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <8 x i32> [[TMP19]], zeroinitializer ; CHECK-NEXT: [[TMP23:%.*]] = sext <8 x i1> [[TMP20]] to <8 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP23]], [[TMP24]] -; CHECK-NEXT: [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> [[__W]], <8 x i32> [[__A]], <8 x i32> [[__B]]) +; CHECK-NEXT: [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> [[__W]], <32 x i8> [[__A]], <32 x i8> [[__B]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[BST:%.*]] = bitcast i8 [[__U]] to <8 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[BST]], <8 x i32> [[_MSPROP1]], <8 x i32> zeroinitializer @@ -413,28 +397,40 @@ define <8 x i32> @test_mm256_mask_dpbssd_epi32(i8 zeroext %__U, <8 x i32> %__W, ; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[RES]] ; - %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) + %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> %__W, <32 x i8> %__A, <32 x i8> %__B) %bst = bitcast i8 %__U to <8 x i1> %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> zeroinitializer ret <8 x i32> %res } -declare <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32>, <4 x i32>, <4 x i32>) -declare <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32>, <4 x i32>, <4 x i32>) -declare <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32>, <8 x i32>, <8 x i32>) -declare <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32>, <16 x i8>, <16 x i8>) +declare <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32>, <16 x i8>, <16 x i8>) +declare <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32>, <32 x i8>, <32 x i8>) +declare <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32>, <32 x i8>, <32 x i8>) -define <4 x i32> @test_mm_mask_dpbsud_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 x i32> %__A, <4 x i32> %__B) sanitize_memory { +define <4 x i32> @test_mm_mask_dpbsud_epi32(<4 x i32> %__W, i4 zeroext %__U, <16 x i8> %__A, <16 x i8> %__B) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_mm_mask_dpbsud_epi32( -; CHECK-SAME: <4 x i32> [[__W:%.*]], i4 zeroext [[__U:%.*]], <4 x i32> [[__A:%.*]], <4 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-SAME: <4 x i32> [[__W:%.*]], i4 zeroext [[__U:%.*]], <16 x i8> [[__A:%.*]], <16 x i8> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 
ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i4, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> [[__W]], <4 x i32> [[__A]], <4 x i32> [[__B]]) +; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <16 x i8> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <16 x i8> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <16 x i8> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <16 x i8> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = and <16 x i1> [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP10:%.*]] = and <16 x i1> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP11:%.*]] = and <16 x i1> [[TMP18]], [[TMP21]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i1> [[TMP22]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i1> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = sext <16 x i1> [[TMP13]] to <16 x i8> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP14]] to <4 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <4 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = sext <4 x i1> [[TMP16]] to <4 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP17]], [[TMP1]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> [[__W]], <16 x i8> [[__A]], <16 x i8> [[__B]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i4 [[TMP4]] to <4 x i1> ; CHECK-NEXT: [[BST:%.*]] = bitcast i4 [[__U]] to <4 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[BST]], <4 x i32> [[_MSPROP1]], <4 x i32> [[TMP1]] @@ -446,23 +442,35 @@ define <4 x i32> @test_mm_mask_dpbsud_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 ; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RES]] ; - %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) + %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> %__W, <16 x i8> %__A, <16 x i8> %__B) %bst = bitcast i4 %__U to <4 x i1> %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> %__W ret <4 x i32> %res } -define <4 x i32> @test_mm_maskz_dpbsuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) sanitize_memory { +define <4 x i32> @test_mm_maskz_dpbsuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <16 x i8> %__A, <16 x i8> %__B) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_mm_maskz_dpbsuds_epi32( -; CHECK-SAME: i4 zeroext [[__U:%.*]], <4 x i32> [[__W:%.*]], <4 x i32> [[__A:%.*]], <4 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-SAME: i4 zeroext [[__U:%.*]], <4 x i32> [[__W:%.*]], <16 x i8> [[__A:%.*]], <16 x i8> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr 
inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i4, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> [[__W]], <4 x i32> [[__A]], <4 x i32> [[__B]]) +; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <16 x i8> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <16 x i8> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <16 x i8> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <16 x i8> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = and <16 x i1> [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP10:%.*]] = and <16 x i1> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP11:%.*]] = and <16 x i1> [[TMP18]], [[TMP21]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i1> [[TMP22]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i1> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = sext <16 x i1> [[TMP13]] to <16 x i8> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP14]] to <4 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <4 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = sext <4 x i1> [[TMP16]] to <4 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP17]], [[TMP1]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> [[__W]], <16 x i8> [[__A]], <16 x i8> [[__B]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i4 [[TMP4]] to <4 x i1> ; CHECK-NEXT: [[BST:%.*]] = bitcast i4 [[__U]] to <4 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[BST]], <4 x i32> [[_MSPROP1]], <4 x i32> zeroinitializer @@ -474,23 +482,35 @@ define <4 x i32> @test_mm_maskz_dpbsuds_epi32(i4 zeroext %__U, <4 x i32> %__W, < ; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RES]] ; - %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) + %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> %__W, <16 x i8> %__A, <16 x i8> %__B) %bst = bitcast i4 %__U to <4 x i1> %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> zeroinitializer ret <4 x i32> %res } -define <8 x i32> @test_mm256_maskz_dpbsuds_epi32(<8 x i32> %__W, i8 zeroext %__U, <8 x i32> %__A, <8 x i32> %__B) sanitize_memory { +define <8 x i32> @test_mm256_maskz_dpbsuds_epi32(<8 x i32> %__W, i8 zeroext %__U, <32 x i8> %__A, <32 x i8> %__B) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_mm256_maskz_dpbsuds_epi32( -; CHECK-SAME: <8 x i32> [[__W:%.*]], i8 zeroext [[__U:%.*]], <8 x i32> [[__A:%.*]], <8 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-SAME: <8 x i32> [[__W:%.*]], i8 zeroext [[__U:%.*]], <32 x i8> [[__A:%.*]], <32 x i8> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 
40) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> [[__W]], <8 x i32> [[__A]], <8 x i32> [[__B]]) +; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <32 x i8> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <32 x i8> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <32 x i8> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <32 x i8> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = and <32 x i1> [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP10:%.*]] = and <32 x i1> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP11:%.*]] = and <32 x i1> [[TMP18]], [[TMP21]] +; CHECK-NEXT: [[TMP12:%.*]] = or <32 x i1> [[TMP22]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = or <32 x i1> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = sext <32 x i1> [[TMP13]] to <32 x i8> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <32 x i8> [[TMP14]] to <8 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <8 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = sext <8 x i1> [[TMP16]] to <8 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP17]], [[TMP1]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> [[__W]], <32 x i8> [[__A]], <32 x i8> [[__B]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[BST:%.*]] = bitcast i8 [[__U]] to <8 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[BST]], <8 x i32> [[_MSPROP1]], <8 x i32> [[TMP1]] @@ -502,23 +522,35 @@ define <8 x i32> @test_mm256_maskz_dpbsuds_epi32(<8 x i32> %__W, i8 zeroext %__U ; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[RES]] ; - %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) + %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> %__W, <32 x i8> %__A, <32 x i8> %__B) %bst = bitcast i8 %__U to <8 x i1> %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> %__W ret <8 x i32> %res } -define <8 x i32> @test_mm256_mask_dpbsud_epi32(i8 zeroext %__U, <8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) sanitize_memory { +define <8 x i32> @test_mm256_mask_dpbsud_epi32(i8 zeroext %__U, <8 x i32> %__W, <32 x i8> %__A, <32 x i8> %__B) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_mm256_mask_dpbsud_epi32( -; CHECK-SAME: i8 zeroext [[__U:%.*]], <8 x i32> [[__W:%.*]], <8 x i32> [[__A:%.*]], <8 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-SAME: i8 zeroext [[__U:%.*]], <8 x i32> [[__W:%.*]], <32 x i8> [[__A:%.*]], <32 x i8> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr 
@__msan_param_tls to i64), i64 40) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> [[__W]], <8 x i32> [[__A]], <8 x i32> [[__B]]) +; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <32 x i8> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <32 x i8> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <32 x i8> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <32 x i8> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = and <32 x i1> [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP10:%.*]] = and <32 x i1> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP11:%.*]] = and <32 x i1> [[TMP18]], [[TMP21]] +; CHECK-NEXT: [[TMP12:%.*]] = or <32 x i1> [[TMP22]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = or <32 x i1> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = sext <32 x i1> [[TMP13]] to <32 x i8> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <32 x i8> [[TMP14]] to <8 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <8 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = sext <8 x i1> [[TMP16]] to <8 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP17]], [[TMP1]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> [[__W]], <32 x i8> [[__A]], <32 x i8> [[__B]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[BST:%.*]] = bitcast i8 [[__U]] to <8 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[BST]], <8 x i32> [[_MSPROP1]], <8 x i32> zeroinitializer @@ -530,28 +562,40 @@ define <8 x i32> @test_mm256_mask_dpbsud_epi32(i8 zeroext %__U, <8 x i32> %__W, ; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[RES]] ; - %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) + %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> %__W, <32 x i8> %__A, <32 x i8> %__B) %bst = bitcast i8 %__U to <8 x i1> %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> zeroinitializer ret <8 x i32> %res } -declare <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32>, <4 x i32>, <4 x i32>) -declare <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32>, <4 x i32>, <4 x i32>) -declare <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32>, <8 x i32>, <8 x i32>) -declare <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32>, <16 x i8>, <16 x i8>) +declare <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32>, <16 x i8>, <16 x i8>) +declare <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32>, <32 x i8>, <32 x i8>) +declare <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32>, <32 x i8>, <32 x i8>) -define <4 x i32> @test_mm_mask_dpbuud_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 x i32> %__A, <4 x i32> %__B) sanitize_memory { +define <4 x i32> @test_mm_mask_dpbuud_epi32(<4 x i32> %__W, i4 zeroext %__U, <16 x i8> %__A, <16 x i8> %__B) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_mm_mask_dpbuud_epi32( -; CHECK-SAME: <4 x i32> [[__W:%.*]], i4 zeroext [[__U:%.*]], <4 x i32> [[__A:%.*]], <4 x i32> [[__B:%.*]]) #[[ATTR0]] { +; 
CHECK-SAME: <4 x i32> [[__W:%.*]], i4 zeroext [[__U:%.*]], <16 x i8> [[__A:%.*]], <16 x i8> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i4, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> [[__W]], <4 x i32> [[__A]], <4 x i32> [[__B]]) +; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <16 x i8> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <16 x i8> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <16 x i8> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <16 x i8> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = and <16 x i1> [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP10:%.*]] = and <16 x i1> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP11:%.*]] = and <16 x i1> [[TMP18]], [[TMP21]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i1> [[TMP22]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i1> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = sext <16 x i1> [[TMP13]] to <16 x i8> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP14]] to <4 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <4 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = sext <4 x i1> [[TMP16]] to <4 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP17]], [[TMP1]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> [[__W]], <16 x i8> [[__A]], <16 x i8> [[__B]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i4 [[TMP4]] to <4 x i1> ; CHECK-NEXT: [[BST:%.*]] = bitcast i4 [[__U]] to <4 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[BST]], <4 x i32> [[_MSPROP1]], <4 x i32> [[TMP1]] @@ -563,23 +607,35 @@ define <4 x i32> @test_mm_mask_dpbuud_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 ; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RES]] ; - %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) + %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> %__W, <16 x i8> %__A, <16 x i8> %__B) %bst = bitcast i4 %__U to <4 x i1> %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> %__W ret <4 x i32> %res } -define <4 x i32> @test_mm_maskz_dpbuuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) sanitize_memory { +define <4 x i32> @test_mm_maskz_dpbuuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <16 x i8> %__A, <16 x i8> %__B) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_mm_maskz_dpbuuds_epi32( -; CHECK-SAME: i4 zeroext [[__U:%.*]], <4 x i32> [[__W:%.*]], <4 x i32> [[__A:%.*]], <4 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-SAME: i4 zeroext [[__U:%.*]], <4 x 
i32> [[__W:%.*]], <16 x i8> [[__A:%.*]], <16 x i8> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i4, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> [[__W]], <4 x i32> [[__A]], <4 x i32> [[__B]]) +; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <16 x i8> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <16 x i8> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <16 x i8> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <16 x i8> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = and <16 x i1> [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP10:%.*]] = and <16 x i1> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP11:%.*]] = and <16 x i1> [[TMP18]], [[TMP21]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i1> [[TMP22]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i1> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = sext <16 x i1> [[TMP13]] to <16 x i8> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP14]] to <4 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <4 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = sext <4 x i1> [[TMP16]] to <4 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP17]], [[TMP1]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> [[__W]], <16 x i8> [[__A]], <16 x i8> [[__B]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i4 [[TMP4]] to <4 x i1> ; CHECK-NEXT: [[BST:%.*]] = bitcast i4 [[__U]] to <4 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[BST]], <4 x i32> [[_MSPROP1]], <4 x i32> zeroinitializer @@ -591,23 +647,35 @@ define <4 x i32> @test_mm_maskz_dpbuuds_epi32(i4 zeroext %__U, <4 x i32> %__W, < ; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RES]] ; - %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) + %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> %__W, <16 x i8> %__A, <16 x i8> %__B) %bst = bitcast i4 %__U to <4 x i1> %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> zeroinitializer ret <4 x i32> %res } -define <8 x i32> @test_mm256_maskz_dpbuuds_epi32(<8 x i32> %__W, i8 zeroext %__U, <8 x i32> %__A, <8 x i32> %__B) sanitize_memory { +define <8 x i32> @test_mm256_maskz_dpbuuds_epi32(<8 x i32> %__W, i8 zeroext %__U, <32 x i8> %__A, <32 x i8> %__B) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_mm256_maskz_dpbuuds_epi32( -; CHECK-SAME: <8 x i32> [[__W:%.*]], i8 zeroext [[__U:%.*]], <8 x i32> [[__A:%.*]], <8 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-SAME: <8 x i32> [[__W:%.*]], i8 zeroext 
[[__U:%.*]], <32 x i8> [[__A:%.*]], <32 x i8> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> [[__W]], <8 x i32> [[__A]], <8 x i32> [[__B]]) +; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <32 x i8> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <32 x i8> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <32 x i8> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <32 x i8> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = and <32 x i1> [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP10:%.*]] = and <32 x i1> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP11:%.*]] = and <32 x i1> [[TMP18]], [[TMP21]] +; CHECK-NEXT: [[TMP12:%.*]] = or <32 x i1> [[TMP22]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = or <32 x i1> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = sext <32 x i1> [[TMP13]] to <32 x i8> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <32 x i8> [[TMP14]] to <8 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <8 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = sext <8 x i1> [[TMP16]] to <8 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP17]], [[TMP1]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> [[__W]], <32 x i8> [[__A]], <32 x i8> [[__B]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[BST:%.*]] = bitcast i8 [[__U]] to <8 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[BST]], <8 x i32> [[_MSPROP1]], <8 x i32> [[TMP1]] @@ -619,23 +687,35 @@ define <8 x i32> @test_mm256_maskz_dpbuuds_epi32(<8 x i32> %__W, i8 zeroext %__U ; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[RES]] ; - %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) + %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> %__W, <32 x i8> %__A, <32 x i8> %__B) %bst = bitcast i8 %__U to <8 x i1> %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> %__W ret <8 x i32> %res } -define <8 x i32> @test_mm256_mask_dpbuud_epi32(i8 zeroext %__U, <8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) sanitize_memory { +define <8 x i32> @test_mm256_mask_dpbuud_epi32(i8 zeroext %__U, <8 x i32> %__W, <32 x i8> %__A, <32 x i8> %__B) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_mm256_mask_dpbuud_epi32( -; CHECK-SAME: i8 zeroext [[__U:%.*]], <8 x i32> [[__W:%.*]], <8 x i32> [[__A:%.*]], <8 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-SAME: i8 zeroext [[__U:%.*]], <8 x i32> [[__W:%.*]], <32 x i8> 
[[__A:%.*]], <32 x i8> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> [[__W]], <8 x i32> [[__A]], <8 x i32> [[__B]]) +; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <32 x i8> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <32 x i8> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <32 x i8> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <32 x i8> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = and <32 x i1> [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP10:%.*]] = and <32 x i1> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP11:%.*]] = and <32 x i1> [[TMP18]], [[TMP21]] +; CHECK-NEXT: [[TMP12:%.*]] = or <32 x i1> [[TMP22]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = or <32 x i1> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = sext <32 x i1> [[TMP13]] to <32 x i8> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <32 x i8> [[TMP14]] to <8 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <8 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = sext <8 x i1> [[TMP16]] to <8 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP17]], [[TMP1]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> [[__W]], <32 x i8> [[__A]], <32 x i8> [[__B]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[BST:%.*]] = bitcast i8 [[__U]] to <8 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[BST]], <8 x i32> [[_MSPROP1]], <8 x i32> zeroinitializer @@ -647,16 +727,16 @@ define <8 x i32> @test_mm256_mask_dpbuud_epi32(i8 zeroext %__U, <8 x i32> %__W, ; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[RES]] ; - %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) + %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> %__W, <32 x i8> %__A, <32 x i8> %__B) %bst = bitcast i8 %__U to <8 x i1> %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> zeroinitializer ret <8 x i32> %res } -declare <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32>, <4 x i32>, <4 x i32>) -declare <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32>, <4 x i32>, <4 x i32>) -declare <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32>, <8 x i32>, <8 x i32>) -declare <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32>, <16 x i8>, <16 x i8>) +declare <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32>, <16 x i8>, <16 x i8>) +declare <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x 
i32>, <32 x i8>, <32 x i8>) +declare <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32>, <32 x i8>, <32 x i8>) define <4 x i32> @test_mm_mask_dpwsud_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 x i32> %__A, <4 x i32> %__B) sanitize_memory { diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avxvnniint8-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avxvnniint8-intrinsics.ll index 3df0f1d..d91abea 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avxvnniint8-intrinsics.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avxvnniint8-intrinsics.ll @@ -10,15 +10,15 @@ target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" -declare <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32>, <16 x i8>, <16 x i8>) -define <4 x i32>@test_int_x86_avx2_vpdpbssd_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x i32> %x4) sanitize_memory { +define <4 x i32>@test_int_x86_avx2_vpdpbssd_128(<4 x i32> %x0, <16 x i8> %x1, ptr %x2p, <16 x i8> %x4) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx2_vpdpbssd_128( -; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <4 x i32> [[X4:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-SAME: <4 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], ptr [[X2P:%.*]], <16 x i8> [[X4:%.*]]) #[[ATTR1:[0-9]+]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP13:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP32:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1:![0-9]+]] @@ -26,22 +26,18 @@ define <4 x i32>@test_int_x86_avx2_vpdpbssd_128(<4 x i32> %x0, <4 x i32> %x1, pt ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4:[0-9]+]] ; CHECK-NEXT: unreachable ; CHECK: [[BB6]]: -; CHECK-NEXT: [[X2:%.*]] = load <4 x i32>, ptr [[X2P]], align 16 +; CHECK-NEXT: [[TMP30:%.*]] = load <16 x i8>, ptr [[X2P]], align 16 ; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[X2P]] to i64 ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP9]], align 16 -; CHECK-NEXT: [[TMP29:%.*]] = bitcast <4 x i32> [[X1]] to <16 x i8> -; CHECK-NEXT: [[TMP30:%.*]] = bitcast <4 x i32> [[X2]] to <16 x i8> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x i32> [[_MSLD]] to <16 x i8> -; CHECK-NEXT: [[TMP14:%.*]] = icmp ne <16 x i8> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP29:%.*]] = load <16 x i8>, ptr [[TMP9]], align 16 ; CHECK-NEXT: [[TMP15:%.*]] = icmp ne <16 x i8> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <16 x i8> 
[[TMP29]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <16 x i8> [[X1]], zeroinitializer ; CHECK-NEXT: [[TMP17:%.*]] = icmp ne <16 x i8> [[TMP30]], zeroinitializer -; CHECK-NEXT: [[TMP18:%.*]] = and <16 x i1> [[TMP14]], [[TMP15]] -; CHECK-NEXT: [[TMP19:%.*]] = and <16 x i1> [[TMP16]], [[TMP15]] -; CHECK-NEXT: [[TMP20:%.*]] = and <16 x i1> [[TMP14]], [[TMP17]] +; CHECK-NEXT: [[TMP18:%.*]] = and <16 x i1> [[TMP15]], [[TMP16]] +; CHECK-NEXT: [[TMP19:%.*]] = and <16 x i1> [[TMP12]], [[TMP16]] +; CHECK-NEXT: [[TMP20:%.*]] = and <16 x i1> [[TMP15]], [[TMP17]] ; CHECK-NEXT: [[TMP21:%.*]] = or <16 x i1> [[TMP18]], [[TMP19]] ; CHECK-NEXT: [[TMP22:%.*]] = or <16 x i1> [[TMP21]], [[TMP20]] ; CHECK-NEXT: [[TMP23:%.*]] = sext <16 x i1> [[TMP22]] to <16 x i8> @@ -49,18 +45,14 @@ define <4 x i32>@test_int_x86_avx2_vpdpbssd_128(<4 x i32> %x0, <4 x i32> %x1, pt ; CHECK-NEXT: [[TMP25:%.*]] = icmp ne <4 x i32> [[TMP24]], zeroinitializer ; CHECK-NEXT: [[TMP28:%.*]] = sext <4 x i1> [[TMP25]] to <4 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP28]], [[TMP5]] -; CHECK-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) -; CHECK-NEXT: [[TMP31:%.*]] = bitcast <4 x i32> [[X1]] to <16 x i8> -; CHECK-NEXT: [[TMP32:%.*]] = bitcast <4 x i32> [[X4]] to <16 x i8> -; CHECK-NEXT: [[TMP33:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> -; CHECK-NEXT: [[TMP34:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8> -; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <16 x i8> [[TMP33]], zeroinitializer -; CHECK-NEXT: [[TMP36:%.*]] = icmp ne <16 x i8> [[TMP34]], zeroinitializer -; CHECK-NEXT: [[TMP37:%.*]] = icmp ne <16 x i8> [[TMP31]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> [[X0]], <16 x i8> [[X1]], <16 x i8> [[TMP30]]) +; CHECK-NEXT: [[TMP26:%.*]] = icmp ne <16 x i8> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP38:%.*]] = icmp ne <16 x i8> [[TMP32]], zeroinitializer -; CHECK-NEXT: [[TMP39:%.*]] = and <16 x i1> [[TMP35]], [[TMP36]] -; CHECK-NEXT: [[TMP40:%.*]] = and <16 x i1> [[TMP37]], [[TMP36]] -; CHECK-NEXT: [[TMP41:%.*]] = and <16 x i1> [[TMP35]], [[TMP38]] +; CHECK-NEXT: [[TMP27:%.*]] = icmp ne <16 x i8> [[X1]], zeroinitializer +; CHECK-NEXT: [[TMP31:%.*]] = icmp ne <16 x i8> [[X4]], zeroinitializer +; CHECK-NEXT: [[TMP39:%.*]] = and <16 x i1> [[TMP26]], [[TMP38]] +; CHECK-NEXT: [[TMP40:%.*]] = and <16 x i1> [[TMP27]], [[TMP38]] +; CHECK-NEXT: [[TMP41:%.*]] = and <16 x i1> [[TMP26]], [[TMP31]] ; CHECK-NEXT: [[TMP42:%.*]] = or <16 x i1> [[TMP39]], [[TMP40]] ; CHECK-NEXT: [[TMP43:%.*]] = or <16 x i1> [[TMP42]], [[TMP41]] ; CHECK-NEXT: [[TMP44:%.*]] = sext <16 x i1> [[TMP43]] to <16 x i8> @@ -68,28 +60,28 @@ define <4 x i32>@test_int_x86_avx2_vpdpbssd_128(<4 x i32> %x0, <4 x i32> %x1, pt ; CHECK-NEXT: [[TMP46:%.*]] = icmp ne <4 x i32> [[TMP45]], zeroinitializer ; CHECK-NEXT: [[TMP49:%.*]] = sext <4 x i1> [[TMP46]] to <4 x i32> ; CHECK-NEXT: [[_MSPROP3:%.*]] = or <4 x i32> [[TMP49]], [[TMP5]] -; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X4]]) +; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> [[X0]], <16 x i8> [[X1]], <16 x i8> [[X4]]) ; CHECK-NEXT: [[_MSPROP4:%.*]] = or <4 x i32> [[_MSPROP1]], [[_MSPROP3]] ; CHECK-NEXT: [[RES:%.*]] = add <4 x i32> [[TMP10]], [[TMP11]] ; CHECK-NEXT: store <4 x i32> [[_MSPROP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RES]] ; - %x2 = 
load <4 x i32>, ptr %x2p - %1 = call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) - %2 = call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4) + %x2 = load <16 x i8>, ptr %x2p + %1 = call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2) + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x4) %res = add <4 x i32> %1, %2 ret <4 x i32> %res } -declare <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32>, <16 x i8>, <16 x i8>) -define <4 x i32>@test_int_x86_avx2_vpdpbssds_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x i32> %x4) sanitize_memory { +define <4 x i32>@test_int_x86_avx2_vpdpbssds_128(<4 x i32> %x0, <16 x i8> %x1, ptr %x2p, <16 x i8> %x4) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx2_vpdpbssds_128( -; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <4 x i32> [[X4:%.*]]) #[[ATTR1]] { +; CHECK-SAME: <4 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], ptr [[X2P:%.*]], <16 x i8> [[X4:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP13:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP32:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] @@ -97,22 +89,18 @@ define <4 x i32>@test_int_x86_avx2_vpdpbssds_128(<4 x i32> %x0, <4 x i32> %x1, p ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable ; CHECK: [[BB6]]: -; CHECK-NEXT: [[X2:%.*]] = load <4 x i32>, ptr [[X2P]], align 16 +; CHECK-NEXT: [[TMP30:%.*]] = load <16 x i8>, ptr [[X2P]], align 16 ; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[X2P]] to i64 ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP9]], align 16 -; CHECK-NEXT: [[TMP29:%.*]] = bitcast <4 x i32> [[X1]] to <16 x i8> -; CHECK-NEXT: [[TMP30:%.*]] = bitcast <4 x i32> [[X2]] to <16 x i8> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x i32> [[_MSLD]] to <16 x i8> -; CHECK-NEXT: [[TMP14:%.*]] = icmp ne <16 x i8> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP29:%.*]] = load <16 x i8>, ptr [[TMP9]], align 16 ; CHECK-NEXT: [[TMP15:%.*]] = icmp ne <16 x i8> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <16 x i8> [[TMP29]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <16 x i8> [[X1]], zeroinitializer ; CHECK-NEXT: [[TMP17:%.*]] = icmp ne <16 x i8> [[TMP30]], zeroinitializer -; CHECK-NEXT: [[TMP18:%.*]] = and <16 x i1> [[TMP14]], [[TMP15]] -; CHECK-NEXT: [[TMP19:%.*]] = and <16 x 
i1> [[TMP16]], [[TMP15]] -; CHECK-NEXT: [[TMP20:%.*]] = and <16 x i1> [[TMP14]], [[TMP17]] +; CHECK-NEXT: [[TMP18:%.*]] = and <16 x i1> [[TMP15]], [[TMP16]] +; CHECK-NEXT: [[TMP19:%.*]] = and <16 x i1> [[TMP12]], [[TMP16]] +; CHECK-NEXT: [[TMP20:%.*]] = and <16 x i1> [[TMP15]], [[TMP17]] ; CHECK-NEXT: [[TMP21:%.*]] = or <16 x i1> [[TMP18]], [[TMP19]] ; CHECK-NEXT: [[TMP22:%.*]] = or <16 x i1> [[TMP21]], [[TMP20]] ; CHECK-NEXT: [[TMP23:%.*]] = sext <16 x i1> [[TMP22]] to <16 x i8> @@ -120,18 +108,14 @@ define <4 x i32>@test_int_x86_avx2_vpdpbssds_128(<4 x i32> %x0, <4 x i32> %x1, p ; CHECK-NEXT: [[TMP25:%.*]] = icmp ne <4 x i32> [[TMP24]], zeroinitializer ; CHECK-NEXT: [[TMP28:%.*]] = sext <4 x i1> [[TMP25]] to <4 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP28]], [[TMP5]] -; CHECK-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) -; CHECK-NEXT: [[TMP31:%.*]] = bitcast <4 x i32> [[X1]] to <16 x i8> -; CHECK-NEXT: [[TMP32:%.*]] = bitcast <4 x i32> [[X4]] to <16 x i8> -; CHECK-NEXT: [[TMP33:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> -; CHECK-NEXT: [[TMP34:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8> -; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <16 x i8> [[TMP33]], zeroinitializer -; CHECK-NEXT: [[TMP36:%.*]] = icmp ne <16 x i8> [[TMP34]], zeroinitializer -; CHECK-NEXT: [[TMP37:%.*]] = icmp ne <16 x i8> [[TMP31]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> [[X0]], <16 x i8> [[X1]], <16 x i8> [[TMP30]]) +; CHECK-NEXT: [[TMP26:%.*]] = icmp ne <16 x i8> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP38:%.*]] = icmp ne <16 x i8> [[TMP32]], zeroinitializer -; CHECK-NEXT: [[TMP39:%.*]] = and <16 x i1> [[TMP35]], [[TMP36]] -; CHECK-NEXT: [[TMP40:%.*]] = and <16 x i1> [[TMP37]], [[TMP36]] -; CHECK-NEXT: [[TMP41:%.*]] = and <16 x i1> [[TMP35]], [[TMP38]] +; CHECK-NEXT: [[TMP27:%.*]] = icmp ne <16 x i8> [[X1]], zeroinitializer +; CHECK-NEXT: [[TMP31:%.*]] = icmp ne <16 x i8> [[X4]], zeroinitializer +; CHECK-NEXT: [[TMP39:%.*]] = and <16 x i1> [[TMP26]], [[TMP38]] +; CHECK-NEXT: [[TMP40:%.*]] = and <16 x i1> [[TMP27]], [[TMP38]] +; CHECK-NEXT: [[TMP41:%.*]] = and <16 x i1> [[TMP26]], [[TMP31]] ; CHECK-NEXT: [[TMP42:%.*]] = or <16 x i1> [[TMP39]], [[TMP40]] ; CHECK-NEXT: [[TMP43:%.*]] = or <16 x i1> [[TMP42]], [[TMP41]] ; CHECK-NEXT: [[TMP44:%.*]] = sext <16 x i1> [[TMP43]] to <16 x i8> @@ -139,28 +123,28 @@ define <4 x i32>@test_int_x86_avx2_vpdpbssds_128(<4 x i32> %x0, <4 x i32> %x1, p ; CHECK-NEXT: [[TMP46:%.*]] = icmp ne <4 x i32> [[TMP45]], zeroinitializer ; CHECK-NEXT: [[TMP49:%.*]] = sext <4 x i1> [[TMP46]] to <4 x i32> ; CHECK-NEXT: [[_MSPROP3:%.*]] = or <4 x i32> [[TMP49]], [[TMP5]] -; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X4]]) +; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> [[X0]], <16 x i8> [[X1]], <16 x i8> [[X4]]) ; CHECK-NEXT: [[_MSPROP4:%.*]] = or <4 x i32> [[_MSPROP1]], [[_MSPROP3]] ; CHECK-NEXT: [[RES:%.*]] = add <4 x i32> [[TMP10]], [[TMP11]] ; CHECK-NEXT: store <4 x i32> [[_MSPROP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RES]] ; - %x2 = load <4 x i32>, ptr %x2p - %1 = call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) - %2 = call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4) + %x2 = load <16 x i8>, ptr %x2p + %1 = call <4 x i32> 
@llvm.x86.avx2.vpdpbssds.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2) + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x4) %res = add <4 x i32> %1, %2 ret <4 x i32> %res } -declare <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32>, <32 x i8>, <32 x i8>) -define <8 x i32>@test_int_x86_avx2_vpdpbssd_256(<8 x i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4) sanitize_memory { +define <8 x i32>@test_int_x86_avx2_vpdpbssd_256(<8 x i32> %x0, <32 x i8> %x1, ptr %x2p, <32 x i8> %x4) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx2_vpdpbssd_256( -; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <8 x i32> [[X4:%.*]]) #[[ATTR1]] { +; CHECK-SAME: <8 x i32> [[X0:%.*]], <32 x i8> [[X1:%.*]], ptr [[X2P:%.*]], <32 x i8> [[X4:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP13:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP32:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] @@ -168,22 +152,18 @@ define <8 x i32>@test_int_x86_avx2_vpdpbssd_256(<8 x i32> %x0, <8 x i32> %x1, pt ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable ; CHECK: [[BB6]]: -; CHECK-NEXT: [[X2:%.*]] = load <8 x i32>, ptr [[X2P]], align 32 +; CHECK-NEXT: [[TMP30:%.*]] = load <32 x i8>, ptr [[X2P]], align 32 ; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[X2P]] to i64 ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i32>, ptr [[TMP9]], align 32 -; CHECK-NEXT: [[TMP29:%.*]] = bitcast <8 x i32> [[X1]] to <32 x i8> -; CHECK-NEXT: [[TMP30:%.*]] = bitcast <8 x i32> [[X2]] to <32 x i8> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8> -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x i32> [[_MSLD]] to <32 x i8> -; CHECK-NEXT: [[TMP14:%.*]] = icmp ne <32 x i8> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP29:%.*]] = load <32 x i8>, ptr [[TMP9]], align 32 ; CHECK-NEXT: [[TMP15:%.*]] = icmp ne <32 x i8> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <32 x i8> [[TMP29]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <32 x i8> [[X1]], zeroinitializer ; CHECK-NEXT: [[TMP17:%.*]] = icmp ne <32 x i8> [[TMP30]], zeroinitializer -; CHECK-NEXT: [[TMP18:%.*]] = and <32 x i1> [[TMP14]], [[TMP15]] -; CHECK-NEXT: [[TMP19:%.*]] = and <32 x i1> [[TMP16]], [[TMP15]] -; CHECK-NEXT: [[TMP20:%.*]] = and <32 x i1> [[TMP14]], [[TMP17]] +; CHECK-NEXT: [[TMP18:%.*]] = and <32 x i1> [[TMP15]], [[TMP16]] +; CHECK-NEXT: [[TMP19:%.*]] = and <32 x i1> [[TMP12]], [[TMP16]] +; CHECK-NEXT: [[TMP20:%.*]] = and <32 x i1> 
[[TMP15]], [[TMP17]] ; CHECK-NEXT: [[TMP21:%.*]] = or <32 x i1> [[TMP18]], [[TMP19]] ; CHECK-NEXT: [[TMP22:%.*]] = or <32 x i1> [[TMP21]], [[TMP20]] ; CHECK-NEXT: [[TMP23:%.*]] = sext <32 x i1> [[TMP22]] to <32 x i8> @@ -191,18 +171,14 @@ define <8 x i32>@test_int_x86_avx2_vpdpbssd_256(<8 x i32> %x0, <8 x i32> %x1, pt ; CHECK-NEXT: [[TMP25:%.*]] = icmp ne <8 x i32> [[TMP24]], zeroinitializer ; CHECK-NEXT: [[TMP28:%.*]] = sext <8 x i1> [[TMP25]] to <8 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP28]], [[TMP5]] -; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) -; CHECK-NEXT: [[TMP31:%.*]] = bitcast <8 x i32> [[X1]] to <32 x i8> -; CHECK-NEXT: [[TMP32:%.*]] = bitcast <8 x i32> [[X4]] to <32 x i8> -; CHECK-NEXT: [[TMP33:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8> -; CHECK-NEXT: [[TMP34:%.*]] = bitcast <8 x i32> [[TMP4]] to <32 x i8> -; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <32 x i8> [[TMP33]], zeroinitializer -; CHECK-NEXT: [[TMP36:%.*]] = icmp ne <32 x i8> [[TMP34]], zeroinitializer -; CHECK-NEXT: [[TMP37:%.*]] = icmp ne <32 x i8> [[TMP31]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> [[X0]], <32 x i8> [[X1]], <32 x i8> [[TMP30]]) +; CHECK-NEXT: [[TMP26:%.*]] = icmp ne <32 x i8> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP38:%.*]] = icmp ne <32 x i8> [[TMP32]], zeroinitializer -; CHECK-NEXT: [[TMP39:%.*]] = and <32 x i1> [[TMP35]], [[TMP36]] -; CHECK-NEXT: [[TMP40:%.*]] = and <32 x i1> [[TMP37]], [[TMP36]] -; CHECK-NEXT: [[TMP41:%.*]] = and <32 x i1> [[TMP35]], [[TMP38]] +; CHECK-NEXT: [[TMP27:%.*]] = icmp ne <32 x i8> [[X1]], zeroinitializer +; CHECK-NEXT: [[TMP31:%.*]] = icmp ne <32 x i8> [[X4]], zeroinitializer +; CHECK-NEXT: [[TMP39:%.*]] = and <32 x i1> [[TMP26]], [[TMP38]] +; CHECK-NEXT: [[TMP40:%.*]] = and <32 x i1> [[TMP27]], [[TMP38]] +; CHECK-NEXT: [[TMP41:%.*]] = and <32 x i1> [[TMP26]], [[TMP31]] ; CHECK-NEXT: [[TMP42:%.*]] = or <32 x i1> [[TMP39]], [[TMP40]] ; CHECK-NEXT: [[TMP43:%.*]] = or <32 x i1> [[TMP42]], [[TMP41]] ; CHECK-NEXT: [[TMP44:%.*]] = sext <32 x i1> [[TMP43]] to <32 x i8> @@ -210,28 +186,28 @@ define <8 x i32>@test_int_x86_avx2_vpdpbssd_256(<8 x i32> %x0, <8 x i32> %x1, pt ; CHECK-NEXT: [[TMP46:%.*]] = icmp ne <8 x i32> [[TMP45]], zeroinitializer ; CHECK-NEXT: [[TMP49:%.*]] = sext <8 x i1> [[TMP46]] to <8 x i32> ; CHECK-NEXT: [[_MSPROP3:%.*]] = or <8 x i32> [[TMP49]], [[TMP5]] -; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X4]]) +; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> [[X0]], <32 x i8> [[X1]], <32 x i8> [[X4]]) ; CHECK-NEXT: [[_MSPROP4:%.*]] = or <8 x i32> [[_MSPROP1]], [[_MSPROP3]] ; CHECK-NEXT: [[RES:%.*]] = add <8 x i32> [[TMP10]], [[TMP11]] ; CHECK-NEXT: store <8 x i32> [[_MSPROP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[RES]] ; - %x2 = load <8 x i32>, ptr %x2p - %1 = call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) - %2 = call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4) + %x2 = load <32 x i8>, ptr %x2p + %1 = call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2) + %2 = call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x4) %res = add <8 x i32> %1, %2 ret <8 x i32> %res } -declare <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32>, 
<8 x i32>, <8 x i32>) +declare <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32>, <32 x i8>, <32 x i8>) -define <8 x i32>@test_int_x86_avx2_vpdpbssds_256(<8 x i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4) sanitize_memory { +define <8 x i32>@test_int_x86_avx2_vpdpbssds_256(<8 x i32> %x0, <32 x i8> %x1, ptr %x2p, <32 x i8> %x4) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx2_vpdpbssds_256( -; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <8 x i32> [[X4:%.*]]) #[[ATTR1]] { +; CHECK-SAME: <8 x i32> [[X0:%.*]], <32 x i8> [[X1:%.*]], ptr [[X2P:%.*]], <32 x i8> [[X4:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP13:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP32:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] @@ -239,22 +215,18 @@ define <8 x i32>@test_int_x86_avx2_vpdpbssds_256(<8 x i32> %x0, <8 x i32> %x1, p ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable ; CHECK: [[BB6]]: -; CHECK-NEXT: [[X2:%.*]] = load <8 x i32>, ptr [[X2P]], align 32 +; CHECK-NEXT: [[TMP30:%.*]] = load <32 x i8>, ptr [[X2P]], align 32 ; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[X2P]] to i64 ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i32>, ptr [[TMP9]], align 32 -; CHECK-NEXT: [[TMP29:%.*]] = bitcast <8 x i32> [[X1]] to <32 x i8> -; CHECK-NEXT: [[TMP30:%.*]] = bitcast <8 x i32> [[X2]] to <32 x i8> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8> -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x i32> [[_MSLD]] to <32 x i8> -; CHECK-NEXT: [[TMP14:%.*]] = icmp ne <32 x i8> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP29:%.*]] = load <32 x i8>, ptr [[TMP9]], align 32 ; CHECK-NEXT: [[TMP15:%.*]] = icmp ne <32 x i8> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <32 x i8> [[TMP29]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <32 x i8> [[X1]], zeroinitializer ; CHECK-NEXT: [[TMP17:%.*]] = icmp ne <32 x i8> [[TMP30]], zeroinitializer -; CHECK-NEXT: [[TMP18:%.*]] = and <32 x i1> [[TMP14]], [[TMP15]] -; CHECK-NEXT: [[TMP19:%.*]] = and <32 x i1> [[TMP16]], [[TMP15]] -; CHECK-NEXT: [[TMP20:%.*]] = and <32 x i1> [[TMP14]], [[TMP17]] +; CHECK-NEXT: [[TMP18:%.*]] = and <32 x i1> [[TMP15]], [[TMP16]] +; CHECK-NEXT: [[TMP19:%.*]] = and <32 x i1> [[TMP12]], [[TMP16]] +; CHECK-NEXT: [[TMP20:%.*]] = and <32 x i1> [[TMP15]], [[TMP17]] ; CHECK-NEXT: [[TMP21:%.*]] = or <32 x i1> [[TMP18]], [[TMP19]] ; CHECK-NEXT: [[TMP22:%.*]] = or <32 x i1> [[TMP21]], [[TMP20]] ; CHECK-NEXT: [[TMP23:%.*]] = sext <32 x i1> [[TMP22]] to <32 x i8> @@ -262,18 +234,14 @@ define <8 x 
i32>@test_int_x86_avx2_vpdpbssds_256(<8 x i32> %x0, <8 x i32> %x1, p ; CHECK-NEXT: [[TMP25:%.*]] = icmp ne <8 x i32> [[TMP24]], zeroinitializer ; CHECK-NEXT: [[TMP28:%.*]] = sext <8 x i1> [[TMP25]] to <8 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP28]], [[TMP5]] -; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) -; CHECK-NEXT: [[TMP31:%.*]] = bitcast <8 x i32> [[X1]] to <32 x i8> -; CHECK-NEXT: [[TMP32:%.*]] = bitcast <8 x i32> [[X4]] to <32 x i8> -; CHECK-NEXT: [[TMP33:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8> -; CHECK-NEXT: [[TMP34:%.*]] = bitcast <8 x i32> [[TMP4]] to <32 x i8> -; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <32 x i8> [[TMP33]], zeroinitializer -; CHECK-NEXT: [[TMP36:%.*]] = icmp ne <32 x i8> [[TMP34]], zeroinitializer -; CHECK-NEXT: [[TMP37:%.*]] = icmp ne <32 x i8> [[TMP31]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> [[X0]], <32 x i8> [[X1]], <32 x i8> [[TMP30]]) +; CHECK-NEXT: [[TMP26:%.*]] = icmp ne <32 x i8> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP38:%.*]] = icmp ne <32 x i8> [[TMP32]], zeroinitializer -; CHECK-NEXT: [[TMP39:%.*]] = and <32 x i1> [[TMP35]], [[TMP36]] -; CHECK-NEXT: [[TMP40:%.*]] = and <32 x i1> [[TMP37]], [[TMP36]] -; CHECK-NEXT: [[TMP41:%.*]] = and <32 x i1> [[TMP35]], [[TMP38]] +; CHECK-NEXT: [[TMP27:%.*]] = icmp ne <32 x i8> [[X1]], zeroinitializer +; CHECK-NEXT: [[TMP31:%.*]] = icmp ne <32 x i8> [[X4]], zeroinitializer +; CHECK-NEXT: [[TMP39:%.*]] = and <32 x i1> [[TMP26]], [[TMP38]] +; CHECK-NEXT: [[TMP40:%.*]] = and <32 x i1> [[TMP27]], [[TMP38]] +; CHECK-NEXT: [[TMP41:%.*]] = and <32 x i1> [[TMP26]], [[TMP31]] ; CHECK-NEXT: [[TMP42:%.*]] = or <32 x i1> [[TMP39]], [[TMP40]] ; CHECK-NEXT: [[TMP43:%.*]] = or <32 x i1> [[TMP42]], [[TMP41]] ; CHECK-NEXT: [[TMP44:%.*]] = sext <32 x i1> [[TMP43]] to <32 x i8> @@ -281,28 +249,28 @@ define <8 x i32>@test_int_x86_avx2_vpdpbssds_256(<8 x i32> %x0, <8 x i32> %x1, p ; CHECK-NEXT: [[TMP46:%.*]] = icmp ne <8 x i32> [[TMP45]], zeroinitializer ; CHECK-NEXT: [[TMP49:%.*]] = sext <8 x i1> [[TMP46]] to <8 x i32> ; CHECK-NEXT: [[_MSPROP3:%.*]] = or <8 x i32> [[TMP49]], [[TMP5]] -; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X4]]) +; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> [[X0]], <32 x i8> [[X1]], <32 x i8> [[X4]]) ; CHECK-NEXT: [[_MSPROP4:%.*]] = or <8 x i32> [[_MSPROP1]], [[_MSPROP3]] ; CHECK-NEXT: [[RES:%.*]] = add <8 x i32> [[TMP10]], [[TMP11]] ; CHECK-NEXT: store <8 x i32> [[_MSPROP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[RES]] ; - %x2 = load <8 x i32>, ptr %x2p - %1 = call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) - %2 = call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4) + %x2 = load <32 x i8>, ptr %x2p + %1 = call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2) + %2 = call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x4) %res = add <8 x i32> %1, %2 ret <8 x i32> %res } -declare <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32>, <16 x i8>, <16 x i8>) -define <4 x i32>@test_int_x86_avx2_vpdpbsud_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x i32> %x4) sanitize_memory { +define <4 x 
i32>@test_int_x86_avx2_vpdpbsud_128(<4 x i32> %x0, <16 x i8> %x1, ptr %x2p, <16 x i8> %x4) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx2_vpdpbsud_128( -; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <4 x i32> [[X4:%.*]]) #[[ATTR1]] { +; CHECK-SAME: <4 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], ptr [[X2P:%.*]], <16 x i8> [[X4:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] @@ -310,38 +278,62 @@ define <4 x i32>@test_int_x86_avx2_vpdpbsud_128(<4 x i32> %x0, <4 x i32> %x1, pt ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable ; CHECK: [[BB6]]: -; CHECK-NEXT: [[X2:%.*]] = load <4 x i32>, ptr [[X2P]], align 16 +; CHECK-NEXT: [[X2:%.*]] = load <16 x i8>, ptr [[X2P]], align 16 ; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[X2P]] to i64 ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP9]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[_MSLD]] -; CHECK-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) -; CHECK-NEXT: [[_MSPROP2:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP3:%.*]] = or <4 x i32> [[_MSPROP2]], [[TMP4]] -; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X4]]) +; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i8>, ptr [[TMP9]], align 16 +; CHECK-NEXT: [[TMP23:%.*]] = icmp ne <16 x i8> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = icmp ne <16 x i8> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <16 x i8> [[X1]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = icmp ne <16 x i8> [[X2]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = and <16 x i1> [[TMP23]], [[TMP24]] +; CHECK-NEXT: [[TMP15:%.*]] = and <16 x i1> [[TMP12]], [[TMP24]] +; CHECK-NEXT: [[TMP16:%.*]] = and <16 x i1> [[TMP23]], [[TMP13]] +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i1> [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = or <16 x i1> [[TMP17]], [[TMP16]] +; CHECK-NEXT: [[TMP19:%.*]] = sext <16 x i1> [[TMP18]] to <16 x i8> +; CHECK-NEXT: [[TMP20:%.*]] = bitcast <16 x i8> [[TMP19]] to <4 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <4 x i32> [[TMP20]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = sext <4 x i1> [[TMP21]] to <4 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP22]], [[TMP2]] +; CHECK-NEXT: [[TMP10:%.*]] = call <4 x i32> 
@llvm.x86.avx2.vpdpbsud.128(<4 x i32> [[X0]], <16 x i8> [[X1]], <16 x i8> [[X2]]) +; CHECK-NEXT: [[TMP25:%.*]] = icmp ne <16 x i8> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP26:%.*]] = icmp ne <16 x i8> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = icmp ne <16 x i8> [[X1]], zeroinitializer +; CHECK-NEXT: [[TMP28:%.*]] = icmp ne <16 x i8> [[X4]], zeroinitializer +; CHECK-NEXT: [[TMP29:%.*]] = and <16 x i1> [[TMP25]], [[TMP26]] +; CHECK-NEXT: [[TMP30:%.*]] = and <16 x i1> [[TMP27]], [[TMP26]] +; CHECK-NEXT: [[TMP31:%.*]] = and <16 x i1> [[TMP25]], [[TMP28]] +; CHECK-NEXT: [[TMP32:%.*]] = or <16 x i1> [[TMP29]], [[TMP30]] +; CHECK-NEXT: [[TMP33:%.*]] = or <16 x i1> [[TMP32]], [[TMP31]] +; CHECK-NEXT: [[TMP34:%.*]] = sext <16 x i1> [[TMP33]] to <16 x i8> +; CHECK-NEXT: [[TMP35:%.*]] = bitcast <16 x i8> [[TMP34]] to <4 x i32> +; CHECK-NEXT: [[TMP36:%.*]] = icmp ne <4 x i32> [[TMP35]], zeroinitializer +; CHECK-NEXT: [[TMP37:%.*]] = sext <4 x i1> [[TMP36]] to <4 x i32> +; CHECK-NEXT: [[_MSPROP3:%.*]] = or <4 x i32> [[TMP37]], [[TMP2]] +; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> [[X0]], <16 x i8> [[X1]], <16 x i8> [[X4]]) ; CHECK-NEXT: [[_MSPROP4:%.*]] = or <4 x i32> [[_MSPROP1]], [[_MSPROP3]] ; CHECK-NEXT: [[RES:%.*]] = add <4 x i32> [[TMP10]], [[TMP11]] ; CHECK-NEXT: store <4 x i32> [[_MSPROP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RES]] ; - %x2 = load <4 x i32>, ptr %x2p - %1 = call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) - %2 = call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4) + %x2 = load <16 x i8>, ptr %x2p + %1 = call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2) + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x4) %res = add <4 x i32> %1, %2 ret <4 x i32> %res } -declare <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32>, <16 x i8>, <16 x i8>) -define <4 x i32>@test_int_x86_avx2_vpdpbsuds_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x i32> %x4) sanitize_memory { +define <4 x i32>@test_int_x86_avx2_vpdpbsuds_128(<4 x i32> %x0, <16 x i8> %x1, ptr %x2p, <16 x i8> %x4) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx2_vpdpbsuds_128( -; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <4 x i32> [[X4:%.*]]) #[[ATTR1]] { +; CHECK-SAME: <4 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], ptr [[X2P:%.*]], <16 x i8> [[X4:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label 
%[[BB6:.*]], !prof [[PROF1]] @@ -349,38 +341,62 @@ define <4 x i32>@test_int_x86_avx2_vpdpbsuds_128(<4 x i32> %x0, <4 x i32> %x1, p ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable ; CHECK: [[BB6]]: -; CHECK-NEXT: [[X2:%.*]] = load <4 x i32>, ptr [[X2P]], align 16 +; CHECK-NEXT: [[X2:%.*]] = load <16 x i8>, ptr [[X2P]], align 16 ; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[X2P]] to i64 ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP9]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[_MSLD]] -; CHECK-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) -; CHECK-NEXT: [[_MSPROP2:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP3:%.*]] = or <4 x i32> [[_MSPROP2]], [[TMP4]] -; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X4]]) +; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i8>, ptr [[TMP9]], align 16 +; CHECK-NEXT: [[TMP23:%.*]] = icmp ne <16 x i8> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = icmp ne <16 x i8> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <16 x i8> [[X1]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = icmp ne <16 x i8> [[X2]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = and <16 x i1> [[TMP23]], [[TMP24]] +; CHECK-NEXT: [[TMP15:%.*]] = and <16 x i1> [[TMP12]], [[TMP24]] +; CHECK-NEXT: [[TMP16:%.*]] = and <16 x i1> [[TMP23]], [[TMP13]] +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i1> [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = or <16 x i1> [[TMP17]], [[TMP16]] +; CHECK-NEXT: [[TMP19:%.*]] = sext <16 x i1> [[TMP18]] to <16 x i8> +; CHECK-NEXT: [[TMP20:%.*]] = bitcast <16 x i8> [[TMP19]] to <4 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <4 x i32> [[TMP20]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = sext <4 x i1> [[TMP21]] to <4 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP22]], [[TMP2]] +; CHECK-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> [[X0]], <16 x i8> [[X1]], <16 x i8> [[X2]]) +; CHECK-NEXT: [[TMP25:%.*]] = icmp ne <16 x i8> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP26:%.*]] = icmp ne <16 x i8> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = icmp ne <16 x i8> [[X1]], zeroinitializer +; CHECK-NEXT: [[TMP28:%.*]] = icmp ne <16 x i8> [[X4]], zeroinitializer +; CHECK-NEXT: [[TMP29:%.*]] = and <16 x i1> [[TMP25]], [[TMP26]] +; CHECK-NEXT: [[TMP30:%.*]] = and <16 x i1> [[TMP27]], [[TMP26]] +; CHECK-NEXT: [[TMP31:%.*]] = and <16 x i1> [[TMP25]], [[TMP28]] +; CHECK-NEXT: [[TMP32:%.*]] = or <16 x i1> [[TMP29]], [[TMP30]] +; CHECK-NEXT: [[TMP33:%.*]] = or <16 x i1> [[TMP32]], [[TMP31]] +; CHECK-NEXT: [[TMP34:%.*]] = sext <16 x i1> [[TMP33]] to <16 x i8> +; CHECK-NEXT: [[TMP35:%.*]] = bitcast <16 x i8> [[TMP34]] to <4 x i32> +; CHECK-NEXT: [[TMP36:%.*]] = icmp ne <4 x i32> [[TMP35]], zeroinitializer +; CHECK-NEXT: [[TMP37:%.*]] = sext <4 x i1> [[TMP36]] to <4 x i32> +; CHECK-NEXT: [[_MSPROP3:%.*]] = or <4 x i32> [[TMP37]], [[TMP2]] +; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> [[X0]], <16 x i8> [[X1]], <16 x i8> [[X4]]) ; CHECK-NEXT: [[_MSPROP4:%.*]] = or <4 x i32> [[_MSPROP1]], [[_MSPROP3]] ; CHECK-NEXT: [[RES:%.*]] = add <4 x i32> [[TMP10]], 
[[TMP11]] ; CHECK-NEXT: store <4 x i32> [[_MSPROP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RES]] ; - %x2 = load <4 x i32>, ptr %x2p - %1 = call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) - %2 = call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4) + %x2 = load <16 x i8>, ptr %x2p + %1 = call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2) + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x4) %res = add <4 x i32> %1, %2 ret <4 x i32> %res } -declare <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32>, <32 x i8>, <32 x i8>) -define <8 x i32>@test_int_x86_avx2_vpdpbsud_256(<8 x i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4) sanitize_memory { +define <8 x i32>@test_int_x86_avx2_vpdpbsud_256(<8 x i32> %x0, <32 x i8> %x1, ptr %x2p, <32 x i8> %x4) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx2_vpdpbsud_256( -; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <8 x i32> [[X4:%.*]]) #[[ATTR1]] { +; CHECK-SAME: <8 x i32> [[X0:%.*]], <32 x i8> [[X1:%.*]], ptr [[X2P:%.*]], <32 x i8> [[X4:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] @@ -388,38 +404,62 @@ define <8 x i32>@test_int_x86_avx2_vpdpbsud_256(<8 x i32> %x0, <8 x i32> %x1, pt ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable ; CHECK: [[BB6]]: -; CHECK-NEXT: [[X2:%.*]] = load <8 x i32>, ptr [[X2P]], align 32 +; CHECK-NEXT: [[X2:%.*]] = load <32 x i8>, ptr [[X2P]], align 32 ; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[X2P]] to i64 ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i32>, ptr [[TMP9]], align 32 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[_MSLD]] -; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) -; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i32> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP3:%.*]] = or <8 x i32> [[_MSPROP2]], [[TMP4]] -; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X4]]) +; CHECK-NEXT: [[_MSLD:%.*]] = load <32 x i8>, ptr [[TMP9]], align 32 +; CHECK-NEXT: [[TMP23:%.*]] = icmp ne <32 x i8> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = 
icmp ne <32 x i8> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <32 x i8> [[X1]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = icmp ne <32 x i8> [[X2]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = and <32 x i1> [[TMP23]], [[TMP24]] +; CHECK-NEXT: [[TMP15:%.*]] = and <32 x i1> [[TMP12]], [[TMP24]] +; CHECK-NEXT: [[TMP16:%.*]] = and <32 x i1> [[TMP23]], [[TMP13]] +; CHECK-NEXT: [[TMP17:%.*]] = or <32 x i1> [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = or <32 x i1> [[TMP17]], [[TMP16]] +; CHECK-NEXT: [[TMP19:%.*]] = sext <32 x i1> [[TMP18]] to <32 x i8> +; CHECK-NEXT: [[TMP20:%.*]] = bitcast <32 x i8> [[TMP19]] to <8 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <8 x i32> [[TMP20]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = sext <8 x i1> [[TMP21]] to <8 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP22]], [[TMP2]] +; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> [[X0]], <32 x i8> [[X1]], <32 x i8> [[X2]]) +; CHECK-NEXT: [[TMP25:%.*]] = icmp ne <32 x i8> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP26:%.*]] = icmp ne <32 x i8> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = icmp ne <32 x i8> [[X1]], zeroinitializer +; CHECK-NEXT: [[TMP28:%.*]] = icmp ne <32 x i8> [[X4]], zeroinitializer +; CHECK-NEXT: [[TMP29:%.*]] = and <32 x i1> [[TMP25]], [[TMP26]] +; CHECK-NEXT: [[TMP30:%.*]] = and <32 x i1> [[TMP27]], [[TMP26]] +; CHECK-NEXT: [[TMP31:%.*]] = and <32 x i1> [[TMP25]], [[TMP28]] +; CHECK-NEXT: [[TMP32:%.*]] = or <32 x i1> [[TMP29]], [[TMP30]] +; CHECK-NEXT: [[TMP33:%.*]] = or <32 x i1> [[TMP32]], [[TMP31]] +; CHECK-NEXT: [[TMP34:%.*]] = sext <32 x i1> [[TMP33]] to <32 x i8> +; CHECK-NEXT: [[TMP35:%.*]] = bitcast <32 x i8> [[TMP34]] to <8 x i32> +; CHECK-NEXT: [[TMP36:%.*]] = icmp ne <8 x i32> [[TMP35]], zeroinitializer +; CHECK-NEXT: [[TMP37:%.*]] = sext <8 x i1> [[TMP36]] to <8 x i32> +; CHECK-NEXT: [[_MSPROP3:%.*]] = or <8 x i32> [[TMP37]], [[TMP2]] +; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> [[X0]], <32 x i8> [[X1]], <32 x i8> [[X4]]) ; CHECK-NEXT: [[_MSPROP4:%.*]] = or <8 x i32> [[_MSPROP1]], [[_MSPROP3]] ; CHECK-NEXT: [[RES:%.*]] = add <8 x i32> [[TMP10]], [[TMP11]] ; CHECK-NEXT: store <8 x i32> [[_MSPROP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[RES]] ; - %x2 = load <8 x i32>, ptr %x2p - %1 = call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) - %2 = call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4) + %x2 = load <32 x i8>, ptr %x2p + %1 = call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2) + %2 = call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x4) %res = add <8 x i32> %1, %2 ret <8 x i32> %res } -declare <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32>, <32 x i8>, <32 x i8>) -define <8 x i32>@test_int_x86_avx2_vpdpbsuds_256(<8 x i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4) sanitize_memory { +define <8 x i32>@test_int_x86_avx2_vpdpbsuds_256(<8 x i32> %x0, <32 x i8> %x1, ptr %x2p, <32 x i8> %x4) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx2_vpdpbsuds_256( -; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <8 x i32> [[X4:%.*]]) #[[ATTR1]] { +; CHECK-SAME: <8 x i32> [[X0:%.*]], <32 x i8> [[X1:%.*]], ptr [[X2P:%.*]], <32 x i8> [[X4:%.*]]) #[[ATTR1]] { ; 
CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] @@ -427,38 +467,62 @@ define <8 x i32>@test_int_x86_avx2_vpdpbsuds_256(<8 x i32> %x0, <8 x i32> %x1, p ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable ; CHECK: [[BB6]]: -; CHECK-NEXT: [[X2:%.*]] = load <8 x i32>, ptr [[X2P]], align 32 +; CHECK-NEXT: [[X2:%.*]] = load <32 x i8>, ptr [[X2P]], align 32 ; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[X2P]] to i64 ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i32>, ptr [[TMP9]], align 32 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[_MSLD]] -; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) -; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i32> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP3:%.*]] = or <8 x i32> [[_MSPROP2]], [[TMP4]] -; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X4]]) +; CHECK-NEXT: [[_MSLD:%.*]] = load <32 x i8>, ptr [[TMP9]], align 32 +; CHECK-NEXT: [[TMP23:%.*]] = icmp ne <32 x i8> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = icmp ne <32 x i8> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <32 x i8> [[X1]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = icmp ne <32 x i8> [[X2]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = and <32 x i1> [[TMP23]], [[TMP24]] +; CHECK-NEXT: [[TMP15:%.*]] = and <32 x i1> [[TMP12]], [[TMP24]] +; CHECK-NEXT: [[TMP16:%.*]] = and <32 x i1> [[TMP23]], [[TMP13]] +; CHECK-NEXT: [[TMP17:%.*]] = or <32 x i1> [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = or <32 x i1> [[TMP17]], [[TMP16]] +; CHECK-NEXT: [[TMP19:%.*]] = sext <32 x i1> [[TMP18]] to <32 x i8> +; CHECK-NEXT: [[TMP20:%.*]] = bitcast <32 x i8> [[TMP19]] to <8 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <8 x i32> [[TMP20]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = sext <8 x i1> [[TMP21]] to <8 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP22]], [[TMP2]] +; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> [[X0]], <32 x i8> [[X1]], <32 x i8> [[X2]]) +; CHECK-NEXT: [[TMP25:%.*]] = icmp ne <32 x i8> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP26:%.*]] = icmp ne <32 x i8> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = icmp ne <32 x i8> [[X1]], zeroinitializer +; CHECK-NEXT: [[TMP28:%.*]] = icmp ne <32 x i8> [[X4]], zeroinitializer +; CHECK-NEXT: 
[[TMP29:%.*]] = and <32 x i1> [[TMP25]], [[TMP26]] +; CHECK-NEXT: [[TMP30:%.*]] = and <32 x i1> [[TMP27]], [[TMP26]] +; CHECK-NEXT: [[TMP31:%.*]] = and <32 x i1> [[TMP25]], [[TMP28]] +; CHECK-NEXT: [[TMP32:%.*]] = or <32 x i1> [[TMP29]], [[TMP30]] +; CHECK-NEXT: [[TMP33:%.*]] = or <32 x i1> [[TMP32]], [[TMP31]] +; CHECK-NEXT: [[TMP34:%.*]] = sext <32 x i1> [[TMP33]] to <32 x i8> +; CHECK-NEXT: [[TMP35:%.*]] = bitcast <32 x i8> [[TMP34]] to <8 x i32> +; CHECK-NEXT: [[TMP36:%.*]] = icmp ne <8 x i32> [[TMP35]], zeroinitializer +; CHECK-NEXT: [[TMP37:%.*]] = sext <8 x i1> [[TMP36]] to <8 x i32> +; CHECK-NEXT: [[_MSPROP3:%.*]] = or <8 x i32> [[TMP37]], [[TMP2]] +; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> [[X0]], <32 x i8> [[X1]], <32 x i8> [[X4]]) ; CHECK-NEXT: [[_MSPROP4:%.*]] = or <8 x i32> [[_MSPROP1]], [[_MSPROP3]] ; CHECK-NEXT: [[RES:%.*]] = add <8 x i32> [[TMP10]], [[TMP11]] ; CHECK-NEXT: store <8 x i32> [[_MSPROP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[RES]] ; - %x2 = load <8 x i32>, ptr %x2p - %1 = call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) - %2 = call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4) + %x2 = load <32 x i8>, ptr %x2p + %1 = call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2) + %2 = call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x4) %res = add <8 x i32> %1, %2 ret <8 x i32> %res } -declare <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32>, <16 x i8>, <16 x i8>) -define <4 x i32>@test_int_x86_avx2_vpdpbuud_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x i32> %x4) sanitize_memory { +define <4 x i32>@test_int_x86_avx2_vpdpbuud_128(<4 x i32> %x0, <16 x i8> %x1, ptr %x2p, <16 x i8> %x4) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx2_vpdpbuud_128( -; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <4 x i32> [[X4:%.*]]) #[[ATTR1]] { +; CHECK-SAME: <4 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], ptr [[X2P:%.*]], <16 x i8> [[X4:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] @@ -466,38 +530,62 @@ define <4 x i32>@test_int_x86_avx2_vpdpbuud_128(<4 x i32> %x0, <4 x i32> %x1, pt ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable ; CHECK: [[BB6]]: -; CHECK-NEXT: [[X2:%.*]] = load <4 x i32>, ptr [[X2P]], align 16 +; CHECK-NEXT: [[X2:%.*]] = load <16 x i8>, ptr [[X2P]], align 16 ; CHECK-NEXT: [[TMP7:%.*]] = 
ptrtoint ptr [[X2P]] to i64 ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP9]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[_MSLD]] -; CHECK-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) -; CHECK-NEXT: [[_MSPROP2:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP3:%.*]] = or <4 x i32> [[_MSPROP2]], [[TMP4]] -; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X4]]) +; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i8>, ptr [[TMP9]], align 16 +; CHECK-NEXT: [[TMP23:%.*]] = icmp ne <16 x i8> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = icmp ne <16 x i8> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <16 x i8> [[X1]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = icmp ne <16 x i8> [[X2]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = and <16 x i1> [[TMP23]], [[TMP24]] +; CHECK-NEXT: [[TMP15:%.*]] = and <16 x i1> [[TMP12]], [[TMP24]] +; CHECK-NEXT: [[TMP16:%.*]] = and <16 x i1> [[TMP23]], [[TMP13]] +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i1> [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = or <16 x i1> [[TMP17]], [[TMP16]] +; CHECK-NEXT: [[TMP19:%.*]] = sext <16 x i1> [[TMP18]] to <16 x i8> +; CHECK-NEXT: [[TMP20:%.*]] = bitcast <16 x i8> [[TMP19]] to <4 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <4 x i32> [[TMP20]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = sext <4 x i1> [[TMP21]] to <4 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP22]], [[TMP2]] +; CHECK-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> [[X0]], <16 x i8> [[X1]], <16 x i8> [[X2]]) +; CHECK-NEXT: [[TMP25:%.*]] = icmp ne <16 x i8> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP26:%.*]] = icmp ne <16 x i8> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = icmp ne <16 x i8> [[X1]], zeroinitializer +; CHECK-NEXT: [[TMP28:%.*]] = icmp ne <16 x i8> [[X4]], zeroinitializer +; CHECK-NEXT: [[TMP29:%.*]] = and <16 x i1> [[TMP25]], [[TMP26]] +; CHECK-NEXT: [[TMP30:%.*]] = and <16 x i1> [[TMP27]], [[TMP26]] +; CHECK-NEXT: [[TMP31:%.*]] = and <16 x i1> [[TMP25]], [[TMP28]] +; CHECK-NEXT: [[TMP32:%.*]] = or <16 x i1> [[TMP29]], [[TMP30]] +; CHECK-NEXT: [[TMP33:%.*]] = or <16 x i1> [[TMP32]], [[TMP31]] +; CHECK-NEXT: [[TMP34:%.*]] = sext <16 x i1> [[TMP33]] to <16 x i8> +; CHECK-NEXT: [[TMP35:%.*]] = bitcast <16 x i8> [[TMP34]] to <4 x i32> +; CHECK-NEXT: [[TMP36:%.*]] = icmp ne <4 x i32> [[TMP35]], zeroinitializer +; CHECK-NEXT: [[TMP37:%.*]] = sext <4 x i1> [[TMP36]] to <4 x i32> +; CHECK-NEXT: [[_MSPROP3:%.*]] = or <4 x i32> [[TMP37]], [[TMP2]] +; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> [[X0]], <16 x i8> [[X1]], <16 x i8> [[X4]]) ; CHECK-NEXT: [[_MSPROP4:%.*]] = or <4 x i32> [[_MSPROP1]], [[_MSPROP3]] ; CHECK-NEXT: [[RES:%.*]] = add <4 x i32> [[TMP10]], [[TMP11]] ; CHECK-NEXT: store <4 x i32> [[_MSPROP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RES]] ; - %x2 = load <4 x i32>, ptr %x2p - %1 = call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) - %2 = call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4) + %x2 = load <16 x i8>, ptr %x2p + %1 = call <4 x i32> 
@llvm.x86.avx2.vpdpbuud.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2) + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x4) %res = add <4 x i32> %1, %2 ret <4 x i32> %res } -declare <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32>, <16 x i8>, <16 x i8>) -define <4 x i32>@test_int_x86_avx2_vpdpbuuds_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x i32> %x4) sanitize_memory { +define <4 x i32>@test_int_x86_avx2_vpdpbuuds_128(<4 x i32> %x0, <16 x i8> %x1, ptr %x2p, <16 x i8> %x4) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx2_vpdpbuuds_128( -; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <4 x i32> [[X4:%.*]]) #[[ATTR1]] { +; CHECK-SAME: <4 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], ptr [[X2P:%.*]], <16 x i8> [[X4:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] @@ -505,38 +593,62 @@ define <4 x i32>@test_int_x86_avx2_vpdpbuuds_128(<4 x i32> %x0, <4 x i32> %x1, p ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable ; CHECK: [[BB6]]: -; CHECK-NEXT: [[X2:%.*]] = load <4 x i32>, ptr [[X2P]], align 16 +; CHECK-NEXT: [[X2:%.*]] = load <16 x i8>, ptr [[X2P]], align 16 ; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[X2P]] to i64 ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP9]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[_MSLD]] -; CHECK-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) -; CHECK-NEXT: [[_MSPROP2:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP3:%.*]] = or <4 x i32> [[_MSPROP2]], [[TMP4]] -; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X4]]) +; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i8>, ptr [[TMP9]], align 16 +; CHECK-NEXT: [[TMP23:%.*]] = icmp ne <16 x i8> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = icmp ne <16 x i8> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <16 x i8> [[X1]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = icmp ne <16 x i8> [[X2]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = and <16 x i1> [[TMP23]], [[TMP24]] +; CHECK-NEXT: [[TMP15:%.*]] = and <16 x i1> [[TMP12]], [[TMP24]] +; CHECK-NEXT: [[TMP16:%.*]] = and <16 x i1> [[TMP23]], [[TMP13]] +; 
CHECK-NEXT: [[TMP17:%.*]] = or <16 x i1> [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = or <16 x i1> [[TMP17]], [[TMP16]] +; CHECK-NEXT: [[TMP19:%.*]] = sext <16 x i1> [[TMP18]] to <16 x i8> +; CHECK-NEXT: [[TMP20:%.*]] = bitcast <16 x i8> [[TMP19]] to <4 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <4 x i32> [[TMP20]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = sext <4 x i1> [[TMP21]] to <4 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP22]], [[TMP2]] +; CHECK-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> [[X0]], <16 x i8> [[X1]], <16 x i8> [[X2]]) +; CHECK-NEXT: [[TMP25:%.*]] = icmp ne <16 x i8> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP26:%.*]] = icmp ne <16 x i8> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = icmp ne <16 x i8> [[X1]], zeroinitializer +; CHECK-NEXT: [[TMP28:%.*]] = icmp ne <16 x i8> [[X4]], zeroinitializer +; CHECK-NEXT: [[TMP29:%.*]] = and <16 x i1> [[TMP25]], [[TMP26]] +; CHECK-NEXT: [[TMP30:%.*]] = and <16 x i1> [[TMP27]], [[TMP26]] +; CHECK-NEXT: [[TMP31:%.*]] = and <16 x i1> [[TMP25]], [[TMP28]] +; CHECK-NEXT: [[TMP32:%.*]] = or <16 x i1> [[TMP29]], [[TMP30]] +; CHECK-NEXT: [[TMP33:%.*]] = or <16 x i1> [[TMP32]], [[TMP31]] +; CHECK-NEXT: [[TMP34:%.*]] = sext <16 x i1> [[TMP33]] to <16 x i8> +; CHECK-NEXT: [[TMP35:%.*]] = bitcast <16 x i8> [[TMP34]] to <4 x i32> +; CHECK-NEXT: [[TMP36:%.*]] = icmp ne <4 x i32> [[TMP35]], zeroinitializer +; CHECK-NEXT: [[TMP37:%.*]] = sext <4 x i1> [[TMP36]] to <4 x i32> +; CHECK-NEXT: [[_MSPROP3:%.*]] = or <4 x i32> [[TMP37]], [[TMP2]] +; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> [[X0]], <16 x i8> [[X1]], <16 x i8> [[X4]]) ; CHECK-NEXT: [[_MSPROP4:%.*]] = or <4 x i32> [[_MSPROP1]], [[_MSPROP3]] ; CHECK-NEXT: [[RES:%.*]] = add <4 x i32> [[TMP10]], [[TMP11]] ; CHECK-NEXT: store <4 x i32> [[_MSPROP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RES]] ; - %x2 = load <4 x i32>, ptr %x2p - %1 = call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) - %2 = call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4) + %x2 = load <16 x i8>, ptr %x2p + %1 = call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2) + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x4) %res = add <4 x i32> %1, %2 ret <4 x i32> %res } -declare <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32>, <32 x i8>, <32 x i8>) -define <8 x i32>@test_int_x86_avx2_vpdpbuud_256(<8 x i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4) sanitize_memory { +define <8 x i32>@test_int_x86_avx2_vpdpbuud_256(<8 x i32> %x0, <32 x i8> %x1, ptr %x2p, <32 x i8> %x4) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx2_vpdpbuud_256( -; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <8 x i32> [[X4:%.*]]) #[[ATTR1]] { +; CHECK-SAME: <8 x i32> [[X0:%.*]], <32 x i8> [[X1:%.*]], ptr [[X2P:%.*]], <32 x i8> [[X4:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr 
inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] @@ -544,38 +656,62 @@ define <8 x i32>@test_int_x86_avx2_vpdpbuud_256(<8 x i32> %x0, <8 x i32> %x1, pt ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable ; CHECK: [[BB6]]: -; CHECK-NEXT: [[X2:%.*]] = load <8 x i32>, ptr [[X2P]], align 32 +; CHECK-NEXT: [[X2:%.*]] = load <32 x i8>, ptr [[X2P]], align 32 ; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[X2P]] to i64 ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i32>, ptr [[TMP9]], align 32 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[_MSLD]] -; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) -; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i32> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP3:%.*]] = or <8 x i32> [[_MSPROP2]], [[TMP4]] -; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X4]]) +; CHECK-NEXT: [[_MSLD:%.*]] = load <32 x i8>, ptr [[TMP9]], align 32 +; CHECK-NEXT: [[TMP23:%.*]] = icmp ne <32 x i8> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = icmp ne <32 x i8> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <32 x i8> [[X1]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = icmp ne <32 x i8> [[X2]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = and <32 x i1> [[TMP23]], [[TMP24]] +; CHECK-NEXT: [[TMP15:%.*]] = and <32 x i1> [[TMP12]], [[TMP24]] +; CHECK-NEXT: [[TMP16:%.*]] = and <32 x i1> [[TMP23]], [[TMP13]] +; CHECK-NEXT: [[TMP17:%.*]] = or <32 x i1> [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = or <32 x i1> [[TMP17]], [[TMP16]] +; CHECK-NEXT: [[TMP19:%.*]] = sext <32 x i1> [[TMP18]] to <32 x i8> +; CHECK-NEXT: [[TMP20:%.*]] = bitcast <32 x i8> [[TMP19]] to <8 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <8 x i32> [[TMP20]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = sext <8 x i1> [[TMP21]] to <8 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP22]], [[TMP2]] +; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> [[X0]], <32 x i8> [[X1]], <32 x i8> [[X2]]) +; CHECK-NEXT: [[TMP25:%.*]] = icmp ne <32 x i8> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP26:%.*]] = icmp ne <32 x i8> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = icmp ne <32 x i8> [[X1]], zeroinitializer +; CHECK-NEXT: [[TMP28:%.*]] = icmp ne <32 x i8> [[X4]], zeroinitializer +; CHECK-NEXT: [[TMP29:%.*]] = and <32 x i1> [[TMP25]], [[TMP26]] +; CHECK-NEXT: [[TMP30:%.*]] = and <32 x i1> [[TMP27]], [[TMP26]] +; CHECK-NEXT: [[TMP31:%.*]] = and <32 x i1> [[TMP25]], [[TMP28]] +; CHECK-NEXT: [[TMP32:%.*]] = or <32 x i1> [[TMP29]], [[TMP30]] +; CHECK-NEXT: [[TMP33:%.*]] = or <32 x i1> [[TMP32]], [[TMP31]] +; CHECK-NEXT: [[TMP34:%.*]] = sext <32 x i1> [[TMP33]] to <32 x i8> +; CHECK-NEXT: 
[[TMP35:%.*]] = bitcast <32 x i8> [[TMP34]] to <8 x i32> +; CHECK-NEXT: [[TMP36:%.*]] = icmp ne <8 x i32> [[TMP35]], zeroinitializer +; CHECK-NEXT: [[TMP37:%.*]] = sext <8 x i1> [[TMP36]] to <8 x i32> +; CHECK-NEXT: [[_MSPROP3:%.*]] = or <8 x i32> [[TMP37]], [[TMP2]] +; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> [[X0]], <32 x i8> [[X1]], <32 x i8> [[X4]]) ; CHECK-NEXT: [[_MSPROP4:%.*]] = or <8 x i32> [[_MSPROP1]], [[_MSPROP3]] ; CHECK-NEXT: [[RES:%.*]] = add <8 x i32> [[TMP10]], [[TMP11]] ; CHECK-NEXT: store <8 x i32> [[_MSPROP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[RES]] ; - %x2 = load <8 x i32>, ptr %x2p - %1 = call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) - %2 = call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4) + %x2 = load <32 x i8>, ptr %x2p + %1 = call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2) + %2 = call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x4) %res = add <8 x i32> %1, %2 ret <8 x i32> %res } -declare <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32>, <32 x i8>, <32 x i8>) -define <8 x i32>@test_int_x86_avx2_vpdpbuuds_256(<8 x i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4) sanitize_memory { +define <8 x i32>@test_int_x86_avx2_vpdpbuuds_256(<8 x i32> %x0, <32 x i8> %x1, ptr %x2p, <32 x i8> %x4) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx2_vpdpbuuds_256( -; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <8 x i32> [[X4:%.*]]) #[[ATTR1]] { +; CHECK-SAME: <8 x i32> [[X0:%.*]], <32 x i8> [[X1:%.*]], ptr [[X2P:%.*]], <32 x i8> [[X4:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] @@ -583,25 +719,49 @@ define <8 x i32>@test_int_x86_avx2_vpdpbuuds_256(<8 x i32> %x0, <8 x i32> %x1, p ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable ; CHECK: [[BB6]]: -; CHECK-NEXT: [[X2:%.*]] = load <8 x i32>, ptr [[X2P]], align 32 +; CHECK-NEXT: [[X2:%.*]] = load <32 x i8>, ptr [[X2P]], align 32 ; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[X2P]] to i64 ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i32>, ptr [[TMP9]], align 32 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[_MSLD]] -; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i32> 
@llvm.x86.avx2.vpdpbuuds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) -; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i32> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP3:%.*]] = or <8 x i32> [[_MSPROP2]], [[TMP4]] -; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X4]]) +; CHECK-NEXT: [[_MSLD:%.*]] = load <32 x i8>, ptr [[TMP9]], align 32 +; CHECK-NEXT: [[TMP23:%.*]] = icmp ne <32 x i8> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = icmp ne <32 x i8> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <32 x i8> [[X1]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = icmp ne <32 x i8> [[X2]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = and <32 x i1> [[TMP23]], [[TMP24]] +; CHECK-NEXT: [[TMP15:%.*]] = and <32 x i1> [[TMP12]], [[TMP24]] +; CHECK-NEXT: [[TMP16:%.*]] = and <32 x i1> [[TMP23]], [[TMP13]] +; CHECK-NEXT: [[TMP17:%.*]] = or <32 x i1> [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = or <32 x i1> [[TMP17]], [[TMP16]] +; CHECK-NEXT: [[TMP19:%.*]] = sext <32 x i1> [[TMP18]] to <32 x i8> +; CHECK-NEXT: [[TMP20:%.*]] = bitcast <32 x i8> [[TMP19]] to <8 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <8 x i32> [[TMP20]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = sext <8 x i1> [[TMP21]] to <8 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP22]], [[TMP2]] +; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> [[X0]], <32 x i8> [[X1]], <32 x i8> [[X2]]) +; CHECK-NEXT: [[TMP25:%.*]] = icmp ne <32 x i8> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP26:%.*]] = icmp ne <32 x i8> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = icmp ne <32 x i8> [[X1]], zeroinitializer +; CHECK-NEXT: [[TMP28:%.*]] = icmp ne <32 x i8> [[X4]], zeroinitializer +; CHECK-NEXT: [[TMP29:%.*]] = and <32 x i1> [[TMP25]], [[TMP26]] +; CHECK-NEXT: [[TMP30:%.*]] = and <32 x i1> [[TMP27]], [[TMP26]] +; CHECK-NEXT: [[TMP31:%.*]] = and <32 x i1> [[TMP25]], [[TMP28]] +; CHECK-NEXT: [[TMP32:%.*]] = or <32 x i1> [[TMP29]], [[TMP30]] +; CHECK-NEXT: [[TMP33:%.*]] = or <32 x i1> [[TMP32]], [[TMP31]] +; CHECK-NEXT: [[TMP34:%.*]] = sext <32 x i1> [[TMP33]] to <32 x i8> +; CHECK-NEXT: [[TMP35:%.*]] = bitcast <32 x i8> [[TMP34]] to <8 x i32> +; CHECK-NEXT: [[TMP36:%.*]] = icmp ne <8 x i32> [[TMP35]], zeroinitializer +; CHECK-NEXT: [[TMP37:%.*]] = sext <8 x i1> [[TMP36]] to <8 x i32> +; CHECK-NEXT: [[_MSPROP3:%.*]] = or <8 x i32> [[TMP37]], [[TMP2]] +; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> [[X0]], <32 x i8> [[X1]], <32 x i8> [[X4]]) ; CHECK-NEXT: [[_MSPROP4:%.*]] = or <8 x i32> [[_MSPROP1]], [[_MSPROP3]] ; CHECK-NEXT: [[RES:%.*]] = add <8 x i32> [[TMP10]], [[TMP11]] ; CHECK-NEXT: store <8 x i32> [[_MSPROP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[RES]] ; - %x2 = load <8 x i32>, ptr %x2p - %1 = call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) - %2 = call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4) + %x2 = load <32 x i8>, ptr %x2p + %1 = call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2) + %2 = call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x4) %res = add <8 x i32> %1, %2 ret <8 x i32> %res } diff --git a/llvm/test/Instrumentation/MemorySanitizer/array_types.ll b/llvm/test/Instrumentation/MemorySanitizer/array_types.ll index 236b019..ddebe3e 100644 --- 
a/llvm/test/Instrumentation/MemorySanitizer/array_types.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/array_types.ll @@ -1,89 +1,194 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt < %s -msan-check-access-address=0 -S -passes=msan 2>&1 | FileCheck %s -; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=1 -S -passes=msan 2>&1 | FileCheck -check-prefix=CHECK %s --allow-empty +; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=1 -S -passes=msan 2>&1 | FileCheck -check-prefix=CHECK-ORIGIN %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" define [2 x i32] @InsertValue(i32 %x, i32 %y) sanitize_memory { +; CHECK-LABEL: define [2 x i32] @InsertValue( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = insertvalue [2 x i32] [i32 -1, i32 -1], i32 [[TMP0]], 0 +; CHECK-NEXT: [[A:%.*]] = insertvalue [2 x i32] undef, i32 [[X]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertvalue [2 x i32] [[TMP2]], i32 [[TMP1]], 1 +; CHECK-NEXT: [[B:%.*]] = insertvalue [2 x i32] [[A]], i32 [[Y]], 1 +; CHECK-NEXT: store [2 x i32] [[TMP3]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret [2 x i32] [[B]] +; +; CHECK-ORIGIN-LABEL: define [2 x i32] @InsertValue( +; CHECK-ORIGIN-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-ORIGIN-NEXT: [[ENTRY:.*:]] +; CHECK-ORIGIN-NEXT: [[TMP0:%.*]] = load i32, ptr @__msan_param_tls, align 8 +; CHECK-ORIGIN-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4 +; CHECK-ORIGIN-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-ORIGIN-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4 +; CHECK-ORIGIN-NEXT: call void @llvm.donothing() +; CHECK-ORIGIN-NEXT: [[TMP4:%.*]] = insertvalue [2 x i32] [i32 -1, i32 -1], i32 [[TMP0]], 0 +; CHECK-ORIGIN-NEXT: [[TMP5:%.*]] = icmp ne i32 [[TMP0]], 0 +; CHECK-ORIGIN-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP1]], i32 0 +; CHECK-ORIGIN-NEXT: [[A:%.*]] = insertvalue [2 x i32] undef, i32 [[X]], 0 +; CHECK-ORIGIN-NEXT: [[TMP7:%.*]] = insertvalue [2 x i32] [[TMP4]], i32 [[TMP2]], 1 +; CHECK-ORIGIN-NEXT: [[TMP8:%.*]] = icmp ne i32 [[TMP2]], 0 +; CHECK-ORIGIN-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i32 [[TMP3]], i32 [[TMP6]] +; CHECK-ORIGIN-NEXT: [[B:%.*]] = insertvalue [2 x i32] [[A]], i32 [[Y]], 1 +; CHECK-ORIGIN-NEXT: store [2 x i32] [[TMP7]], ptr @__msan_retval_tls, align 8 +; CHECK-ORIGIN-NEXT: store i32 [[TMP9]], ptr @__msan_retval_origin_tls, align 4 +; CHECK-ORIGIN-NEXT: ret [2 x i32] [[B]] +; entry: %a = insertvalue [2 x i32] undef, i32 %x, 0 %b = insertvalue [2 x i32] %a, i32 %y, 1 ret [2 x i32] %b } -; CHECK-LABEL: @InsertValue( -; CHECK-DAG: [[Sx:%.*]] = load i32, ptr @__msan_param_tls -; CHECK-DAG: [[Sy:%.*]] = load i32, ptr {{.*}}@__msan_param_tls to i64), i64 8) -; CHECK: [[A:%.*]] = insertvalue [2 x i32] [i32 -1, i32 -1], i32 [[Sx]], 0 -; CHECK: [[B:%.*]] = insertvalue [2 x i32] [[A]], i32 
[[Sy]], 1 -; CHECK: store [2 x i32] [[B]], ptr {{.*}}@__msan_retval_tls -; CHECK: ret [2 x i32] - - define [2 x double] @InsertValueDouble(double %x, double %y) sanitize_memory { +; CHECK-LABEL: define [2 x double] @InsertValueDouble( +; CHECK-SAME: double [[X:%.*]], double [[Y:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = insertvalue [2 x i64] [i64 -1, i64 -1], i64 [[TMP0]], 0 +; CHECK-NEXT: [[A:%.*]] = insertvalue [2 x double] undef, double [[X]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertvalue [2 x i64] [[TMP2]], i64 [[TMP1]], 1 +; CHECK-NEXT: [[B:%.*]] = insertvalue [2 x double] [[A]], double [[Y]], 1 +; CHECK-NEXT: store [2 x i64] [[TMP3]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret [2 x double] [[B]] +; +; CHECK-ORIGIN-LABEL: define [2 x double] @InsertValueDouble( +; CHECK-ORIGIN-SAME: double [[X:%.*]], double [[Y:%.*]]) #[[ATTR0]] { +; CHECK-ORIGIN-NEXT: [[ENTRY:.*:]] +; CHECK-ORIGIN-NEXT: [[TMP0:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-ORIGIN-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4 +; CHECK-ORIGIN-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-ORIGIN-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4 +; CHECK-ORIGIN-NEXT: call void @llvm.donothing() +; CHECK-ORIGIN-NEXT: [[TMP4:%.*]] = insertvalue [2 x i64] [i64 -1, i64 -1], i64 [[TMP0]], 0 +; CHECK-ORIGIN-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP0]], 0 +; CHECK-ORIGIN-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP1]], i32 0 +; CHECK-ORIGIN-NEXT: [[A:%.*]] = insertvalue [2 x double] undef, double [[X]], 0 +; CHECK-ORIGIN-NEXT: [[TMP7:%.*]] = insertvalue [2 x i64] [[TMP4]], i64 [[TMP2]], 1 +; CHECK-ORIGIN-NEXT: [[TMP8:%.*]] = icmp ne i64 [[TMP2]], 0 +; CHECK-ORIGIN-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i32 [[TMP3]], i32 [[TMP6]] +; CHECK-ORIGIN-NEXT: [[B:%.*]] = insertvalue [2 x double] [[A]], double [[Y]], 1 +; CHECK-ORIGIN-NEXT: store [2 x i64] [[TMP7]], ptr @__msan_retval_tls, align 8 +; CHECK-ORIGIN-NEXT: store i32 [[TMP9]], ptr @__msan_retval_origin_tls, align 4 +; CHECK-ORIGIN-NEXT: ret [2 x double] [[B]] +; entry: %a = insertvalue [2 x double] undef, double %x, 0 %b = insertvalue [2 x double] %a, double %y, 1 ret [2 x double] %b } -; CHECK-LABEL: @InsertValueDouble( -; CHECK-DAG: [[Sx:%.*]] = load i64, ptr @__msan_param_tls -; CHECK-DAG: [[Sy:%.*]] = load i64, ptr {{.*}}@__msan_param_tls to i64), i64 8) -; CHECK: [[A:%.*]] = insertvalue [2 x i64] [i64 -1, i64 -1], i64 [[Sx]], 0 -; CHECK: [[B:%.*]] = insertvalue [2 x i64] [[A]], i64 [[Sy]], 1 -; CHECK: store [2 x i64] [[B]], ptr {{.*}}@__msan_retval_tls -; CHECK: ret [2 x double] - - define i32 @ExtractValue([2 x i32] %a) sanitize_memory { +; CHECK-LABEL: define i32 @ExtractValue( +; CHECK-SAME: [2 x i32] [[A:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load [2 x i32], ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP1:%.*]] = extractvalue [2 x i32] [[TMP0]], 1 +; CHECK-NEXT: [[X:%.*]] = extractvalue [2 x i32] [[A]], 1 +; CHECK-NEXT: store i32 [[TMP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 
[[X]] +; +; CHECK-ORIGIN-LABEL: define i32 @ExtractValue( +; CHECK-ORIGIN-SAME: [2 x i32] [[A:%.*]]) #[[ATTR0]] { +; CHECK-ORIGIN-NEXT: [[ENTRY:.*:]] +; CHECK-ORIGIN-NEXT: [[TMP0:%.*]] = load [2 x i32], ptr @__msan_param_tls, align 8 +; CHECK-ORIGIN-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4 +; CHECK-ORIGIN-NEXT: call void @llvm.donothing() +; CHECK-ORIGIN-NEXT: [[TMP2:%.*]] = extractvalue [2 x i32] [[TMP0]], 1 +; CHECK-ORIGIN-NEXT: [[X:%.*]] = extractvalue [2 x i32] [[A]], 1 +; CHECK-ORIGIN-NEXT: store i32 [[TMP2]], ptr @__msan_retval_tls, align 8 +; CHECK-ORIGIN-NEXT: store i32 [[TMP1]], ptr @__msan_retval_origin_tls, align 4 +; CHECK-ORIGIN-NEXT: ret i32 [[X]] +; entry: %x = extractvalue [2 x i32] %a, 1 ret i32 %x } -; CHECK-LABEL: @ExtractValue( -; CHECK: [[Sa:%.*]] = load [2 x i32], ptr @__msan_param_tls -; CHECK: [[Sx:%.*]] = extractvalue [2 x i32] [[Sa]], 1 -; CHECK: store i32 [[Sx]], ptr @__msan_retval_tls -; CHECK: ret i32 - - ; Regression test for PR20493. %MyStruct = type { i32, i32, [3 x i32] } define i32 @ArrayInStruct(%MyStruct %s) sanitize_memory { +; CHECK-LABEL: define i32 @ArrayInStruct( +; CHECK-SAME: [[MYSTRUCT:%.*]] [[S:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load { i32, i32, [3 x i32] }, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { i32, i32, [3 x i32] } [[TMP1]], 2, 1 +; CHECK-NEXT: [[X:%.*]] = extractvalue [[MYSTRUCT]] [[S]], 2, 1 +; CHECK-NEXT: store i32 [[TMP2]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 [[X]] +; +; CHECK-ORIGIN-LABEL: define i32 @ArrayInStruct( +; CHECK-ORIGIN-SAME: [[MYSTRUCT:%.*]] [[S:%.*]]) #[[ATTR0]] { +; CHECK-ORIGIN-NEXT: [[TMP1:%.*]] = load { i32, i32, [3 x i32] }, ptr @__msan_param_tls, align 8 +; CHECK-ORIGIN-NEXT: [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4 +; CHECK-ORIGIN-NEXT: call void @llvm.donothing() +; CHECK-ORIGIN-NEXT: [[TMP3:%.*]] = extractvalue { i32, i32, [3 x i32] } [[TMP1]], 2, 1 +; CHECK-ORIGIN-NEXT: [[X:%.*]] = extractvalue [[MYSTRUCT]] [[S]], 2, 1 +; CHECK-ORIGIN-NEXT: store i32 [[TMP3]], ptr @__msan_retval_tls, align 8 +; CHECK-ORIGIN-NEXT: store i32 [[TMP2]], ptr @__msan_retval_origin_tls, align 4 +; CHECK-ORIGIN-NEXT: ret i32 [[X]] +; %x = extractvalue %MyStruct %s, 2, 1 ret i32 %x } -; CHECK-LABEL: @ArrayInStruct( -; CHECK: [[Ss:%.*]] = load { i32, i32, [3 x i32] }, ptr @__msan_param_tls -; CHECK: [[Sx:%.*]] = extractvalue { i32, i32, [3 x i32] } [[Ss]], 2, 1 -; CHECK: store i32 [[Sx]], ptr @__msan_retval_tls -; CHECK: ret i32 - - define i32 @ArrayOfStructs([3 x { i32, i32 }] %a) sanitize_memory { +; CHECK-LABEL: define i32 @ArrayOfStructs( +; CHECK-SAME: [3 x { i32, i32 }] [[A:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load [3 x { i32, i32 }], ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = extractvalue [3 x { i32, i32 }] [[TMP1]], 2, 1 +; CHECK-NEXT: [[X:%.*]] = extractvalue [3 x { i32, i32 }] [[A]], 2, 1 +; CHECK-NEXT: store i32 [[TMP2]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 [[X]] +; +; CHECK-ORIGIN-LABEL: define i32 @ArrayOfStructs( +; CHECK-ORIGIN-SAME: [3 x { i32, i32 }] [[A:%.*]]) #[[ATTR0]] { +; CHECK-ORIGIN-NEXT: [[TMP1:%.*]] = load [3 x { i32, i32 }], ptr @__msan_param_tls, align 8 +; CHECK-ORIGIN-NEXT: [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4 +; CHECK-ORIGIN-NEXT: call void @llvm.donothing() +; CHECK-ORIGIN-NEXT: [[TMP3:%.*]] = extractvalue [3 x { i32, i32 }] 
[[TMP1]], 2, 1 +; CHECK-ORIGIN-NEXT: [[X:%.*]] = extractvalue [3 x { i32, i32 }] [[A]], 2, 1 +; CHECK-ORIGIN-NEXT: store i32 [[TMP3]], ptr @__msan_retval_tls, align 8 +; CHECK-ORIGIN-NEXT: store i32 [[TMP2]], ptr @__msan_retval_origin_tls, align 4 +; CHECK-ORIGIN-NEXT: ret i32 [[X]] +; %x = extractvalue [3 x { i32, i32 }] %a, 2, 1 ret i32 %x } -; CHECK-LABEL: @ArrayOfStructs( -; CHECK: [[Ss:%.*]] = load [3 x { i32, i32 }], ptr @__msan_param_tls -; CHECK: [[Sx:%.*]] = extractvalue [3 x { i32, i32 }] [[Ss]], 2, 1 -; CHECK: store i32 [[Sx]], ptr @__msan_retval_tls -; CHECK: ret i32 - - define <8 x i16> @ArrayOfVectors([3 x <8 x i16>] %a) sanitize_memory { +; CHECK-LABEL: define <8 x i16> @ArrayOfVectors( +; CHECK-SAME: [3 x <8 x i16>] [[A:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load [3 x <8 x i16>], ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = extractvalue [3 x <8 x i16>] [[TMP1]], 1 +; CHECK-NEXT: [[X:%.*]] = extractvalue [3 x <8 x i16>] [[A]], 1 +; CHECK-NEXT: store <8 x i16> [[TMP2]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i16> [[X]] +; +; CHECK-ORIGIN-LABEL: define <8 x i16> @ArrayOfVectors( +; CHECK-ORIGIN-SAME: [3 x <8 x i16>] [[A:%.*]]) #[[ATTR0]] { +; CHECK-ORIGIN-NEXT: [[TMP1:%.*]] = load [3 x <8 x i16>], ptr @__msan_param_tls, align 8 +; CHECK-ORIGIN-NEXT: [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4 +; CHECK-ORIGIN-NEXT: call void @llvm.donothing() +; CHECK-ORIGIN-NEXT: [[TMP3:%.*]] = extractvalue [3 x <8 x i16>] [[TMP1]], 1 +; CHECK-ORIGIN-NEXT: [[X:%.*]] = extractvalue [3 x <8 x i16>] [[A]], 1 +; CHECK-ORIGIN-NEXT: store <8 x i16> [[TMP3]], ptr @__msan_retval_tls, align 8 +; CHECK-ORIGIN-NEXT: store i32 [[TMP2]], ptr @__msan_retval_origin_tls, align 4 +; CHECK-ORIGIN-NEXT: ret <8 x i16> [[X]] +; %x = extractvalue [3 x <8 x i16>] %a, 1 ret <8 x i16> %x } -; CHECK-LABEL: @ArrayOfVectors( -; CHECK: [[Ss:%.*]] = load [3 x <8 x i16>], ptr @__msan_param_tls -; CHECK: [[Sx:%.*]] = extractvalue [3 x <8 x i16>] [[Ss]], 1 -; CHECK: store <8 x i16> [[Sx]], ptr @__msan_retval_tls -; CHECK: ret <8 x i16> diff --git a/llvm/test/Instrumentation/MemorySanitizer/bmi.ll b/llvm/test/Instrumentation/MemorySanitizer/bmi.ll index 2f60bd8..f0f67fc 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/bmi.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/bmi.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt < %s -msan-check-access-address=0 -S -passes=msan 2>&1 | FileCheck %s ; REQUIRES: x86-registered-target @@ -15,131 +16,171 @@ declare i64 @llvm.x86.bmi.pdep.64(i64, i64) declare i64 @llvm.x86.bmi.pext.64(i64, i64) define i32 @Test_bzhi_32(i32 %a, i32 %b) sanitize_memory { +; CHECK-LABEL: define i32 @Test_bzhi_32( +; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP0]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = sext i1 [[TMP2]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.x86.bmi.bzhi.32(i32 [[TMP1]], i32 [[B]]) +; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[C:%.*]] = tail call i32 @llvm.x86.bmi.bzhi.32(i32 [[A]], i32 [[B]]) +; CHECK-NEXT: store i32 [[TMP5]], ptr 
@__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 [[C]] +; entry: %c = tail call i32 @llvm.x86.bmi.bzhi.32(i32 %a, i32 %b) ret i32 %c } -; CHECK-LABEL: @Test_bzhi_32( -; CHECK-DAG: %[[SA:.*]] = load i32, ptr @__msan_param_tls -; CHECK-DAG: %[[SB:.*]] = load i32, {{.*}}@__msan_param_tls to i64), i64 8) -; CHECK-DAG: %[[SB0:.*]] = icmp ne i32 %[[SB]], 0 -; CHECK-DAG: %[[SB1:.*]] = sext i1 %[[SB0]] to i32 -; CHECK-DAG: %[[X:.*]] = call i32 @llvm.x86.bmi.bzhi.32(i32 %[[SA]], i32 %b) -; CHECK-DAG: %[[S:.*]] = or i32 %[[SB1]], %[[X]] -; CHECK-DAG: store i32 %[[S]], {{.*}}@__msan_retval_tls -; CHECK: ret i32 define i64 @Test_bzhi_64(i64 %a, i64 %b) sanitize_memory { +; CHECK-LABEL: define i64 @Test_bzhi_64( +; CHECK-SAME: i64 [[A:%.*]], i64 [[B:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i64 [[TMP0]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = sext i1 [[TMP2]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.x86.bmi.bzhi.64(i64 [[TMP1]], i64 [[B]]) +; CHECK-NEXT: [[TMP5:%.*]] = or i64 [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[C:%.*]] = tail call i64 @llvm.x86.bmi.bzhi.64(i64 [[A]], i64 [[B]]) +; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i64 [[C]] +; entry: %c = tail call i64 @llvm.x86.bmi.bzhi.64(i64 %a, i64 %b) ret i64 %c } -; CHECK-LABEL: @Test_bzhi_64( -; CHECK-DAG: %[[SA:.*]] = load i64, ptr @__msan_param_tls -; CHECK-DAG: %[[SB:.*]] = load i64, {{.*}}@__msan_param_tls to i64), i64 8) -; CHECK-DAG: %[[SB0:.*]] = icmp ne i64 %[[SB]], 0 -; CHECK-DAG: %[[SB1:.*]] = sext i1 %[[SB0]] to i64 -; CHECK-DAG: %[[X:.*]] = call i64 @llvm.x86.bmi.bzhi.64(i64 %[[SA]], i64 %b) -; CHECK-DAG: %[[S:.*]] = or i64 %[[SB1]], %[[X]] -; CHECK-DAG: store i64 %[[S]], {{.*}}@__msan_retval_tls -; CHECK: ret i64 define i32 @Test_bextr_32(i32 %a, i32 %b) sanitize_memory { +; CHECK-LABEL: define i32 @Test_bextr_32( +; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP0]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = sext i1 [[TMP2]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.x86.bmi.bextr.32(i32 [[TMP1]], i32 [[B]]) +; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[C:%.*]] = tail call i32 @llvm.x86.bmi.bextr.32(i32 [[A]], i32 [[B]]) +; CHECK-NEXT: store i32 [[TMP5]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 [[C]] +; entry: %c = tail call i32 @llvm.x86.bmi.bextr.32(i32 %a, i32 %b) ret i32 %c } -; CHECK-LABEL: @Test_bextr_32( -; CHECK-DAG: %[[SA:.*]] = load i32, ptr @__msan_param_tls -; CHECK-DAG: %[[SB:.*]] = load i32, {{.*}}@__msan_param_tls to i64), i64 8) -; CHECK-DAG: %[[SB0:.*]] = icmp ne i32 %[[SB]], 0 -; CHECK-DAG: %[[SB1:.*]] = sext i1 %[[SB0]] to i32 -; CHECK-DAG: %[[X:.*]] = call i32 @llvm.x86.bmi.bextr.32(i32 %[[SA]], i32 %b) -; CHECK-DAG: %[[S:.*]] = or i32 %[[SB1]], %[[X]] -; CHECK-DAG: store i32 %[[S]], {{.*}}@__msan_retval_tls -; CHECK: ret i32 define i64 @Test_bextr_64(i64 %a, i64 %b) sanitize_memory { +; CHECK-LABEL: define i64 
@Test_bextr_64( +; CHECK-SAME: i64 [[A:%.*]], i64 [[B:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i64 [[TMP0]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = sext i1 [[TMP2]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.x86.bmi.bextr.64(i64 [[TMP1]], i64 [[B]]) +; CHECK-NEXT: [[TMP5:%.*]] = or i64 [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[C:%.*]] = tail call i64 @llvm.x86.bmi.bextr.64(i64 [[A]], i64 [[B]]) +; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i64 [[C]] +; entry: %c = tail call i64 @llvm.x86.bmi.bextr.64(i64 %a, i64 %b) ret i64 %c } -; CHECK-LABEL: @Test_bextr_64( -; CHECK-DAG: %[[SA:.*]] = load i64, ptr @__msan_param_tls -; CHECK-DAG: %[[SB:.*]] = load i64, {{.*}}@__msan_param_tls to i64), i64 8) -; CHECK-DAG: %[[SB0:.*]] = icmp ne i64 %[[SB]], 0 -; CHECK-DAG: %[[SB1:.*]] = sext i1 %[[SB0]] to i64 -; CHECK-DAG: %[[X:.*]] = call i64 @llvm.x86.bmi.bextr.64(i64 %[[SA]], i64 %b) -; CHECK-DAG: %[[S:.*]] = or i64 %[[SB1]], %[[X]] -; CHECK-DAG: store i64 %[[S]], {{.*}}@__msan_retval_tls -; CHECK: ret i64 define i32 @Test_pdep_32(i32 %a, i32 %b) sanitize_memory { +; CHECK-LABEL: define i32 @Test_pdep_32( +; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP0]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = sext i1 [[TMP2]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.x86.bmi.pdep.32(i32 [[TMP1]], i32 [[B]]) +; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[C:%.*]] = tail call i32 @llvm.x86.bmi.pdep.32(i32 [[A]], i32 [[B]]) +; CHECK-NEXT: store i32 [[TMP5]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 [[C]] +; entry: %c = tail call i32 @llvm.x86.bmi.pdep.32(i32 %a, i32 %b) ret i32 %c } -; CHECK-LABEL: @Test_pdep_32( -; CHECK-DAG: %[[SA:.*]] = load i32, ptr @__msan_param_tls -; CHECK-DAG: %[[SB:.*]] = load i32, {{.*}}@__msan_param_tls to i64), i64 8) -; CHECK-DAG: %[[SB0:.*]] = icmp ne i32 %[[SB]], 0 -; CHECK-DAG: %[[SB1:.*]] = sext i1 %[[SB0]] to i32 -; CHECK-DAG: %[[X:.*]] = call i32 @llvm.x86.bmi.pdep.32(i32 %[[SA]], i32 %b) -; CHECK-DAG: %[[S:.*]] = or i32 %[[SB1]], %[[X]] -; CHECK-DAG: store i32 %[[S]], {{.*}}@__msan_retval_tls -; CHECK: ret i32 define i64 @Test_pdep_64(i64 %a, i64 %b) sanitize_memory { +; CHECK-LABEL: define i64 @Test_pdep_64( +; CHECK-SAME: i64 [[A:%.*]], i64 [[B:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i64 [[TMP0]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = sext i1 [[TMP2]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.x86.bmi.pdep.64(i64 [[TMP1]], i64 [[B]]) +; CHECK-NEXT: [[TMP5:%.*]] = or i64 [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[C:%.*]] = tail call i64 @llvm.x86.bmi.pdep.64(i64 [[A]], i64 [[B]]) +; CHECK-NEXT: store i64 
[[TMP5]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i64 [[C]] +; entry: %c = tail call i64 @llvm.x86.bmi.pdep.64(i64 %a, i64 %b) ret i64 %c } -; CHECK-LABEL: @Test_pdep_64( -; CHECK-DAG: %[[SA:.*]] = load i64, ptr @__msan_param_tls -; CHECK-DAG: %[[SB:.*]] = load i64, {{.*}}@__msan_param_tls to i64), i64 8) -; CHECK-DAG: %[[SB0:.*]] = icmp ne i64 %[[SB]], 0 -; CHECK-DAG: %[[SB1:.*]] = sext i1 %[[SB0]] to i64 -; CHECK-DAG: %[[X:.*]] = call i64 @llvm.x86.bmi.pdep.64(i64 %[[SA]], i64 %b) -; CHECK-DAG: %[[S:.*]] = or i64 %[[SB1]], %[[X]] -; CHECK-DAG: store i64 %[[S]], {{.*}}@__msan_retval_tls -; CHECK: ret i64 define i32 @Test_pext_32(i32 %a, i32 %b) sanitize_memory { +; CHECK-LABEL: define i32 @Test_pext_32( +; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP0]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = sext i1 [[TMP2]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.x86.bmi.pext.32(i32 [[TMP1]], i32 [[B]]) +; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[C:%.*]] = tail call i32 @llvm.x86.bmi.pext.32(i32 [[A]], i32 [[B]]) +; CHECK-NEXT: store i32 [[TMP5]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 [[C]] +; entry: %c = tail call i32 @llvm.x86.bmi.pext.32(i32 %a, i32 %b) ret i32 %c } -; CHECK-LABEL: @Test_pext_32( -; CHECK-DAG: %[[SA:.*]] = load i32, ptr @__msan_param_tls -; CHECK-DAG: %[[SB:.*]] = load i32, {{.*}}@__msan_param_tls to i64), i64 8) -; CHECK-DAG: %[[SB0:.*]] = icmp ne i32 %[[SB]], 0 -; CHECK-DAG: %[[SB1:.*]] = sext i1 %[[SB0]] to i32 -; CHECK-DAG: %[[X:.*]] = call i32 @llvm.x86.bmi.pext.32(i32 %[[SA]], i32 %b) -; CHECK-DAG: %[[S:.*]] = or i32 %[[SB1]], %[[X]] -; CHECK-DAG: store i32 %[[S]], {{.*}}@__msan_retval_tls -; CHECK: ret i32 define i64 @Test_pext_64(i64 %a, i64 %b) sanitize_memory { +; CHECK-LABEL: define i64 @Test_pext_64( +; CHECK-SAME: i64 [[A:%.*]], i64 [[B:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i64 [[TMP0]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = sext i1 [[TMP2]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.x86.bmi.pext.64(i64 [[TMP1]], i64 [[B]]) +; CHECK-NEXT: [[TMP5:%.*]] = or i64 [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[C:%.*]] = tail call i64 @llvm.x86.bmi.pext.64(i64 [[A]], i64 [[B]]) +; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i64 [[C]] +; entry: %c = tail call i64 @llvm.x86.bmi.pext.64(i64 %a, i64 %b) ret i64 %c } -; CHECK-LABEL: @Test_pext_64( -; CHECK-DAG: %[[SA:.*]] = load i64, ptr @__msan_param_tls -; CHECK-DAG: %[[SB:.*]] = load i64, {{.*}}@__msan_param_tls to i64), i64 8) -; CHECK-DAG: %[[SB0:.*]] = icmp ne i64 %[[SB]], 0 -; CHECK-DAG: %[[SB1:.*]] = sext i1 %[[SB0]] to i64 -; CHECK-DAG: %[[X:.*]] = call i64 @llvm.x86.bmi.pext.64(i64 %[[SA]], i64 %b) -; CHECK-DAG: %[[S:.*]] = or i64 %[[SB1]], %[[X]] -; CHECK-DAG: store i64 %[[S]], {{.*}}@__msan_retval_tls -; CHECK: ret i64 diff --git a/llvm/test/Instrumentation/MemorySanitizer/byval.ll 
b/llvm/test/Instrumentation/MemorySanitizer/byval.ll index 258cec86..69970896 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/byval.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/byval.ll @@ -1,4 +1,5 @@ -; RUN: opt < %s -S -passes="msan<track-origins=1>" 2>&1 | FileCheck %s --implicit-check-not "call void @llvm.mem" --implicit-check-not " load" --implicit-check-not " store" +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt < %s -S -passes="msan<track-origins=1>" 2>&1 | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" @@ -7,16 +8,28 @@ declare void @FnByVal(ptr byval(i128) %p); declare void @Fn(ptr %p); define i128 @ByValArgument(i32, ptr byval(i128) %p) sanitize_memory { -; CHECK-LABEL: @ByValArgument( -; CHECK-NEXT: entry: -; CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 %[[#]], ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), i64 16, i1 false) -; CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 4 %[[#]], ptr align 4 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), i64 16, i1 false) -; CHECK: [[X:%.*]] = load i128, ptr %p, align 8 -; CHECK: [[_MSLD:%.*]] = load i128, ptr %[[#]], align 8 -; CHECK: %[[#]] = load i32, ptr %[[#]], align 8 -; CHECK: store i128 [[_MSLD]], ptr @__msan_retval_tls, align 8 -; CHECK: store i32 %[[#]], ptr @__msan_retval_origin_tls, align 4 -; CHECK: ret i128 [[X]] +; CHECK-LABEL: define i128 @ByValArgument( +; CHECK-SAME: i32 [[TMP0:%.*]], ptr byval(i128) [[P:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = xor i64 [[TMP1]], 87960930222080 +; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[TMP2]], 17592186044416 +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP3]], ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), i64 16, i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP5]], ptr align 4 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), i64 16, i1 false) +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[X:%.*]] = load i128, ptr [[P]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP7]], 17592186044416 +; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i128, ptr [[TMP8]], align 8 +; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 8 +; CHECK-NEXT: store i128 [[_MSLD]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i32 [[TMP11]], ptr @__msan_retval_origin_tls, align 4 +; CHECK-NEXT: ret i128 [[X]] ; entry: %x = load i128, ptr %p @@ -24,13 +37,20 @@ entry: } define i128 @ByValArgumentNoSanitize(i32, ptr byval(i128) %p) { -; CHECK-LABEL: @ByValArgumentNoSanitize( -; CHECK-NEXT: entry: -; CHECK: call void @llvm.memset.p0.i64(ptr align 8 %[[#]], i8 0, i64 16, i1 false) -; CHECK: [[X:%.*]] = load i128, ptr %p, align 8 -; CHECK: store i128 0, ptr @__msan_retval_tls, align 8 -; CHECK: 
store i32 0, ptr @__msan_retval_origin_tls, align 4 -; CHECK: ret i128 [[X]] +; CHECK-LABEL: define i128 @ByValArgumentNoSanitize( +; CHECK-SAME: i32 [[TMP0:%.*]], ptr byval(i128) [[P:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = xor i64 [[TMP1]], 87960930222080 +; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[TMP2]], 17592186044416 +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 0, i64 16, i1 false) +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[X:%.*]] = load i128, ptr [[P]], align 8 +; CHECK-NEXT: store i128 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i32 0, ptr @__msan_retval_origin_tls, align 4 +; CHECK-NEXT: ret i128 [[X]] ; entry: %x = load i128, ptr %p @@ -38,13 +58,20 @@ entry: } define void @ByValForward(i32, ptr byval(i128) %p) sanitize_memory { -; CHECK-LABEL: @ByValForward( -; CHECK-NEXT: entry: -; CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 %[[#]], ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), i64 16, i1 false) -; CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 4 %[[#]], ptr align 4 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), i64 16, i1 false) -; CHECK: store i64 0, ptr @__msan_param_tls, align 8 -; CHECK: call void @Fn(ptr %p) -; CHECK: ret void +; CHECK-LABEL: define void @ByValForward( +; CHECK-SAME: i32 [[TMP0:%.*]], ptr byval(i128) [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = xor i64 [[TMP1]], 87960930222080 +; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[TMP2]], 17592186044416 +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP3]], ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), i64 16, i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP5]], ptr align 4 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), i64 16, i1 false) +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: store i64 0, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @Fn(ptr [[P]]) +; CHECK-NEXT: ret void ; entry: call void @Fn(ptr %p) @@ -52,12 +79,19 @@ entry: } define void @ByValForwardNoSanitize(i32, ptr byval(i128) %p) { -; CHECK-LABEL: @ByValForwardNoSanitize( -; CHECK-NEXT: entry: -; CHECK: call void @llvm.memset.p0.i64(ptr align 8 %[[#]], i8 0, i64 16, i1 false) -; CHECK: store i64 0, ptr @__msan_param_tls, align 8 -; CHECK: call void @Fn(ptr %p) -; CHECK: ret void +; CHECK-LABEL: define void @ByValForwardNoSanitize( +; CHECK-SAME: i32 [[TMP0:%.*]], ptr byval(i128) [[P:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = xor i64 [[TMP1]], 87960930222080 +; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[TMP2]], 17592186044416 +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 0, i64 16, i1 false) +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: store i64 0, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @Fn(ptr [[P]]) +; 
CHECK-NEXT: ret void ; entry: call void @Fn(ptr %p) @@ -65,14 +99,27 @@ entry: } define void @ByValForwardByVal(i32, ptr byval(i128) %p) sanitize_memory { -; CHECK-LABEL: @ByValForwardByVal( -; CHECK-NEXT: entry: -; CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 %[[#]], ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), i64 16, i1 false) -; CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 4 %[[#]], ptr align 4 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), i64 16, i1 false) -; CHECK: call void @llvm.memcpy.p0.p0.i64(ptr @__msan_param_tls, ptr %[[#]], i64 16, i1 false) -; CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 4 @__msan_param_origin_tls, ptr align 4 %[[#]], i64 16, i1 false) -; CHECK: call void @FnByVal(ptr byval(i128) %p) -; CHECK: ret void +; CHECK-LABEL: define void @ByValForwardByVal( +; CHECK-SAME: i32 [[TMP0:%.*]], ptr byval(i128) [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = xor i64 [[TMP1]], 87960930222080 +; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[TMP2]], 17592186044416 +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP3]], ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), i64 16, i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP5]], ptr align 4 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), i64 16, i1 false) +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP7]], 17592186044416 +; CHECK-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], -4 +; CHECK-NEXT: [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr @__msan_param_tls, ptr [[TMP8]], i64 16, i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 @__msan_param_origin_tls, ptr align 4 [[TMP11]], i64 16, i1 false) +; CHECK-NEXT: call void @FnByVal(ptr byval(i128) [[P]]) +; CHECK-NEXT: ret void ; entry: call void @FnByVal(ptr byval(i128) %p) @@ -80,12 +127,25 @@ entry: } define void @ByValForwardByValNoSanitize(i32, ptr byval(i128) %p) { -; CHECK-LABEL: @ByValForwardByValNoSanitize( -; CHECK-NEXT: entry: -; CHECK: call void @llvm.memset.p0.i64(ptr align 8 %[[#]], i8 0, i64 16, i1 false) -; CHECK: call void @llvm.memset.p0.i64(ptr @__msan_param_tls, i8 0, i64 16, i1 false) -; CHECK: call void @FnByVal(ptr byval(i128) %p) -; CHECK: ret void +; CHECK-LABEL: define void @ByValForwardByValNoSanitize( +; CHECK-SAME: i32 [[TMP0:%.*]], ptr byval(i128) [[P:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = xor i64 [[TMP1]], 87960930222080 +; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[TMP2]], 17592186044416 +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 0, i64 16, i1 false) +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: 
[[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP7]], 17592186044416 +; CHECK-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], -4 +; CHECK-NEXT: [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr @__msan_param_tls, i8 0, i64 16, i1 false) +; CHECK-NEXT: call void @FnByVal(ptr byval(i128) [[P]]) +; CHECK-NEXT: ret void ; entry: call void @FnByVal(ptr byval(i128) %p) @@ -96,16 +156,30 @@ declare void @FnByVal8(ptr byval(i8) %p); declare void @Fn8(ptr %p); define i8 @ByValArgument8(i32, ptr byval(i8) %p) sanitize_memory { -; CHECK-LABEL: @ByValArgument8( -; CHECK-NEXT: entry: -; CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 1 %[[#]], ptr align 1 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), i64 1, i1 false) -; CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 4 %[[#]], ptr align 4 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), i64 4, i1 false) -; CHECK: [[X:%.*]] = load i8, ptr %p, align 1 -; CHECK: [[_MSLD:%.*]] = load i8, ptr %[[#]], align 1 -; CHECK: %[[#]] = load i32, ptr %[[#]], align 4 -; CHECK: store i8 [[_MSLD]], ptr @__msan_retval_tls, align 8 -; CHECK: store i32 %[[#]], ptr @__msan_retval_origin_tls, align 4 -; CHECK: ret i8 [[X]] +; CHECK-LABEL: define i8 @ByValArgument8( +; CHECK-SAME: i32 [[TMP0:%.*]], ptr byval(i8) [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = xor i64 [[TMP1]], 87960930222080 +; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[TMP2]], 17592186044416 +; CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], -4 +; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[TMP3]], ptr align 1 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), i64 1, i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP6]], ptr align 4 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), i64 4, i1 false) +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[X:%.*]] = load i8, ptr [[P]], align 1 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[P]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[TMP8]], 17592186044416 +; CHECK-NEXT: [[TMP11:%.*]] = and i64 [[TMP10]], -4 +; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i8, ptr [[TMP9]], align 1 +; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4 +; CHECK-NEXT: store i8 [[_MSLD]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i32 [[TMP13]], ptr @__msan_retval_origin_tls, align 4 +; CHECK-NEXT: ret i8 [[X]] ; entry: %x = load i8, ptr %p @@ -113,13 +187,21 @@ entry: } define i8 @ByValArgumentNoSanitize8(i32, ptr byval(i8) %p) { -; CHECK-LABEL: @ByValArgumentNoSanitize8( -; CHECK-NEXT: entry: -; CHECK: call void @llvm.memset.p0.i64(ptr align 1 %[[#]], i8 0, i64 1, i1 false) -; CHECK: [[X:%.*]] = load i8, ptr %p, align 1 -; CHECK: store i8 0, ptr @__msan_retval_tls, align 8 -; CHECK: store i32 0, ptr @__msan_retval_origin_tls, align 4 -; CHECK: ret i8 [[X]] +; CHECK-LABEL: define i8 @ByValArgumentNoSanitize8( +; CHECK-SAME: i32 [[TMP0:%.*]], ptr byval(i8) [[P:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint 
ptr [[P]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = xor i64 [[TMP1]], 87960930222080 +; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[TMP2]], 17592186044416 +; CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], -4 +; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP3]], i8 0, i64 1, i1 false) +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[X:%.*]] = load i8, ptr [[P]], align 1 +; CHECK-NEXT: store i8 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i32 0, ptr @__msan_retval_origin_tls, align 4 +; CHECK-NEXT: ret i8 [[X]] ; entry: %x = load i8, ptr %p @@ -127,13 +209,21 @@ entry: } define void @ByValForward8(i32, ptr byval(i8) %p) sanitize_memory { -; CHECK-LABEL: @ByValForward8( -; CHECK-NEXT: entry: -; CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 1 %[[#]], ptr align 1 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), i64 1, i1 false) -; CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 4 %[[#]], ptr align 4 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), i64 4, i1 false) -; CHECK: store i64 0, ptr @__msan_param_tls, align 8 -; CHECK: call void @Fn8(ptr %p) -; CHECK: ret void +; CHECK-LABEL: define void @ByValForward8( +; CHECK-SAME: i32 [[TMP0:%.*]], ptr byval(i8) [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = xor i64 [[TMP1]], 87960930222080 +; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[TMP2]], 17592186044416 +; CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], -4 +; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[TMP3]], ptr align 1 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), i64 1, i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP6]], ptr align 4 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), i64 4, i1 false) +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: store i64 0, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @Fn8(ptr [[P]]) +; CHECK-NEXT: ret void ; entry: call void @Fn8(ptr %p) @@ -141,12 +231,20 @@ entry: } define void @ByValForwardNoSanitize8(i32, ptr byval(i8) %p) { -; CHECK-LABEL: @ByValForwardNoSanitize8( -; CHECK-NEXT: entry: -; CHECK: call void @llvm.memset.p0.i64(ptr align 1 %[[#]], i8 0, i64 1, i1 false) -; CHECK: store i64 0, ptr @__msan_param_tls, align 8 -; CHECK: call void @Fn8(ptr %p) -; CHECK: ret void +; CHECK-LABEL: define void @ByValForwardNoSanitize8( +; CHECK-SAME: i32 [[TMP0:%.*]], ptr byval(i8) [[P:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = xor i64 [[TMP1]], 87960930222080 +; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[TMP2]], 17592186044416 +; CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], -4 +; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP3]], i8 0, i64 1, i1 false) +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: store i64 0, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @Fn8(ptr [[P]]) +; CHECK-NEXT: ret void ; entry: call void @Fn8(ptr %p) @@ -154,14 +252,28 @@ entry: } define void @ByValForwardByVal8(i32, ptr 
byval(i8) %p) sanitize_memory { -; CHECK-LABEL: @ByValForwardByVal8( -; CHECK-NEXT: entry: -; CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 1 %[[#]], ptr align 1 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), i64 1, i1 false) -; CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 4 %[[#]], ptr align 4 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), i64 4, i1 false) -; CHECK: call void @llvm.memcpy.p0.p0.i64(ptr @__msan_param_tls, ptr %[[#]], i64 1, i1 false) -; CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 4 @__msan_param_origin_tls, ptr align 4 %[[#]], i64 4, i1 false) -; CHECK: call void @FnByVal8(ptr byval(i8) %p) -; CHECK: ret void +; CHECK-LABEL: define void @ByValForwardByVal8( +; CHECK-SAME: i32 [[TMP0:%.*]], ptr byval(i8) [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = xor i64 [[TMP1]], 87960930222080 +; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[TMP2]], 17592186044416 +; CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], -4 +; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[TMP3]], ptr align 1 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), i64 1, i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP6]], ptr align 4 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), i64 4, i1 false) +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[P]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[TMP8]], 17592186044416 +; CHECK-NEXT: [[TMP11:%.*]] = and i64 [[TMP10]], -4 +; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr @__msan_param_tls, ptr [[TMP9]], i64 1, i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 @__msan_param_origin_tls, ptr align 4 [[TMP12]], i64 4, i1 false) +; CHECK-NEXT: call void @FnByVal8(ptr byval(i8) [[P]]) +; CHECK-NEXT: ret void ; entry: call void @FnByVal8(ptr byval(i8) %p) @@ -169,12 +281,26 @@ entry: } define void @ByValForwardByValNoSanitize8(i32, ptr byval(i8) %p) { -; CHECK-LABEL: @ByValForwardByValNoSanitize8( -; CHECK-NEXT: entry: -; CHECK: call void @llvm.memset.p0.i64(ptr align 1 %[[#]], i8 0, i64 1, i1 false) -; CHECK: call void @llvm.memset.p0.i64(ptr @__msan_param_tls, i8 0, i64 1, i1 false) -; CHECK: call void @FnByVal8(ptr byval(i8) %p) -; CHECK: ret void +; CHECK-LABEL: define void @ByValForwardByValNoSanitize8( +; CHECK-SAME: i32 [[TMP0:%.*]], ptr byval(i8) [[P:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = xor i64 [[TMP1]], 87960930222080 +; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[TMP2]], 17592186044416 +; CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], -4 +; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP3]], i8 0, i64 1, i1 false) +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[P]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr 
+; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[TMP8]], 17592186044416 +; CHECK-NEXT: [[TMP11:%.*]] = and i64 [[TMP10]], -4 +; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr @__msan_param_tls, i8 0, i64 1, i1 false) +; CHECK-NEXT: call void @FnByVal8(ptr byval(i8) [[P]]) +; CHECK-NEXT: ret void ; entry: call void @FnByVal8(ptr byval(i8) %p) diff --git a/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt b/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt index a34e7f5..cdfc8ce 100644 --- a/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt +++ b/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt @@ -273,3 +273,66 @@ #CHECK: xvmulhuh 4, 5, 7 0xf0,0x85,0x3b,0xd0 + +#CHECK: xxmulmul 8, 3, 4, 2 +0xed,0x03,0x22,0x08 + +#CHECK: xxmulmulhiadd 8, 3, 4, 1, 0, 1 +0xed,0x03,0x25,0x48 + +#CHECK: xxmulmulloadd 8, 3, 4, 1, 0 +0xed,0x03,0x22,0x88 + +#CHECK: xxssumudm 8, 3, 4, 1 +0xed,0x03,0x24,0xc8 + +#CHECK: xxssumudmc 8, 3, 4, 1 +0xed,0x03,0x25,0xc8 + +#CHECK: xxssumudmcext 8, 3, 4, 6, 0 +0x05,0x00,0x00,0x00,0x89,0x03,0x21,0xa0 + +#CHECK: xsaddadduqm 4, 5, 7 +0xec,0x85,0x3b,0x00 + +#CHECK: xsaddaddsuqm 4, 5, 7 +0xec,0x85,0x3b,0x40 + +#CHECK: xsaddsubuqm 4, 5, 7 +0xec,0x85,0x3b,0x80 + +#CHECK: xsaddsubsuqm 4, 5, 7 +0xec,0x85,0x3f,0x00 + +#CHECK: xsrebase2t1uqm 4, 5, 7 +0xec,0x85,0x3c,0x88 + +#CHECK: xsrebase2t2uqm 4, 5, 7 +0xec,0x85,0x3d,0x88 + +#CHECK: xsrebase2t3uqm 4, 5, 7 +0xec,0x85,0x3e,0x88 + +#CHECK: xsrebase2t4uqm 4, 5, 7 +0xec,0x85,0x3e,0xc8 + +#CHECK: xsrebase3t1uqm 4, 5, 7 +0xec,0x85,0x3f,0x88 + +#CHECK: xsrebase3t2uqm 4, 5, 7 +0xec,0x85,0x3f,0xc8 + +#CHECK: xsrebase3t3uqm 4, 5, 7 +0xec,0x85,0x3e,0x18 + +#CHECK: xsmerge2t1uqm 4, 5, 7 +0xec,0x85,0x3f,0x40 + +#CHECK: xsmerge2t2uqm 4, 5, 7 +0xec,0x85,0x3f,0x80 + +#CHECK: xsmerge2t3uqm 4, 5, 7 +0xec,0x85,0x3a,0xc8 + +#CHECK: xsmerge3t1uqm 4, 5, 7 +0xec,0x85,0x3b,0xc8 diff --git a/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt b/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt index 9cefe24..f7e314f 100644 --- a/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt +++ b/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt @@ -267,3 +267,66 @@ #CHECK: xvmulhuh 4, 5, 7 0xd0,0x3b,0x85,0xf0 + +#CHECK: xxmulmul 8, 3, 4, 2 +0x08,0x22,0x03,0xed + +#CHECK: xxmulmulhiadd 8, 3, 4, 1, 0, 1 +0x48,0x25,0x03,0xed + +#CHECK: xxmulmulloadd 8, 3, 4, 1, 0 +0x88,0x22,0x03,0xed + +#CHECK: xxssumudm 8, 3, 4, 1 +0xc8,0x24,0x03,0xed + +#CHECK: xxssumudmc 8, 3, 4, 1 +0xc8,0x25,0x03,0xed + +#CHECK: xxssumudmcext 8, 3, 4, 6, 0 +0x00,0x00,0x00,0x05,0xa0,0x21,0x03,0x89 + +#CHECK: xsaddadduqm 4, 5, 7 +0x00,0x3b,0x85,0xec + +#CHECK: xsaddaddsuqm 4, 5, 7 +0x40,0x3b,0x85,0xec + +#CHECK: xsaddsubuqm 4, 5, 7 +0x80,0x3b,0x85,0xec + +#CHECK: xsaddsubsuqm 4, 5, 7 +0x00,0x3f,0x85,0xec + +#CHECK: xsrebase2t1uqm 4, 5, 7 +0x88,0x3c,0x85,0xec + +#CHECK: xsrebase2t2uqm 4, 5, 7 +0x88,0x3d,0x85,0xec + +#CHECK: xsrebase2t3uqm 4, 5, 7 +0x88,0x3e,0x85,0xec + +#CHECK: xsrebase2t4uqm 4, 5, 7 +0xc8,0x3e,0x85,0xec + +#CHECK: xsrebase3t1uqm 4, 5, 7 +0x88,0x3f,0x85,0xec + +#CHECK: xsrebase3t2uqm 4, 5, 7 +0xc8,0x3f,0x85,0xec + +#CHECK: xsrebase3t3uqm 4, 5, 7 +0x18,0x3e,0x85,0xec + +#CHECK: xsmerge2t1uqm 4, 5, 7 +0x40,0x3f,0x85,0xec + +#CHECK: xsmerge2t2uqm 4, 5, 7 +0x80,0x3f,0x85,0xec + +#CHECK: xsmerge2t3uqm 4, 5, 7 +0xc8,0x3a,0x85,0xec + +#CHECK: xsmerge3t1uqm 4, 5, 7 +0xc8,0x3b,0x85,0xec diff --git a/llvm/test/MC/LoongArch/Macros/macros-la.s 
b/llvm/test/MC/LoongArch/Macros/macros-la.s index a732988..8022d5b 100644 --- a/llvm/test/MC/LoongArch/Macros/macros-la.s +++ b/llvm/test/MC/LoongArch/Macros/macros-la.s @@ -26,6 +26,7 @@ la.abs $a0, sym_abs # ABS-NEXT: lu32i.d $a0, %abs64_lo20(sym_abs) # ABS-NEXT: lu52i.d $a0, $a0, %abs64_hi12(sym_abs) # ABS-EMPTY: +# RELOC-NEXT: R_LARCH_MARK_LA - 0x0 # RELOC-NEXT: R_LARCH_ABS_HI20 sym_abs 0x0 # RELOC-NEXT: R_LARCH_ABS_LO12 sym_abs 0x0 # RELOC-NEXT: R_LARCH_ABS64_LO20 sym_abs 0x0 diff --git a/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s b/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s index f01d6fa..29fedd7 100644 --- a/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s +++ b/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s @@ -386,3 +386,89 @@ xvmulhuh 4, 5, 7 #CHECK-BE: xvmulhuh 4, 5, 7 # encoding: [0xf0,0x85,0x3b,0xd0] #CHECK-LE: xvmulhuh 4, 5, 7 # encoding: [0xd0,0x3b,0x85,0xf0] + + xxmulmul 8, 3, 4, 2 +#CHECK-BE: xxmulmul 8, 3, 4, 2 # encoding: [0xed,0x03,0x22,0x08] +#CHECK-LE: xxmulmul 8, 3, 4, 2 # encoding: [0x08,0x22,0x03,0xed] + + xxmulmulhiadd 8, 3, 4, 1, 0, 1 +#CHECK-BE: xxmulmulhiadd 8, 3, 4, 1, 0, 1 # encoding: [0xed,0x03,0x25,0x48] +#CHECK-LE: xxmulmulhiadd 8, 3, 4, 1, 0, 1 # encoding: [0x48,0x25,0x03,0xed] + + xxmulmulloadd 8, 3, 4, 1, 0 +#CHECK-BE: xxmulmulloadd 8, 3, 4, 1, 0 # encoding: [0xed,0x03,0x22,0x88] +#CHECK-LE: xxmulmulloadd 8, 3, 4, 1, 0 # encoding: [0x88,0x22,0x03,0xed] + + xxssumudm 8, 3, 4, 1 +#CHECK-BE: xxssumudm 8, 3, 4, 1 # encoding: [0xed,0x03,0x24,0xc8] +#CHECK-LE: xxssumudm 8, 3, 4, 1 # encoding: [0xc8,0x24,0x03,0xed] + + xxssumudmc 8, 3, 4, 1 +#CHECK-BE: xxssumudmc 8, 3, 4, 1 # encoding: [0xed,0x03,0x25,0xc8] +#CHECK-LE: xxssumudmc 8, 3, 4, 1 # encoding: [0xc8,0x25,0x03,0xed] + + xxssumudmcext 8, 3, 4, 6, 0 +# CHECK-BE: xxssumudmcext 8, 3, 4, 6, 0 # encoding: [0x05,0x00,0x00,0x00, +# CHECK-BE-SAME: 0x89,0x03,0x21,0xa0] +# CHECK-LE: xxssumudmcext 8, 3, 4, 6, 0 # encoding: [0x00,0x00,0x00,0x05, +# CHECK-LE-SAME: 0xa0,0x21,0x03,0x89] + + xsaddadduqm 4, 5, 7 +#CHECK-BE: xsaddadduqm 4, 5, 7 # encoding: [0xec,0x85,0x3b,0x00] +#CHECK-LE: xsaddadduqm 4, 5, 7 # encoding: [0x00,0x3b,0x85,0xec] + + xsaddaddsuqm 4, 5, 7 +#CHECK-BE: xsaddaddsuqm 4, 5, 7 # encoding: [0xec,0x85,0x3b,0x40] +#CHECK-LE: xsaddaddsuqm 4, 5, 7 # encoding: [0x40,0x3b,0x85,0xec] + + xsaddsubuqm 4, 5, 7 +#CHECK-BE: xsaddsubuqm 4, 5, 7 # encoding: [0xec,0x85,0x3b,0x80] +#CHECK-LE: xsaddsubuqm 4, 5, 7 # encoding: [0x80,0x3b,0x85,0xec] + + xsaddsubsuqm 4, 5, 7 +#CHECK-BE: xsaddsubsuqm 4, 5, 7 # encoding: [0xec,0x85,0x3f,0x00] +#CHECK-LE: xsaddsubsuqm 4, 5, 7 # encoding: [0x00,0x3f,0x85,0xec] + + xsrebase2t1uqm 4, 5, 7 +#CHECK-BE: xsrebase2t1uqm 4, 5, 7 # encoding: [0xec,0x85,0x3c,0x88] +#CHECK-LE: xsrebase2t1uqm 4, 5, 7 # encoding: [0x88,0x3c,0x85,0xec] + + xsrebase2t2uqm 4, 5, 7 +#CHECK-BE: xsrebase2t2uqm 4, 5, 7 # encoding: [0xec,0x85,0x3d,0x88] +#CHECK-LE: xsrebase2t2uqm 4, 5, 7 # encoding: [0x88,0x3d,0x85,0xec] + + xsrebase2t3uqm 4, 5, 7 +#CHECK-BE: xsrebase2t3uqm 4, 5, 7 # encoding: [0xec,0x85,0x3e,0x88] +#CHECK-LE: xsrebase2t3uqm 4, 5, 7 # encoding: [0x88,0x3e,0x85,0xec] + + xsrebase2t4uqm 4, 5, 7 +#CHECK-BE: xsrebase2t4uqm 4, 5, 7 # encoding: [0xec,0x85,0x3e,0xc8] +#CHECK-LE: xsrebase2t4uqm 4, 5, 7 # encoding: [0xc8,0x3e,0x85,0xec] + + xsrebase3t1uqm 4, 5, 7 +#CHECK-BE: xsrebase3t1uqm 4, 5, 7 # encoding: [0xec,0x85,0x3f,0x88] +#CHECK-LE: xsrebase3t1uqm 4, 5, 7 # encoding: [0x88,0x3f,0x85,0xec] + + xsrebase3t2uqm 4, 5, 7 +#CHECK-BE: xsrebase3t2uqm 4, 5, 7 # encoding: [0xec,0x85,0x3f,0xc8] 
+#CHECK-LE: xsrebase3t2uqm 4, 5, 7 # encoding: [0xc8,0x3f,0x85,0xec] + + xsrebase3t3uqm 4, 5, 7 +#CHECK-BE: xsrebase3t3uqm 4, 5, 7 # encoding: [0xec,0x85,0x3e,0x18] +#CHECK-LE: xsrebase3t3uqm 4, 5, 7 # encoding: [0x18,0x3e,0x85,0xec] + + xsmerge2t1uqm 4, 5, 7 +#CHECK-BE: xsmerge2t1uqm 4, 5, 7 # encoding: [0xec,0x85,0x3f,0x40] +#CHECK-LE: xsmerge2t1uqm 4, 5, 7 # encoding: [0x40,0x3f,0x85,0xec] + + xsmerge2t2uqm 4, 5, 7 +#CHECK-BE: xsmerge2t2uqm 4, 5, 7 # encoding: [0xec,0x85,0x3f,0x80] +#CHECK-LE: xsmerge2t2uqm 4, 5, 7 # encoding: [0x80,0x3f,0x85,0xec] + + xsmerge2t3uqm 4, 5, 7 +#CHECK-BE: xsmerge2t3uqm 4, 5, 7 # encoding: [0xec,0x85,0x3a,0xc8] +#CHECK-LE: xsmerge2t3uqm 4, 5, 7 # encoding: [0xc8,0x3a,0x85,0xec] + + xsmerge3t1uqm 4, 5, 7 +#CHECK-BE: xsmerge3t1uqm 4, 5, 7 # encoding: [0xec,0x85,0x3b,0xc8] +#CHECK-LE: xsmerge3t1uqm 4, 5, 7 # encoding: [0xc8,0x3b,0x85,0xec] diff --git a/llvm/test/Transforms/AggressiveInstCombine/memchr.ll b/llvm/test/Transforms/AggressiveInstCombine/memchr.ll index b26320b..6fbe960 100644 --- a/llvm/test/Transforms/AggressiveInstCombine/memchr.ll +++ b/llvm/test/Transforms/AggressiveInstCombine/memchr.ll @@ -6,9 +6,10 @@ declare ptr @memchr(ptr, i32, i64) -define i1 @test_memchr_null(i32 %x) { +define i1 @test_memchr_null(i32 %x) !prof !0 { ; CHECK-LABEL: define i1 @test_memchr_null( -; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-SAME: i32 [[X:%.*]]) +; CHECK: !prof [[PROF_0:![0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[X]] to i8 ; CHECK-NEXT: switch i8 [[TMP0]], label %[[ENTRY_SPLIT:.*]] [ @@ -40,9 +41,10 @@ entry: ret i1 %isnull } -define ptr @test_memchr(i32 %x) { +define ptr @test_memchr(i32 %x) !prof !0 { ; CHECK-LABEL: define ptr @test_memchr( -; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-SAME: i32 [[X:%.*]]) +; CHECK: !prof [[PROF_0]] { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[X]] to i8 ; CHECK-NEXT: switch i8 [[TMP0]], label %[[ENTRY_SPLIT:.*]] [ @@ -72,16 +74,17 @@ entry: ret ptr %memchr } -define ptr @test_memchr_smaller_n(i32 %x) { +define ptr @test_memchr_smaller_n(i32 %x) !prof !0 { ; CHECK-LABEL: define ptr @test_memchr_smaller_n( -; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-SAME: i32 [[X:%.*]]) +; CHECK: !prof [[PROF_0]] { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[X]] to i8 ; CHECK-NEXT: switch i8 [[TMP0]], label %[[ENTRY_SPLIT:.*]] [ ; CHECK-NEXT: i8 48, label %[[MEMCHR_CASE:.*]] ; CHECK-NEXT: i8 49, label %[[MEMCHR_CASE1:.*]] ; CHECK-NEXT: i8 0, label %[[MEMCHR_CASE2:.*]] -; CHECK-NEXT: ] +; CHECK-NEXT: ], !prof [[PROF_1:![0-9]+]] ; CHECK: [[MEMCHR_CASE]]: ; CHECK-NEXT: br label %[[MEMCHR_SUCCESS:.*]] ; CHECK: [[MEMCHR_CASE1]]: @@ -103,9 +106,10 @@ entry: ; negative tests -define ptr @test_memchr_larger_n(i32 %x) { +define ptr @test_memchr_larger_n(i32 %x) !prof !0 { ; CHECK-LABEL: define ptr @test_memchr_larger_n( -; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-SAME: i32 [[X:%.*]]) +; CHECK: !prof [[PROF_0]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[MEMCHR:%.*]] = call ptr @memchr(ptr @str, i32 [[X]], i64 6) ; CHECK-NEXT: ret ptr [[MEMCHR]] @@ -115,9 +119,10 @@ entry: ret ptr %memchr } -define ptr @test_memchr_non_constant(i32 %x, ptr %str) { +define ptr @test_memchr_non_constant(i32 %x, ptr %str) !prof !0 { ; CHECK-LABEL: define ptr @test_memchr_non_constant( -; CHECK-SAME: i32 [[X:%.*]], ptr [[STR:%.*]]) { +; CHECK-SAME: i32 [[X:%.*]], ptr [[STR:%.*]]) +; CHECK: !prof [[PROF_0]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[MEMCHR:%.*]] = call ptr @memchr(ptr [[STR]], 
i32 [[X]], i64 5) ; CHECK-NEXT: ret ptr [[MEMCHR]] @@ -127,8 +132,9 @@ entry: ret ptr %memchr } -define ptr @test_memchr_constant_ch() { -; CHECK-LABEL: define ptr @test_memchr_constant_ch() { +define ptr @test_memchr_constant_ch() !prof !0 { +; CHECK-LABEL: define ptr @test_memchr_constant_ch() +; CHECK: !prof [[PROF_0]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[MEMCHR:%.*]] = call ptr @memchr(ptr @str, i32 49, i64 5) ; CHECK-NEXT: ret ptr [[MEMCHR]] @@ -138,9 +144,10 @@ entry: ret ptr %memchr } -define ptr @test_memchr_dynamic_n(i32 %x, i32 %y) { +define ptr @test_memchr_dynamic_n(i32 %x, i32 %y) !prof !0 { ; CHECK-LABEL: define ptr @test_memchr_dynamic_n( -; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) +; CHECK: !prof [[PROF_0]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[MEMCHR:%.*]] = call ptr @memchr(ptr @str, i32 [[X]], i32 [[Y]]) ; CHECK-NEXT: ret ptr [[MEMCHR]] @@ -150,9 +157,10 @@ entry: ret ptr %memchr } -define ptr @test_memchr_long(i32 %x) { +define ptr @test_memchr_long(i32 %x) !prof !0 { ; CHECK-LABEL: define ptr @test_memchr_long( -; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-SAME: i32 [[X:%.*]]) +; CHECK: !prof [[PROF_0]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[MEMCHR:%.*]] = call ptr @memchr(ptr @str_long, i32 [[X]], i64 8) ; CHECK-NEXT: ret ptr [[MEMCHR]] @@ -163,9 +171,10 @@ entry: } ; We want to check that the compiler still calls memchr if the length is non-constant: -define ptr @test_memchr_non_constant_length2(i32 %x, i64 %len) { +define ptr @test_memchr_non_constant_length2(i32 %x, i64 %len) !prof !0 { ; CHECK-LABEL: define ptr @test_memchr_non_constant_length2( -; CHECK-SAME: i32 [[X:%.*]], i64 [[LEN:%.*]]) { +; CHECK-SAME: i32 [[X:%.*]], i64 [[LEN:%.*]]) +; CHECK: !prof [[PROF_0]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[MEMCHR:%.*]] = call ptr @memchr(ptr @str, i32 [[X]], i64 [[LEN]]) ; CHECK-NEXT: ret ptr [[MEMCHR]] @@ -174,3 +183,7 @@ entry: %memchr = call ptr @memchr(ptr @str, i32 %x, i64 %len) ret ptr %memchr } + +!0 = !{!"function_entry_count", i64 1000} +; CHECK: [[PROF_0]] = !{!"function_entry_count", i64 1000} +; CHECK: [[PROF_1]] = !{!"unknown", !"aggressive-instcombine"}
\ No newline at end of file diff --git a/llvm/test/Transforms/LoopVectorize/X86/cleanup-runtime-checks.ll b/llvm/test/Transforms/LoopVectorize/X86/cleanup-runtime-checks.ll new file mode 100644 index 0000000..41753f7 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/cleanup-runtime-checks.ll @@ -0,0 +1,79 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -p loop-vectorize -S %s | FileCheck %s + +target triple = "x86_64-unknown-linux-gnu" + +declare ptr @get() +declare i1 @cond() + +; Make sure we can clean up the created runtime checks, if vectorization isn't +; profitable. +define void @widget(i32 %arg, i64 %arg1, ptr %src) #0 { +; CHECK-LABEL: define void @widget( +; CHECK-SAME: i32 [[ARG:%.*]], i64 [[ARG1:%.*]], ptr [[SRC:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[LOOP_1_HEADER:.*]] +; CHECK: [[LOOP_1_HEADER]]: +; CHECK-NEXT: br label %[[INNER_1:.*]] +; CHECK: [[INNER_1]]: +; CHECK-NEXT: [[C_1:%.*]] = call i1 @cond() +; CHECK-NEXT: br i1 [[C_1]], label %[[INNER_2:.*]], label %[[INNER_1]] +; CHECK: [[INNER_2]]: +; CHECK-NEXT: [[LOAD:%.*]] = call ptr @get() +; CHECK-NEXT: [[C_2:%.*]] = call i1 @cond() +; CHECK-NEXT: br i1 [[C_2]], label %[[LOOP_2_PREHEADER:.*]], label %[[LOOP_1_LATCH:.*]] +; CHECK: [[LOOP_2_PREHEADER]]: +; CHECK-NEXT: br label %[[LOOP_2:.*]] +; CHECK: [[LOOP_1_LATCH]]: +; CHECK-NEXT: br label %[[LOOP_1_HEADER]] +; CHECK: [[LOOP_2]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], %[[LOOP_2]] ], [ [[ARG]], %[[LOOP_2_PREHEADER]] ] +; CHECK-NEXT: [[PHI8:%.*]] = phi i32 [ [[OR:%.*]], %[[LOOP_2]] ], [ 99, %[[LOOP_2_PREHEADER]] ] +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr i32, ptr [[SRC]], i32 [[IV]] +; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[GEP_SRC]], align 4 +; CHECK-NEXT: [[OR]] = or i32 [[PHI8]], [[L]] +; CHECK-NEXT: store i32 [[OR]], ptr [[LOAD]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV]], 100 +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_2]], !prof [[PROF0:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop.1.header + +loop.1.header: + br label %inner.1 + +inner.1: + %c.1 = call i1 @cond() + br i1 %c.1, label %inner.2, label %inner.1 + +inner.2: + %load = call ptr @get() + %c.2 = call i1 @cond() + br i1 %c.2, label %loop.2, label %loop.1.latch + +loop.1.latch: + br label %loop.1.header + +loop.2: + %iv = phi i32 [ %arg, %inner.2 ], [ %iv.next, %loop.2 ] + %phi8 = phi i32 [ 99, %inner.2 ], [ %or, %loop.2 ] + %gep.src = getelementptr i32, ptr %src, i32 %iv + %l = load i32, ptr %gep.src, align 4 + %or = or i32 %phi8, %l + store i32 %or, ptr %load, align 4 + %iv.next = add i32 %iv, 1 + %ec = icmp eq i32 %iv, 100 + br i1 %ec, label %exit, label %loop.2, !prof !0 + +exit: + ret void +} + +attributes #0 = { "target-features"="+avx2" } +!0 = !{!"branch_weights", i32 89478484, i32 1879048192} +;. +; CHECK: [[PROF0]] = !{!"branch_weights", i32 89478484, i32 1879048192} +;. 
diff --git a/llvm/test/Transforms/LoopVectorize/pr45259.ll b/llvm/test/Transforms/LoopVectorize/pr45259.ll index fade726..f33437f 100644 --- a/llvm/test/Transforms/LoopVectorize/pr45259.ll +++ b/llvm/test/Transforms/LoopVectorize/pr45259.ll @@ -10,16 +10,15 @@ define i8 @widget(ptr %arr, i8 %t9) { ; CHECK-NEXT: br label [[BB6:%.*]] ; CHECK: bb6: ; CHECK-NEXT: [[T1_0:%.*]] = phi ptr [ [[ARR]], [[BB:%.*]] ], [ null, [[BB6]] ] +; CHECK-NEXT: [[T1_0_LCSSA2:%.*]] = ptrtoint ptr [[T1_0]] to i64 ; CHECK-NEXT: [[C:%.*]] = call i1 @cond() ; CHECK-NEXT: br i1 [[C]], label [[FOR_PREHEADER:%.*]], label [[BB6]] ; CHECK: for.preheader: -; CHECK-NEXT: [[T1_0_LCSSA:%.*]] = phi ptr [ [[T1_0]], [[BB6]] ] ; CHECK-NEXT: [[T1_0_LCSSA4:%.*]] = phi ptr [ [[T1_0]], [[BB6]] ] ; CHECK-NEXT: [[T1_0_LCSSA1:%.*]] = phi ptr [ [[T1_0]], [[BB6]] ] -; CHECK-NEXT: [[T1_0_LCSSA3:%.*]] = ptrtoint ptr [[T1_0_LCSSA]] to i64 -; CHECK-NEXT: [[T1_0_LCSSA2:%.*]] = ptrtoint ptr [[T1_0_LCSSA4]] to i64 ; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[ARR1]] to i32 ; CHECK-NEXT: [[TMP1:%.*]] = sub i32 0, [[TMP0]] +; CHECK-NEXT: [[T1_0_LCSSA3:%.*]] = ptrtoint ptr [[T1_0_LCSSA4]] to i64 ; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[T1_0_LCSSA3]] to i32 ; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP3]], 4 diff --git a/llvm/test/Transforms/LoopVectorize/pr47343-expander-lcssa-after-cfg-update.ll b/llvm/test/Transforms/LoopVectorize/pr47343-expander-lcssa-after-cfg-update.ll index 047d36b..b9cb1cb 100644 --- a/llvm/test/Transforms/LoopVectorize/pr47343-expander-lcssa-after-cfg-update.ll +++ b/llvm/test/Transforms/LoopVectorize/pr47343-expander-lcssa-after-cfg-update.ll @@ -28,18 +28,15 @@ define void @f() { ; CHECK: outer.latch: ; CHECK-NEXT: br label [[OUTER_HEADER]] ; CHECK: outer.exit.0: -; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi ptr [ [[TMP0]], [[OUTER_HEADER]] ] ; CHECK-NEXT: br label [[LOOP_PREHEADER:%.*]] ; CHECK: outer.exit.1: -; CHECK-NEXT: [[DOTLCSSA1:%.*]] = phi ptr [ [[TMP0]], [[INNER_1_LATCH]] ] ; CHECK-NEXT: br label [[LOOP_PREHEADER]] ; CHECK: loop.preheader: -; CHECK-NEXT: [[TMP1:%.*]] = phi ptr [ [[DOTLCSSA]], [[OUTER_EXIT_0]] ], [ [[DOTLCSSA1]], [[OUTER_EXIT_1]] ] ; CHECK-NEXT: br label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[TMP1]], i64 1 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[TMP0]], i64 1 ; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr @f.e, [[SCEVGEP]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[TMP1]], getelementptr inbounds nuw (i8, ptr @f.e, i64 4) +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[TMP0]], getelementptr inbounds nuw (i8, ptr @f.e, i64 4) ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: @@ -59,7 +56,7 @@ define void @f() { ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ 0, [[SCALAR_PH]] ] ; CHECK-NEXT: [[CONV6_US_US_US:%.*]] = zext i1 false to i32 ; CHECK-NEXT: store i32 [[CONV6_US_US_US]], ptr @f.e, align 1 -; CHECK-NEXT: store i8 10, ptr [[TMP1]], align 1 +; CHECK-NEXT: store i8 10, ptr [[TMP0]], align 1 ; CHECK-NEXT: [[IV_NEXT]] = add nsw i32 [[IV]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], 500 ; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP8:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll 
b/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll index 73d5e26..5894c3a 100644 --- a/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll +++ b/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll @@ -109,14 +109,13 @@ define void @runtime_checks_ptr_inductions(ptr %dst.1, ptr %dst.2, i1 %c) { ; CHECK-NEXT: [[PTR_IV_1:%.*]] = phi ptr [ [[DST_1]], %[[ENTRY]] ], [ [[PTR_IV_1_NEXT:%.*]], %[[LOOP_1]] ] ; CHECK-NEXT: [[CALL:%.*]] = call i32 @val() ; CHECK-NEXT: [[SEL_DST:%.*]] = select i1 [[C]], ptr [[DST_1]], ptr [[DST_2]] +; CHECK-NEXT: [[SEL_DST_LCSSA12:%.*]] = ptrtoint ptr [[SEL_DST]] to i64 ; CHECK-NEXT: [[PTR_IV_1_NEXT]] = getelementptr i8, ptr [[PTR_IV_1]], i64 1 ; CHECK-NEXT: [[EC_1:%.*]] = icmp eq i32 [[CALL]], 0 ; CHECK-NEXT: br i1 [[EC_1]], label %[[LOOP_2_HEADER_PREHEADER:.*]], label %[[LOOP_1]] ; CHECK: [[LOOP_2_HEADER_PREHEADER]]: -; CHECK-NEXT: [[SEL_DST_LCSSA1:%.*]] = phi ptr [ [[SEL_DST]], %[[LOOP_1]] ] ; CHECK-NEXT: [[PTR_IV_1_LCSSA:%.*]] = phi ptr [ [[PTR_IV_1]], %[[LOOP_1]] ] ; CHECK-NEXT: [[SEL_DST_LCSSA:%.*]] = phi ptr [ [[SEL_DST]], %[[LOOP_1]] ] -; CHECK-NEXT: [[SEL_DST_LCSSA12:%.*]] = ptrtoint ptr [[SEL_DST_LCSSA1]] to i64 ; CHECK-NEXT: br label %[[VECTOR_MEMCHECK:.*]] ; CHECK: [[VECTOR_MEMCHECK]]: ; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr [[PTR_IV_1_LCSSA]] to i64 @@ -140,13 +139,13 @@ define void @runtime_checks_ptr_inductions(ptr %dst.1, ptr %dst.2, i1 %c) { ; CHECK-NEXT: br label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1023, %[[MIDDLE_BLOCK]] ], [ 1, %[[VECTOR_MEMCHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL4:%.*]] = phi ptr [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ [[PTR_IV_1_LCSSA]], %[[VECTOR_MEMCHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL5:%.*]] = phi ptr [ [[TMP3]], %[[MIDDLE_BLOCK]] ], [ [[SEL_DST_LCSSA]], %[[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL3:%.*]] = phi ptr [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ [[PTR_IV_1_LCSSA]], %[[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL4:%.*]] = phi ptr [ [[TMP3]], %[[MIDDLE_BLOCK]] ], [ [[SEL_DST_LCSSA]], %[[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label %[[LOOP_2_HEADER:.*]] ; CHECK: [[LOOP_2_HEADER]]: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[DEC7:%.*]], %[[LOOP_2_LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] -; CHECK-NEXT: [[PTR_IV_2:%.*]] = phi ptr [ [[PTR_IV_2_NEXT:%.*]], %[[LOOP_2_LATCH]] ], [ [[BC_RESUME_VAL4]], %[[SCALAR_PH]] ] -; CHECK-NEXT: [[PTR_IV_3:%.*]] = phi ptr [ [[PTR_IV_3_NEXT:%.*]], %[[LOOP_2_LATCH]] ], [ [[BC_RESUME_VAL5]], %[[SCALAR_PH]] ] +; CHECK-NEXT: [[PTR_IV_2:%.*]] = phi ptr [ [[PTR_IV_2_NEXT:%.*]], %[[LOOP_2_LATCH]] ], [ [[BC_RESUME_VAL3]], %[[SCALAR_PH]] ] +; CHECK-NEXT: [[PTR_IV_3:%.*]] = phi ptr [ [[PTR_IV_3_NEXT:%.*]], %[[LOOP_2_LATCH]] ], [ [[BC_RESUME_VAL4]], %[[SCALAR_PH]] ] ; CHECK-NEXT: [[EC_2:%.*]] = icmp eq i32 [[IV]], 1024 ; CHECK-NEXT: br i1 [[EC_2]], label %[[EXIT:.*]], label %[[LOOP_2_LATCH]] ; CHECK: [[LOOP_2_LATCH]]: diff --git a/llvm/test/Transforms/LoopVectorize/reverse-induction-gep-nowrap-flags.ll b/llvm/test/Transforms/LoopVectorize/reverse-induction-gep-nowrap-flags.ll new file mode 100644 index 0000000..826696f --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/reverse-induction-gep-nowrap-flags.ll @@ -0,0 +1,182 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6 +; RUN: opt -passes=loop-vectorize -force-vector-width=4 -S %s | FileCheck %s + +define i32 @preserve_inbounds(i64 %start, ptr %ptr) { +; 
CHECK-LABEL: define i32 @preserve_inbounds( +; CHECK-SAME: i64 [[START:%.*]], ptr [[PTR:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 [[START]], [[INDEX]] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 -3 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4 +; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0> +; CHECK-NEXT: [[TMP4]] = add <4 x i32> [[REVERSE]], [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]]) +; CHECK-NEXT: br label %[[END:.*]] +; CHECK: [[SCALAR_PH:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[REV_IND:%.*]] = phi i64 [ [[START]], %[[SCALAR_PH]] ], [ [[REV_IND_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[REDUX:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[REDUX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[REV_IND_NEXT]] = add i64 [[REV_IND]], -1 +; CHECK-NEXT: [[GEP_PTR_IND:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i64 [[REV_IND_NEXT]] +; CHECK-NEXT: [[LD_PTR:%.*]] = load i32, ptr [[GEP_PTR_IND]], align 4 +; CHECK-NEXT: [[REDUX_NEXT]] = add i32 [[LD_PTR]], [[REDUX]] +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: [[EXIT_COND:%.*]] = icmp ne i32 [[IV_NEXT]], 1024 +; CHECK-NEXT: br i1 [[EXIT_COND]], label %[[LOOP]], label %[[END]] +; CHECK: [[END]]: +; CHECK-NEXT: [[REDUX_NEXT_LCSSA:%.*]] = phi i32 [ [[REDUX_NEXT]], %[[LOOP]] ], [ [[TMP6]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 [[REDUX_NEXT_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %rev.ind = phi i64 [ %start, %entry ], [ %rev.ind.next, %loop ] + %redux = phi i32 [ 0, %entry ], [ %redux.next, %loop ] + %rev.ind.next = add i64 %rev.ind, -1 + %gep.ptr.ind = getelementptr inbounds i32, ptr %ptr, i64 %rev.ind.next + %ld.ptr = load i32, ptr %gep.ptr.ind, align 4 + %redux.next = add i32 %ld.ptr, %redux + %iv.next = add i32 %iv, 1 + %exit.cond = icmp ne i32 %iv.next, 1024 + br i1 %exit.cond, label %loop, label %end + +end: + ret i32 %redux.next +} + +define i32 @preserve_nusw(i64 %start, ptr %ptr) { +; CHECK-LABEL: define i32 @preserve_nusw( +; CHECK-SAME: i64 [[START:%.*]], ptr [[PTR:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ 
zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 [[START]], [[INDEX]] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr nusw i32, ptr [[PTR]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr nusw i32, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr nusw i32, ptr [[TMP2]], i32 -3 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4 +; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0> +; CHECK-NEXT: [[TMP4]] = add <4 x i32> [[REVERSE]], [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]]) +; CHECK-NEXT: br label %[[END:.*]] +; CHECK: [[SCALAR_PH:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[REV_IND:%.*]] = phi i64 [ [[START]], %[[SCALAR_PH]] ], [ [[REV_IND_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[REDUX:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[REDUX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[REV_IND_NEXT]] = add i64 [[REV_IND]], -1 +; CHECK-NEXT: [[GEP_PTR_IND:%.*]] = getelementptr nusw i32, ptr [[PTR]], i64 [[REV_IND_NEXT]] +; CHECK-NEXT: [[LD_PTR:%.*]] = load i32, ptr [[GEP_PTR_IND]], align 4 +; CHECK-NEXT: [[REDUX_NEXT]] = add i32 [[LD_PTR]], [[REDUX]] +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: [[EXIT_COND:%.*]] = icmp ne i32 [[IV_NEXT]], 1024 +; CHECK-NEXT: br i1 [[EXIT_COND]], label %[[LOOP]], label %[[END]] +; CHECK: [[END]]: +; CHECK-NEXT: [[REDUX_NEXT_LCSSA:%.*]] = phi i32 [ [[REDUX_NEXT]], %[[LOOP]] ], [ [[TMP6]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 [[REDUX_NEXT_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %rev.ind = phi i64 [ %start, %entry ], [ %rev.ind.next, %loop ] + %redux = phi i32 [ 0, %entry ], [ %redux.next, %loop ] + %rev.ind.next = add i64 %rev.ind, -1 + %gep.ptr.ind = getelementptr nusw i32, ptr %ptr, i64 %rev.ind.next + %ld.ptr = load i32, ptr %gep.ptr.ind, align 4 + %redux.next = add i32 %ld.ptr, %redux + %iv.next = add i32 %iv, 1 + %exit.cond = icmp ne i32 %iv.next, 1024 + br i1 %exit.cond, label %loop, label %end + +end: + ret i32 %redux.next +} + +define i32 @drop_nuw(i64 %start, ptr %ptr) { +; CHECK-LABEL: define i32 @drop_nuw( +; CHECK-SAME: i64 [[START:%.*]], ptr [[PTR:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 [[START]], [[INDEX]] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr nuw i32, ptr [[PTR]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i32, ptr [[TMP2]], i32 -3 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = 
load <4 x i32>, ptr [[TMP3]], align 4 +; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0> +; CHECK-NEXT: [[TMP4]] = add <4 x i32> [[REVERSE]], [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]]) +; CHECK-NEXT: br label %[[END:.*]] +; CHECK: [[SCALAR_PH:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[REV_IND:%.*]] = phi i64 [ [[START]], %[[SCALAR_PH]] ], [ [[REV_IND_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[REDUX:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[REDUX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[REV_IND_NEXT]] = add i64 [[REV_IND]], -1 +; CHECK-NEXT: [[GEP_PTR_IND:%.*]] = getelementptr nuw i32, ptr [[PTR]], i64 [[REV_IND_NEXT]] +; CHECK-NEXT: [[LD_PTR:%.*]] = load i32, ptr [[GEP_PTR_IND]], align 4 +; CHECK-NEXT: [[REDUX_NEXT]] = add i32 [[LD_PTR]], [[REDUX]] +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: [[EXIT_COND:%.*]] = icmp ne i32 [[IV_NEXT]], 1024 +; CHECK-NEXT: br i1 [[EXIT_COND]], label %[[LOOP]], label %[[END]] +; CHECK: [[END]]: +; CHECK-NEXT: [[REDUX_NEXT_LCSSA:%.*]] = phi i32 [ [[REDUX_NEXT]], %[[LOOP]] ], [ [[TMP6]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 [[REDUX_NEXT_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %rev.ind = phi i64 [ %start, %entry ], [ %rev.ind.next, %loop ] + %redux = phi i32 [ 0, %entry ], [ %redux.next, %loop ] + %rev.ind.next = add i64 %rev.ind, -1 + %gep.ptr.ind = getelementptr nuw i32, ptr %ptr, i64 %rev.ind.next + %ld.ptr = load i32, ptr %gep.ptr.ind, align 4 + %redux.next = add i32 %ld.ptr, %redux + %iv.next = add i32 %iv, 1 + %exit.cond = icmp ne i32 %iv.next, 1024 + br i1 %exit.cond, label %loop, label %end + +end: + ret i32 %redux.next +} diff --git a/llvm/test/Transforms/LoopVectorize/skeleton-lcssa-crash.ll b/llvm/test/Transforms/LoopVectorize/skeleton-lcssa-crash.ll index 9c14a8c..1e4598e 100644 --- a/llvm/test/Transforms/LoopVectorize/skeleton-lcssa-crash.ll +++ b/llvm/test/Transforms/LoopVectorize/skeleton-lcssa-crash.ll @@ -23,18 +23,16 @@ define i16 @test(ptr %arg, i64 %N) { ; CHECK-NEXT: [[C_3:%.*]] = call i1 @cond() ; CHECK-NEXT: br i1 [[C_3]], label [[LOOP_3_PREHEADER:%.*]], label [[INNER_LATCH:%.*]] ; CHECK: loop.3.preheader: -; CHECK-NEXT: [[L_1_LCSSA:%.*]] = phi ptr [ [[L_1]], [[INNER_BB]] ] -; CHECK-NEXT: [[L_2_LCSSA:%.*]] = phi ptr [ [[L_2]], [[INNER_BB]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 2 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[L_2_LCSSA]], i64 2 -; CHECK-NEXT: [[SCEVGEP5:%.*]] = getelementptr i8, ptr [[L_1_LCSSA]], i64 2 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[L_2]], i64 2 +; CHECK-NEXT: [[SCEVGEP5:%.*]] = getelementptr i8, ptr [[L_1]], i64 2 ; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[N]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[TMP1]], 4 -; CHECK-NEXT: [[SCEVGEP6:%.*]] = getelementptr i8, ptr [[L_1_LCSSA]], i64 [[TMP2]] -; 
CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[L_2_LCSSA]], [[SCEVGEP6]] +; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[L_1]], i64 [[TMP2]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[L_2]], [[SCEVGEP3]] ; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SCEVGEP5]], [[SCEVGEP]] ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] @@ -67,19 +65,17 @@ define i16 @test(ptr %arg, i64 %N) { ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP_3]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[C_5:%.*]] = icmp ult i64 [[IV]], [[N]] -; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i16, ptr [[L_1_LCSSA]], i64 [[IV_NEXT]] +; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i16, ptr [[L_1]], i64 [[IV_NEXT]] ; CHECK-NEXT: [[LOOP_L_1:%.*]] = load i16, ptr [[GEP_1]], align 2 -; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds i16, ptr [[L_2_LCSSA]], i64 0 +; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds i16, ptr [[L_2]], i64 0 ; CHECK-NEXT: store i16 [[LOOP_L_1]], ptr [[GEP_2]], align 2 ; CHECK-NEXT: br i1 [[C_5]], label [[LOOP_3]], label [[EXIT_LOOPEXIT]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: exit.loopexit: ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: exit.loopexit1: -; CHECK-NEXT: [[L_1_LCSSA3:%.*]] = phi ptr [ [[L_1]], [[INNER_LATCH]] ] ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: -; CHECK-NEXT: [[L_14:%.*]] = phi ptr [ [[L_1_LCSSA3]], [[EXIT_LOOPEXIT1]] ], [ [[L_1_LCSSA]], [[EXIT_LOOPEXIT]] ] -; CHECK-NEXT: [[L_3:%.*]] = load i16, ptr [[L_14]], align 2 +; CHECK-NEXT: [[L_3:%.*]] = load i16, ptr [[L_1]], align 2 ; CHECK-NEXT: ret i16 [[L_3]] ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll index 4e6ef0d..5a0c69b 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll @@ -580,6 +580,127 @@ exit: ret i32 %add } +define i32 @print_mulacc_negated(ptr %a, ptr %b) { +; CHECK-LABEL: 'print_mulacc_negated' +; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { +; CHECK-NEXT: Live-in vp<%0> = VF +; CHECK-NEXT: Live-in vp<%1> = VF * UF +; CHECK-NEXT: Live-in vp<%2> = vector-trip-count +; CHECK-NEXT: Live-in ir<1024> = original trip-count +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb<entry>: +; CHECK-NEXT: Successor(s): scalar.ph, vector.ph +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: EMIT vp<%3> = reduction-start-vector ir<0>, ir<0>, ir<1> +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: <x1> vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<%4> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%accum> = phi vp<%3>, vp<%8> +; CHECK-NEXT: vp<%5> = SCALAR-STEPS vp<%4>, ir<1>, vp<%0> +; CHECK-NEXT: CLONE ir<%gep.a> = getelementptr ir<%a>, vp<%5> +; CHECK-NEXT: vp<%6> = vector-pointer ir<%gep.a> +; CHECK-NEXT: WIDEN ir<%load.a> = load vp<%6> +; CHECK-NEXT: CLONE ir<%gep.b> = getelementptr ir<%b>, vp<%5> +; CHECK-NEXT: vp<%7> = vector-pointer ir<%gep.b> +; CHECK-NEXT: WIDEN ir<%load.b> = load vp<%7> +; CHECK-NEXT: EXPRESSION vp<%8> = ir<%accum> + reduce.add (sub (0, mul (ir<%load.b> zext to i32), (ir<%load.a> zext to i32))) +; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%4>, vp<%1> +; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%2> +; CHECK-NEXT: No 
successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): middle.block +; CHECK-EMPTY: +; CHECK-NEXT: middle.block: +; CHECK-NEXT: EMIT vp<%10> = compute-reduction-result ir<%accum>, vp<%8> +; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq ir<1024>, vp<%2> +; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n> +; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb<exit>: +; CHECK-NEXT: IR %add.lcssa = phi i32 [ %add, %loop ] (extra operand: vp<%10> from middle.block) +; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: scalar.ph: +; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%2>, middle.block ], [ ir<0>, ir-bb<entry> ] +; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%10>, middle.block ], [ ir<0>, ir-bb<entry> ] +; CHECK-NEXT: Successor(s): ir-bb<loop> +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb<loop>: +; CHECK-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<%bc.resume.val> from scalar.ph) +; CHECK-NEXT: IR %accum = phi i32 [ 0, %entry ], [ %add, %loop ] (extra operand: vp<%bc.merge.rdx> from scalar.ph) +; CHECK-NEXT: IR %gep.a = getelementptr i8, ptr %a, i64 %iv +; CHECK-NEXT: IR %load.a = load i8, ptr %gep.a, align 1 +; CHECK-NEXT: IR %ext.a = zext i8 %load.a to i32 +; CHECK-NEXT: IR %gep.b = getelementptr i8, ptr %b, i64 %iv +; CHECK-NEXT: IR %load.b = load i8, ptr %gep.b, align 1 +; CHECK-NEXT: IR %ext.b = zext i8 %load.b to i32 +; CHECK-NEXT: IR %mul = mul i32 %ext.b, %ext.a +; CHECK-NEXT: IR %sub = sub i32 0, %mul +; CHECK-NEXT: IR %add = add i32 %accum, %sub +; CHECK-NEXT: IR %iv.next = add i64 %iv, 1 +; CHECK-NEXT: IR %exitcond.not = icmp eq i64 %iv.next, 1024 +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK: VPlan 'Final VPlan for VF={4},UF={1}' { +; CHECK-NEXT: Live-in ir<1024> = vector-trip-count +; CHECK-NEXT: Live-in ir<1024> = original trip-count +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb<entry>: +; CHECK-NEXT: Successor(s): vector.ph +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: Successor(s): vector.body +; CHECK-EMPTY: +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT-SCALAR vp<%index> = phi [ ir<0>, vector.ph ], [ vp<%index.next>, vector.body ] +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%accum> = phi ir<0>, ir<%add> +; CHECK-NEXT: CLONE ir<%gep.a> = getelementptr ir<%a>, vp<%index> +; CHECK-NEXT: WIDEN ir<%load.a> = load ir<%gep.a> +; CHECK-NEXT: CLONE ir<%gep.b> = getelementptr ir<%b>, vp<%index> +; CHECK-NEXT: WIDEN ir<%load.b> = load ir<%gep.b> +; CHECK-NEXT: WIDEN-CAST ir<%ext.b> = zext ir<%load.b> to i32 +; CHECK-NEXT: WIDEN-CAST ir<%ext.a> = zext ir<%load.a> to i32 +; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%ext.b>, ir<%ext.a> +; CHECK-NEXT: WIDEN ir<%sub> = sub ir<0>, ir<%mul> +; CHECK-NEXT: REDUCE ir<%add> = ir<%accum> + reduce.add (ir<%sub>) +; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%index>, ir<4> +; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, ir<1024> +; CHECK-NEXT: Successor(s): middle.block, vector.body +; CHECK-EMPTY: +; CHECK-NEXT: middle.block: +; CHECK-NEXT: EMIT vp<[[RED_RESULT:%.+]]> = compute-reduction-result ir<%accum>, ir<%add> +; CHECK-NEXT: Successor(s): ir-bb<exit> +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb<exit>: +; CHECK-NEXT: IR %add.lcssa = phi i32 [ %add, %loop ] (extra operand: vp<[[RED_RESULT]]> from middle.block) +; CHECK-NEXT: No successors +; CHECK-NEXT: } +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %accum = phi i32 [ 0, %entry ], [ %add, %loop ] + %gep.a = getelementptr i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 
1 + %ext.a = zext i8 %load.a to i32 + %gep.b = getelementptr i8, ptr %b, i64 %iv + %load.b = load i8, ptr %gep.b, align 1 + %ext.b = zext i8 %load.b to i32 + %mul = mul i32 %ext.b, %ext.a + %sub = sub i32 0, %mul + %add = add i32 %accum, %sub + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %exit, label %loop + +exit: + ret i32 %add +} + define i64 @print_mulacc_sub_extended(ptr nocapture readonly %x, ptr nocapture readonly %y, i32 %n) { ; CHECK-LABEL: 'print_mulacc_sub_extended' ; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll index 645dbc4..4f52227 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll @@ -7,8 +7,8 @@ define void @const_stride_1_no_reordering(ptr %pl, ptr %ps) { ; CHECK-SAME: ptr [[PL:%.*]], ptr [[PS:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[GEP_L0:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 0 ; CHECK-NEXT: [[GEP_S0:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 0 -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[GEP_L0]], align 16 -; CHECK-NEXT: store <16 x i8> [[TMP1]], ptr [[GEP_S0]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[GEP_L0]], align 1 +; CHECK-NEXT: store <16 x i8> [[TMP1]], ptr [[GEP_S0]], align 1 ; CHECK-NEXT: ret void ; %gep_l0 = getelementptr inbounds i8, ptr %pl, i64 0 @@ -28,22 +28,22 @@ define void @const_stride_1_no_reordering(ptr %pl, ptr %ps) { %gep_l14 = getelementptr inbounds i8, ptr %pl, i64 14 %gep_l15 = getelementptr inbounds i8, ptr %pl, i64 15 - %load0 = load i8, ptr %gep_l0 , align 16 - %load1 = load i8, ptr %gep_l1 , align 16 - %load2 = load i8, ptr %gep_l2 , align 16 - %load3 = load i8, ptr %gep_l3 , align 16 - %load4 = load i8, ptr %gep_l4 , align 16 - %load5 = load i8, ptr %gep_l5 , align 16 - %load6 = load i8, ptr %gep_l6 , align 16 - %load7 = load i8, ptr %gep_l7 , align 16 - %load8 = load i8, ptr %gep_l8 , align 16 - %load9 = load i8, ptr %gep_l9 , align 16 - %load10 = load i8, ptr %gep_l10, align 16 - %load11 = load i8, ptr %gep_l11, align 16 - %load12 = load i8, ptr %gep_l12, align 16 - %load13 = load i8, ptr %gep_l13, align 16 - %load14 = load i8, ptr %gep_l14, align 16 - %load15 = load i8, ptr %gep_l15, align 16 + %load0 = load i8, ptr %gep_l0 , align 1 + %load1 = load i8, ptr %gep_l1 , align 1 + %load2 = load i8, ptr %gep_l2 , align 1 + %load3 = load i8, ptr %gep_l3 , align 1 + %load4 = load i8, ptr %gep_l4 , align 1 + %load5 = load i8, ptr %gep_l5 , align 1 + %load6 = load i8, ptr %gep_l6 , align 1 + %load7 = load i8, ptr %gep_l7 , align 1 + %load8 = load i8, ptr %gep_l8 , align 1 + %load9 = load i8, ptr %gep_l9 , align 1 + %load10 = load i8, ptr %gep_l10, align 1 + %load11 = load i8, ptr %gep_l11, align 1 + %load12 = load i8, ptr %gep_l12, align 1 + %load13 = load i8, ptr %gep_l13, align 1 + %load14 = load i8, ptr %gep_l14, align 1 + %load15 = load i8, ptr %gep_l15, align 1 %gep_s0 = getelementptr inbounds i8, ptr %ps, i64 0 %gep_s1 = getelementptr inbounds i8, ptr %ps, i64 1 @@ -62,22 +62,22 @@ define void @const_stride_1_no_reordering(ptr %pl, ptr %ps) { %gep_s14 = getelementptr inbounds i8, ptr %ps, i64 14 %gep_s15 = getelementptr inbounds i8, ptr %ps, i64 15 - store i8 %load0, ptr %gep_s0, align 16 - store i8 %load1, ptr %gep_s1, align 16 - store i8 %load2, ptr %gep_s2, align 16 - store i8 %load3, ptr 
%gep_s3, align 16 - store i8 %load4, ptr %gep_s4, align 16 - store i8 %load5, ptr %gep_s5, align 16 - store i8 %load6, ptr %gep_s6, align 16 - store i8 %load7, ptr %gep_s7, align 16 - store i8 %load8, ptr %gep_s8, align 16 - store i8 %load9, ptr %gep_s9, align 16 - store i8 %load10, ptr %gep_s10, align 16 - store i8 %load11, ptr %gep_s11, align 16 - store i8 %load12, ptr %gep_s12, align 16 - store i8 %load13, ptr %gep_s13, align 16 - store i8 %load14, ptr %gep_s14, align 16 - store i8 %load15, ptr %gep_s15, align 16 + store i8 %load0, ptr %gep_s0, align 1 + store i8 %load1, ptr %gep_s1, align 1 + store i8 %load2, ptr %gep_s2, align 1 + store i8 %load3, ptr %gep_s3, align 1 + store i8 %load4, ptr %gep_s4, align 1 + store i8 %load5, ptr %gep_s5, align 1 + store i8 %load6, ptr %gep_s6, align 1 + store i8 %load7, ptr %gep_s7, align 1 + store i8 %load8, ptr %gep_s8, align 1 + store i8 %load9, ptr %gep_s9, align 1 + store i8 %load10, ptr %gep_s10, align 1 + store i8 %load11, ptr %gep_s11, align 1 + store i8 %load12, ptr %gep_s12, align 1 + store i8 %load13, ptr %gep_s13, align 1 + store i8 %load14, ptr %gep_s14, align 1 + store i8 %load15, ptr %gep_s15, align 1 ret void } @@ -87,9 +87,9 @@ define void @const_stride_1_with_reordering(ptr %pl, ptr %ps) { ; CHECK-SAME: ptr [[PL:%.*]], ptr [[PS:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[GEP_L0:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 0 ; CHECK-NEXT: [[GEP_S0:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 0 -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[GEP_L0]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[GEP_L0]], align 1 ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32> <i32 1, i32 0, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> -; CHECK-NEXT: store <16 x i8> [[TMP2]], ptr [[GEP_S0]], align 16 +; CHECK-NEXT: store <16 x i8> [[TMP2]], ptr [[GEP_S0]], align 1 ; CHECK-NEXT: ret void ; %gep_l0 = getelementptr inbounds i8, ptr %pl, i64 0 @@ -109,22 +109,22 @@ define void @const_stride_1_with_reordering(ptr %pl, ptr %ps) { %gep_l14 = getelementptr inbounds i8, ptr %pl, i64 14 %gep_l15 = getelementptr inbounds i8, ptr %pl, i64 15 - %load0 = load i8, ptr %gep_l0 , align 16 - %load1 = load i8, ptr %gep_l1 , align 16 - %load2 = load i8, ptr %gep_l2 , align 16 - %load3 = load i8, ptr %gep_l3 , align 16 - %load4 = load i8, ptr %gep_l4 , align 16 - %load5 = load i8, ptr %gep_l5 , align 16 - %load6 = load i8, ptr %gep_l6 , align 16 - %load7 = load i8, ptr %gep_l7 , align 16 - %load8 = load i8, ptr %gep_l8 , align 16 - %load9 = load i8, ptr %gep_l9 , align 16 - %load10 = load i8, ptr %gep_l10, align 16 - %load11 = load i8, ptr %gep_l11, align 16 - %load12 = load i8, ptr %gep_l12, align 16 - %load13 = load i8, ptr %gep_l13, align 16 - %load14 = load i8, ptr %gep_l14, align 16 - %load15 = load i8, ptr %gep_l15, align 16 + %load0 = load i8, ptr %gep_l0 , align 1 + %load1 = load i8, ptr %gep_l1 , align 1 + %load2 = load i8, ptr %gep_l2 , align 1 + %load3 = load i8, ptr %gep_l3 , align 1 + %load4 = load i8, ptr %gep_l4 , align 1 + %load5 = load i8, ptr %gep_l5 , align 1 + %load6 = load i8, ptr %gep_l6 , align 1 + %load7 = load i8, ptr %gep_l7 , align 1 + %load8 = load i8, ptr %gep_l8 , align 1 + %load9 = load i8, ptr %gep_l9 , align 1 + %load10 = load i8, ptr %gep_l10, align 1 + %load11 = load i8, ptr %gep_l11, align 1 + %load12 = load i8, ptr %gep_l12, align 1 + %load13 = load i8, ptr %gep_l13, align 1 + %load14 = load i8, ptr %gep_l14, 
align 1 + %load15 = load i8, ptr %gep_l15, align 1 %gep_s0 = getelementptr inbounds i8, ptr %ps, i64 0 %gep_s1 = getelementptr inbounds i8, ptr %ps, i64 1 @@ -144,22 +144,22 @@ define void @const_stride_1_with_reordering(ptr %pl, ptr %ps) { %gep_s15 = getelementptr inbounds i8, ptr %ps, i64 15 ; NOTE: value from %load1 in stored in %gep_s0 - store i8 %load1, ptr %gep_s0, align 16 - store i8 %load0, ptr %gep_s1, align 16 - store i8 %load2, ptr %gep_s2, align 16 - store i8 %load3, ptr %gep_s3, align 16 - store i8 %load4, ptr %gep_s4, align 16 - store i8 %load5, ptr %gep_s5, align 16 - store i8 %load6, ptr %gep_s6, align 16 - store i8 %load7, ptr %gep_s7, align 16 - store i8 %load8, ptr %gep_s8, align 16 - store i8 %load9, ptr %gep_s9, align 16 - store i8 %load10, ptr %gep_s10, align 16 - store i8 %load11, ptr %gep_s11, align 16 - store i8 %load12, ptr %gep_s12, align 16 - store i8 %load13, ptr %gep_s13, align 16 - store i8 %load14, ptr %gep_s14, align 16 - store i8 %load15, ptr %gep_s15, align 16 + store i8 %load1, ptr %gep_s0, align 1 + store i8 %load0, ptr %gep_s1, align 1 + store i8 %load2, ptr %gep_s2, align 1 + store i8 %load3, ptr %gep_s3, align 1 + store i8 %load4, ptr %gep_s4, align 1 + store i8 %load5, ptr %gep_s5, align 1 + store i8 %load6, ptr %gep_s6, align 1 + store i8 %load7, ptr %gep_s7, align 1 + store i8 %load8, ptr %gep_s8, align 1 + store i8 %load9, ptr %gep_s9, align 1 + store i8 %load10, ptr %gep_s10, align 1 + store i8 %load11, ptr %gep_s11, align 1 + store i8 %load12, ptr %gep_s12, align 1 + store i8 %load13, ptr %gep_s13, align 1 + store i8 %load14, ptr %gep_s14, align 1 + store i8 %load15, ptr %gep_s15, align 1 ret void } @@ -170,9 +170,9 @@ define void @const_stride_2_no_reordering(ptr %pl, ptr %ps) { ; CHECK-SAME: ptr [[PL:%.*]], ptr [[PS:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[GEP_L0:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 0 ; CHECK-NEXT: [[GEP_S0:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = call <31 x i8> @llvm.masked.load.v31i8.p0(ptr [[GEP_L0]], i32 16, <31 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true>, <31 x i8> poison) +; CHECK-NEXT: [[TMP2:%.*]] = call <31 x i8> @llvm.masked.load.v31i8.p0(ptr [[GEP_L0]], i32 1, <31 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true>, <31 x i8> poison) ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <31 x i8> [[TMP2]], <31 x i8> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30> -; CHECK-NEXT: store <16 x i8> [[TMP1]], ptr [[GEP_S0]], align 16 +; CHECK-NEXT: store <16 x i8> [[TMP1]], ptr [[GEP_S0]], align 1 ; CHECK-NEXT: ret void ; %gep_l0 = getelementptr inbounds i8, ptr %pl, i64 0 @@ -192,22 +192,22 @@ define void @const_stride_2_no_reordering(ptr %pl, ptr %ps) { %gep_l14 = getelementptr inbounds i8, ptr %pl, i64 28 %gep_l15 = getelementptr inbounds i8, ptr %pl, i64 30 - %load0 = load i8, ptr %gep_l0 , align 16 - %load1 = load i8, ptr %gep_l1 , align 16 - %load2 = load i8, ptr %gep_l2 , align 16 - 
%load3 = load i8, ptr %gep_l3 , align 16 - %load4 = load i8, ptr %gep_l4 , align 16 - %load5 = load i8, ptr %gep_l5 , align 16 - %load6 = load i8, ptr %gep_l6 , align 16 - %load7 = load i8, ptr %gep_l7 , align 16 - %load8 = load i8, ptr %gep_l8 , align 16 - %load9 = load i8, ptr %gep_l9 , align 16 - %load10 = load i8, ptr %gep_l10, align 16 - %load11 = load i8, ptr %gep_l11, align 16 - %load12 = load i8, ptr %gep_l12, align 16 - %load13 = load i8, ptr %gep_l13, align 16 - %load14 = load i8, ptr %gep_l14, align 16 - %load15 = load i8, ptr %gep_l15, align 16 + %load0 = load i8, ptr %gep_l0 , align 1 + %load1 = load i8, ptr %gep_l1 , align 1 + %load2 = load i8, ptr %gep_l2 , align 1 + %load3 = load i8, ptr %gep_l3 , align 1 + %load4 = load i8, ptr %gep_l4 , align 1 + %load5 = load i8, ptr %gep_l5 , align 1 + %load6 = load i8, ptr %gep_l6 , align 1 + %load7 = load i8, ptr %gep_l7 , align 1 + %load8 = load i8, ptr %gep_l8 , align 1 + %load9 = load i8, ptr %gep_l9 , align 1 + %load10 = load i8, ptr %gep_l10, align 1 + %load11 = load i8, ptr %gep_l11, align 1 + %load12 = load i8, ptr %gep_l12, align 1 + %load13 = load i8, ptr %gep_l13, align 1 + %load14 = load i8, ptr %gep_l14, align 1 + %load15 = load i8, ptr %gep_l15, align 1 %gep_s0 = getelementptr inbounds i8, ptr %ps, i64 0 %gep_s1 = getelementptr inbounds i8, ptr %ps, i64 1 @@ -226,22 +226,22 @@ define void @const_stride_2_no_reordering(ptr %pl, ptr %ps) { %gep_s14 = getelementptr inbounds i8, ptr %ps, i64 14 %gep_s15 = getelementptr inbounds i8, ptr %ps, i64 15 - store i8 %load0, ptr %gep_s0, align 16 - store i8 %load1, ptr %gep_s1, align 16 - store i8 %load2, ptr %gep_s2, align 16 - store i8 %load3, ptr %gep_s3, align 16 - store i8 %load4, ptr %gep_s4, align 16 - store i8 %load5, ptr %gep_s5, align 16 - store i8 %load6, ptr %gep_s6, align 16 - store i8 %load7, ptr %gep_s7, align 16 - store i8 %load8, ptr %gep_s8, align 16 - store i8 %load9, ptr %gep_s9, align 16 - store i8 %load10, ptr %gep_s10, align 16 - store i8 %load11, ptr %gep_s11, align 16 - store i8 %load12, ptr %gep_s12, align 16 - store i8 %load13, ptr %gep_s13, align 16 - store i8 %load14, ptr %gep_s14, align 16 - store i8 %load15, ptr %gep_s15, align 16 + store i8 %load0, ptr %gep_s0, align 1 + store i8 %load1, ptr %gep_s1, align 1 + store i8 %load2, ptr %gep_s2, align 1 + store i8 %load3, ptr %gep_s3, align 1 + store i8 %load4, ptr %gep_s4, align 1 + store i8 %load5, ptr %gep_s5, align 1 + store i8 %load6, ptr %gep_s6, align 1 + store i8 %load7, ptr %gep_s7, align 1 + store i8 %load8, ptr %gep_s8, align 1 + store i8 %load9, ptr %gep_s9, align 1 + store i8 %load10, ptr %gep_s10, align 1 + store i8 %load11, ptr %gep_s11, align 1 + store i8 %load12, ptr %gep_s12, align 1 + store i8 %load13, ptr %gep_s13, align 1 + store i8 %load14, ptr %gep_s14, align 1 + store i8 %load15, ptr %gep_s15, align 1 ret void } @@ -251,10 +251,10 @@ define void @const_stride_2_with_reordering(ptr %pl, ptr %ps) { ; CHECK-SAME: ptr [[PL:%.*]], ptr [[PS:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[GEP_L0:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 0 ; CHECK-NEXT: [[GEP_S0:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 0 -; CHECK-NEXT: [[TMP1:%.*]] = call <31 x i8> @llvm.masked.load.v31i8.p0(ptr [[GEP_L0]], i32 16, <31 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 
false, i1 true>, <31 x i8> poison) +; CHECK-NEXT: [[TMP1:%.*]] = call <31 x i8> @llvm.masked.load.v31i8.p0(ptr [[GEP_L0]], i32 1, <31 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true>, <31 x i8> poison) ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <31 x i8> [[TMP1]], <31 x i8> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30> ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <31 x i8> [[TMP1]], <31 x i8> poison, <16 x i32> <i32 2, i32 0, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30> -; CHECK-NEXT: store <16 x i8> [[TMP2]], ptr [[GEP_S0]], align 16 +; CHECK-NEXT: store <16 x i8> [[TMP2]], ptr [[GEP_S0]], align 1 ; CHECK-NEXT: ret void ; %gep_l0 = getelementptr inbounds i8, ptr %pl, i64 0 @@ -274,22 +274,22 @@ define void @const_stride_2_with_reordering(ptr %pl, ptr %ps) { %gep_l14 = getelementptr inbounds i8, ptr %pl, i64 28 %gep_l15 = getelementptr inbounds i8, ptr %pl, i64 30 - %load0 = load i8, ptr %gep_l0 , align 16 - %load1 = load i8, ptr %gep_l1 , align 16 - %load2 = load i8, ptr %gep_l2 , align 16 - %load3 = load i8, ptr %gep_l3 , align 16 - %load4 = load i8, ptr %gep_l4 , align 16 - %load5 = load i8, ptr %gep_l5 , align 16 - %load6 = load i8, ptr %gep_l6 , align 16 - %load7 = load i8, ptr %gep_l7 , align 16 - %load8 = load i8, ptr %gep_l8 , align 16 - %load9 = load i8, ptr %gep_l9 , align 16 - %load10 = load i8, ptr %gep_l10, align 16 - %load11 = load i8, ptr %gep_l11, align 16 - %load12 = load i8, ptr %gep_l12, align 16 - %load13 = load i8, ptr %gep_l13, align 16 - %load14 = load i8, ptr %gep_l14, align 16 - %load15 = load i8, ptr %gep_l15, align 16 + %load0 = load i8, ptr %gep_l0 , align 1 + %load1 = load i8, ptr %gep_l1 , align 1 + %load2 = load i8, ptr %gep_l2 , align 1 + %load3 = load i8, ptr %gep_l3 , align 1 + %load4 = load i8, ptr %gep_l4 , align 1 + %load5 = load i8, ptr %gep_l5 , align 1 + %load6 = load i8, ptr %gep_l6 , align 1 + %load7 = load i8, ptr %gep_l7 , align 1 + %load8 = load i8, ptr %gep_l8 , align 1 + %load9 = load i8, ptr %gep_l9 , align 1 + %load10 = load i8, ptr %gep_l10, align 1 + %load11 = load i8, ptr %gep_l11, align 1 + %load12 = load i8, ptr %gep_l12, align 1 + %load13 = load i8, ptr %gep_l13, align 1 + %load14 = load i8, ptr %gep_l14, align 1 + %load15 = load i8, ptr %gep_l15, align 1 %gep_s0 = getelementptr inbounds i8, ptr %ps, i64 0 %gep_s1 = getelementptr inbounds i8, ptr %ps, i64 1 @@ -308,22 +308,22 @@ define void @const_stride_2_with_reordering(ptr %pl, ptr %ps) { %gep_s14 = getelementptr inbounds i8, ptr %ps, i64 14 %gep_s15 = getelementptr inbounds i8, ptr %ps, i64 15 - store i8 %load1, ptr %gep_s0, align 16 - store i8 %load0, ptr %gep_s1, align 16 - store i8 %load2, ptr %gep_s2, align 16 - store i8 %load3, ptr %gep_s3, align 16 - store i8 %load4, ptr %gep_s4, align 16 - store i8 %load5, ptr %gep_s5, align 16 - store i8 %load6, ptr %gep_s6, align 16 - store i8 %load7, ptr %gep_s7, align 16 - store i8 %load8, ptr %gep_s8, align 16 - store i8 %load9, ptr %gep_s9, align 16 - store i8 %load10, ptr %gep_s10, align 16 - store i8 %load11, ptr %gep_s11, align 16 - store i8 %load12, ptr %gep_s12, align 16 - store i8 %load13, ptr %gep_s13, align 16 - store i8 
%load14, ptr %gep_s14, align 16 - store i8 %load15, ptr %gep_s15, align 16 + store i8 %load1, ptr %gep_s0, align 1 + store i8 %load0, ptr %gep_s1, align 1 + store i8 %load2, ptr %gep_s2, align 1 + store i8 %load3, ptr %gep_s3, align 1 + store i8 %load4, ptr %gep_s4, align 1 + store i8 %load5, ptr %gep_s5, align 1 + store i8 %load6, ptr %gep_s6, align 1 + store i8 %load7, ptr %gep_s7, align 1 + store i8 %load8, ptr %gep_s8, align 1 + store i8 %load9, ptr %gep_s9, align 1 + store i8 %load10, ptr %gep_s10, align 1 + store i8 %load11, ptr %gep_s11, align 1 + store i8 %load12, ptr %gep_s12, align 1 + store i8 %load13, ptr %gep_s13, align 1 + store i8 %load14, ptr %gep_s14, align 1 + store i8 %load15, ptr %gep_s15, align 1 ret void } @@ -335,8 +335,8 @@ define void @rt_stride_1_no_reordering(ptr %pl, i64 %stride, ptr %ps) { ; CHECK-NEXT: [[GEP_L0:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[STRIDE0]] ; CHECK-NEXT: [[GEP_S0:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 0 ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[STRIDE]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.experimental.vp.strided.load.v16i8.p0.i64(ptr align 16 [[GEP_L0]], i64 [[TMP1]], <16 x i1> splat (i1 true), i32 16) -; CHECK-NEXT: store <16 x i8> [[TMP2]], ptr [[GEP_S0]], align 16 +; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.experimental.vp.strided.load.v16i8.p0.i64(ptr align 1 [[GEP_L0]], i64 [[TMP1]], <16 x i1> splat (i1 true), i32 16) +; CHECK-NEXT: store <16 x i8> [[TMP2]], ptr [[GEP_S0]], align 1 ; CHECK-NEXT: ret void ; %stride0 = mul nsw i64 %stride, 0 @@ -373,22 +373,22 @@ define void @rt_stride_1_no_reordering(ptr %pl, i64 %stride, ptr %ps) { %gep_l14 = getelementptr inbounds i8, ptr %pl, i64 %stride14 %gep_l15 = getelementptr inbounds i8, ptr %pl, i64 %stride15 - %load0 = load i8, ptr %gep_l0 , align 16 - %load1 = load i8, ptr %gep_l1 , align 16 - %load2 = load i8, ptr %gep_l2 , align 16 - %load3 = load i8, ptr %gep_l3 , align 16 - %load4 = load i8, ptr %gep_l4 , align 16 - %load5 = load i8, ptr %gep_l5 , align 16 - %load6 = load i8, ptr %gep_l6 , align 16 - %load7 = load i8, ptr %gep_l7 , align 16 - %load8 = load i8, ptr %gep_l8 , align 16 - %load9 = load i8, ptr %gep_l9 , align 16 - %load10 = load i8, ptr %gep_l10, align 16 - %load11 = load i8, ptr %gep_l11, align 16 - %load12 = load i8, ptr %gep_l12, align 16 - %load13 = load i8, ptr %gep_l13, align 16 - %load14 = load i8, ptr %gep_l14, align 16 - %load15 = load i8, ptr %gep_l15, align 16 + %load0 = load i8, ptr %gep_l0 , align 1 + %load1 = load i8, ptr %gep_l1 , align 1 + %load2 = load i8, ptr %gep_l2 , align 1 + %load3 = load i8, ptr %gep_l3 , align 1 + %load4 = load i8, ptr %gep_l4 , align 1 + %load5 = load i8, ptr %gep_l5 , align 1 + %load6 = load i8, ptr %gep_l6 , align 1 + %load7 = load i8, ptr %gep_l7 , align 1 + %load8 = load i8, ptr %gep_l8 , align 1 + %load9 = load i8, ptr %gep_l9 , align 1 + %load10 = load i8, ptr %gep_l10, align 1 + %load11 = load i8, ptr %gep_l11, align 1 + %load12 = load i8, ptr %gep_l12, align 1 + %load13 = load i8, ptr %gep_l13, align 1 + %load14 = load i8, ptr %gep_l14, align 1 + %load15 = load i8, ptr %gep_l15, align 1 %gep_s0 = getelementptr inbounds i8, ptr %ps, i64 0 %gep_s1 = getelementptr inbounds i8, ptr %ps, i64 1 @@ -407,22 +407,22 @@ define void @rt_stride_1_no_reordering(ptr %pl, i64 %stride, ptr %ps) { %gep_s14 = getelementptr inbounds i8, ptr %ps, i64 14 %gep_s15 = getelementptr inbounds i8, ptr %ps, i64 15 - store i8 %load0, ptr %gep_s0, align 16 - store i8 %load1, ptr %gep_s1, align 16 - store i8 
%load2, ptr %gep_s2, align 16 - store i8 %load3, ptr %gep_s3, align 16 - store i8 %load4, ptr %gep_s4, align 16 - store i8 %load5, ptr %gep_s5, align 16 - store i8 %load6, ptr %gep_s6, align 16 - store i8 %load7, ptr %gep_s7, align 16 - store i8 %load8, ptr %gep_s8, align 16 - store i8 %load9, ptr %gep_s9, align 16 - store i8 %load10, ptr %gep_s10, align 16 - store i8 %load11, ptr %gep_s11, align 16 - store i8 %load12, ptr %gep_s12, align 16 - store i8 %load13, ptr %gep_s13, align 16 - store i8 %load14, ptr %gep_s14, align 16 - store i8 %load15, ptr %gep_s15, align 16 + store i8 %load0, ptr %gep_s0, align 1 + store i8 %load1, ptr %gep_s1, align 1 + store i8 %load2, ptr %gep_s2, align 1 + store i8 %load3, ptr %gep_s3, align 1 + store i8 %load4, ptr %gep_s4, align 1 + store i8 %load5, ptr %gep_s5, align 1 + store i8 %load6, ptr %gep_s6, align 1 + store i8 %load7, ptr %gep_s7, align 1 + store i8 %load8, ptr %gep_s8, align 1 + store i8 %load9, ptr %gep_s9, align 1 + store i8 %load10, ptr %gep_s10, align 1 + store i8 %load11, ptr %gep_s11, align 1 + store i8 %load12, ptr %gep_s12, align 1 + store i8 %load13, ptr %gep_s13, align 1 + store i8 %load14, ptr %gep_s14, align 1 + store i8 %load15, ptr %gep_s15, align 1 ret void } @@ -434,9 +434,9 @@ define void @rt_stride_1_with_reordering(ptr %pl, i64 %stride, ptr %ps) { ; CHECK-NEXT: [[GEP_L0:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[STRIDE0]] ; CHECK-NEXT: [[GEP_S0:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 0 ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[STRIDE]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.experimental.vp.strided.load.v16i8.p0.i64(ptr align 16 [[GEP_L0]], i64 [[TMP1]], <16 x i1> splat (i1 true), i32 16) +; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.experimental.vp.strided.load.v16i8.p0.i64(ptr align 1 [[GEP_L0]], i64 [[TMP1]], <16 x i1> splat (i1 true), i32 16) ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i8> [[TMP2]], <16 x i8> poison, <16 x i32> <i32 1, i32 0, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> -; CHECK-NEXT: store <16 x i8> [[TMP3]], ptr [[GEP_S0]], align 16 +; CHECK-NEXT: store <16 x i8> [[TMP3]], ptr [[GEP_S0]], align 1 ; CHECK-NEXT: ret void ; %stride0 = mul nsw i64 %stride, 0 @@ -473,22 +473,22 @@ define void @rt_stride_1_with_reordering(ptr %pl, i64 %stride, ptr %ps) { %gep_l14 = getelementptr inbounds i8, ptr %pl, i64 %stride14 %gep_l15 = getelementptr inbounds i8, ptr %pl, i64 %stride15 - %load0 = load i8, ptr %gep_l0 , align 16 - %load1 = load i8, ptr %gep_l1 , align 16 - %load2 = load i8, ptr %gep_l2 , align 16 - %load3 = load i8, ptr %gep_l3 , align 16 - %load4 = load i8, ptr %gep_l4 , align 16 - %load5 = load i8, ptr %gep_l5 , align 16 - %load6 = load i8, ptr %gep_l6 , align 16 - %load7 = load i8, ptr %gep_l7 , align 16 - %load8 = load i8, ptr %gep_l8 , align 16 - %load9 = load i8, ptr %gep_l9 , align 16 - %load10 = load i8, ptr %gep_l10, align 16 - %load11 = load i8, ptr %gep_l11, align 16 - %load12 = load i8, ptr %gep_l12, align 16 - %load13 = load i8, ptr %gep_l13, align 16 - %load14 = load i8, ptr %gep_l14, align 16 - %load15 = load i8, ptr %gep_l15, align 16 + %load0 = load i8, ptr %gep_l0 , align 1 + %load1 = load i8, ptr %gep_l1 , align 1 + %load2 = load i8, ptr %gep_l2 , align 1 + %load3 = load i8, ptr %gep_l3 , align 1 + %load4 = load i8, ptr %gep_l4 , align 1 + %load5 = load i8, ptr %gep_l5 , align 1 + %load6 = load i8, ptr %gep_l6 , align 1 + %load7 = load i8, ptr %gep_l7 , align 1 + %load8 = load i8, ptr 
%gep_l8 , align 1 + %load9 = load i8, ptr %gep_l9 , align 1 + %load10 = load i8, ptr %gep_l10, align 1 + %load11 = load i8, ptr %gep_l11, align 1 + %load12 = load i8, ptr %gep_l12, align 1 + %load13 = load i8, ptr %gep_l13, align 1 + %load14 = load i8, ptr %gep_l14, align 1 + %load15 = load i8, ptr %gep_l15, align 1 %gep_s0 = getelementptr inbounds i8, ptr %ps, i64 0 %gep_s1 = getelementptr inbounds i8, ptr %ps, i64 1 @@ -507,22 +507,22 @@ define void @rt_stride_1_with_reordering(ptr %pl, i64 %stride, ptr %ps) { %gep_s14 = getelementptr inbounds i8, ptr %ps, i64 14 %gep_s15 = getelementptr inbounds i8, ptr %ps, i64 15 - store i8 %load1, ptr %gep_s0, align 16 - store i8 %load0, ptr %gep_s1, align 16 - store i8 %load2, ptr %gep_s2, align 16 - store i8 %load3, ptr %gep_s3, align 16 - store i8 %load4, ptr %gep_s4, align 16 - store i8 %load5, ptr %gep_s5, align 16 - store i8 %load6, ptr %gep_s6, align 16 - store i8 %load7, ptr %gep_s7, align 16 - store i8 %load8, ptr %gep_s8, align 16 - store i8 %load9, ptr %gep_s9, align 16 - store i8 %load10, ptr %gep_s10, align 16 - store i8 %load11, ptr %gep_s11, align 16 - store i8 %load12, ptr %gep_s12, align 16 - store i8 %load13, ptr %gep_s13, align 16 - store i8 %load14, ptr %gep_s14, align 16 - store i8 %load15, ptr %gep_s15, align 16 + store i8 %load1, ptr %gep_s0, align 1 + store i8 %load0, ptr %gep_s1, align 1 + store i8 %load2, ptr %gep_s2, align 1 + store i8 %load3, ptr %gep_s3, align 1 + store i8 %load4, ptr %gep_s4, align 1 + store i8 %load5, ptr %gep_s5, align 1 + store i8 %load6, ptr %gep_s6, align 1 + store i8 %load7, ptr %gep_s7, align 1 + store i8 %load8, ptr %gep_s8, align 1 + store i8 %load9, ptr %gep_s9, align 1 + store i8 %load10, ptr %gep_s10, align 1 + store i8 %load11, ptr %gep_s11, align 1 + store i8 %load12, ptr %gep_s12, align 1 + store i8 %load13, ptr %gep_s13, align 1 + store i8 %load14, ptr %gep_s14, align 1 + store i8 %load15, ptr %gep_s15, align 1 ret void } @@ -531,9 +531,9 @@ define void @rt_stride_1_with_reordering(ptr %pl, i64 %stride, ptr %ps) { ; define void @constant_stride_widen_no_reordering(ptr %pl, i64 %stride, ptr %ps) { ; %gep_l0 = getelementptr inbounds i8, ptr %pl, i64 %offset0 ; %gep_s0 = getelementptr inbounds i8, ptr %ps, i64 0 -; %strided_load = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(ptr align 16 %gep_l0, i64 8, <4 x i1> splat (i1 true), i32 4) +; %strided_load = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(ptr align 1 %gep_l0, i64 8, <4 x i1> splat (i1 true), i32 4) ; %bitcast_ = bitcast <4 x i32> %strided_load to <16 x i8> -; store <16 x i8> %bitcast_, ptr %gep_s0, align 16 +; store <16 x i8> %bitcast_, ptr %gep_s0, align 1 ; ret void ; } define void @constant_stride_widen_no_reordering(ptr %pl, i64 %stride, ptr %ps) { @@ -541,9 +541,9 @@ define void @constant_stride_widen_no_reordering(ptr %pl, i64 %stride, ptr %ps) ; CHECK-SAME: ptr [[PL:%.*]], i64 [[STRIDE:%.*]], ptr [[PS:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[GEP_L0:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 0 ; CHECK-NEXT: [[GEP_S0:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 0 -; CHECK-NEXT: [[TMP1:%.*]] = call <28 x i8> @llvm.masked.load.v28i8.p0(ptr [[GEP_L0]], i32 16, <28 x i1> <i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>, <28 x i8> poison) +; CHECK-NEXT: [[TMP1:%.*]] = call 
<28 x i8> @llvm.masked.load.v28i8.p0(ptr [[GEP_L0]], i32 1, <28 x i1> <i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>, <28 x i8> poison) ; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <28 x i8> [[TMP1]], <28 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27> -; CHECK-NEXT: store <16 x i8> [[TMP8]], ptr [[GEP_S0]], align 16 +; CHECK-NEXT: store <16 x i8> [[TMP8]], ptr [[GEP_S0]], align 1 ; CHECK-NEXT: ret void ; %gep_l0 = getelementptr inbounds i8, ptr %pl, i64 0 @@ -563,22 +563,22 @@ define void @constant_stride_widen_no_reordering(ptr %pl, i64 %stride, ptr %ps) %gep_l14 = getelementptr inbounds i8, ptr %pl, i64 26 %gep_l15 = getelementptr inbounds i8, ptr %pl, i64 27 - %load0 = load i8, ptr %gep_l0 , align 16 - %load1 = load i8, ptr %gep_l1 , align 16 - %load2 = load i8, ptr %gep_l2 , align 16 - %load3 = load i8, ptr %gep_l3 , align 16 - %load4 = load i8, ptr %gep_l4 , align 16 - %load5 = load i8, ptr %gep_l5 , align 16 - %load6 = load i8, ptr %gep_l6 , align 16 - %load7 = load i8, ptr %gep_l7 , align 16 - %load8 = load i8, ptr %gep_l8 , align 16 - %load9 = load i8, ptr %gep_l9 , align 16 - %load10 = load i8, ptr %gep_l10, align 16 - %load11 = load i8, ptr %gep_l11, align 16 - %load12 = load i8, ptr %gep_l12, align 16 - %load13 = load i8, ptr %gep_l13, align 16 - %load14 = load i8, ptr %gep_l14, align 16 - %load15 = load i8, ptr %gep_l15, align 16 + %load0 = load i8, ptr %gep_l0 , align 1 + %load1 = load i8, ptr %gep_l1 , align 1 + %load2 = load i8, ptr %gep_l2 , align 1 + %load3 = load i8, ptr %gep_l3 , align 1 + %load4 = load i8, ptr %gep_l4 , align 1 + %load5 = load i8, ptr %gep_l5 , align 1 + %load6 = load i8, ptr %gep_l6 , align 1 + %load7 = load i8, ptr %gep_l7 , align 1 + %load8 = load i8, ptr %gep_l8 , align 1 + %load9 = load i8, ptr %gep_l9 , align 1 + %load10 = load i8, ptr %gep_l10, align 1 + %load11 = load i8, ptr %gep_l11, align 1 + %load12 = load i8, ptr %gep_l12, align 1 + %load13 = load i8, ptr %gep_l13, align 1 + %load14 = load i8, ptr %gep_l14, align 1 + %load15 = load i8, ptr %gep_l15, align 1 %gep_s0 = getelementptr inbounds i8, ptr %ps, i64 0 %gep_s1 = getelementptr inbounds i8, ptr %ps, i64 1 @@ -597,22 +597,22 @@ define void @constant_stride_widen_no_reordering(ptr %pl, i64 %stride, ptr %ps) %gep_s14 = getelementptr inbounds i8, ptr %ps, i64 14 %gep_s15 = getelementptr inbounds i8, ptr %ps, i64 15 - store i8 %load0, ptr %gep_s0, align 16 - store i8 %load1, ptr %gep_s1, align 16 - store i8 %load2, ptr %gep_s2, align 16 - store i8 %load3, ptr %gep_s3, align 16 - store i8 %load4, ptr %gep_s4, align 16 - store i8 %load5, ptr %gep_s5, align 16 - store i8 %load6, ptr %gep_s6, align 16 - store i8 %load7, ptr %gep_s7, align 16 - store i8 %load8, ptr %gep_s8, align 16 - store i8 %load9, ptr %gep_s9, align 16 - store i8 %load10, ptr %gep_s10, align 16 - store i8 %load11, ptr %gep_s11, align 16 - store i8 %load12, ptr %gep_s12, align 16 - store i8 %load13, ptr %gep_s13, align 16 - store i8 %load14, ptr %gep_s14, align 16 - store i8 %load15, ptr %gep_s15, align 16 + store i8 %load0, ptr %gep_s0, align 1 + store i8 %load1, ptr %gep_s1, align 1 + store i8 %load2, ptr %gep_s2, align 1 + store i8 %load3, ptr %gep_s3, align 1 + store i8 %load4, ptr %gep_s4, align 1 + store i8 
%load5, ptr %gep_s5, align 1 + store i8 %load6, ptr %gep_s6, align 1 + store i8 %load7, ptr %gep_s7, align 1 + store i8 %load8, ptr %gep_s8, align 1 + store i8 %load9, ptr %gep_s9, align 1 + store i8 %load10, ptr %gep_s10, align 1 + store i8 %load11, ptr %gep_s11, align 1 + store i8 %load12, ptr %gep_s12, align 1 + store i8 %load13, ptr %gep_s13, align 1 + store i8 %load14, ptr %gep_s14, align 1 + store i8 %load15, ptr %gep_s15, align 1 ret void } @@ -621,9 +621,9 @@ define void @constant_stride_widen_no_reordering(ptr %pl, i64 %stride, ptr %ps) ; define void @rt_stride_widen_no_reordering(ptr %pl, i64 %stride, ptr %ps) { ; %gep_l0 = getelementptr inbounds i8, ptr %pl, i64 %offset0 ; %gep_s0 = getelementptr inbounds i8, ptr %ps, i64 0 -; %strided_load = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(ptr align 16 %gep_l0, i64 %stride, <4 x i1> splat (i1 true), i32 4) +; %strided_load = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(ptr align 1 %gep_l0, i64 %stride, <4 x i1> splat (i1 true), i32 4) ; %bitcast_ = bitcast <4 x i32> %strided_load to <16 x i8> -; store <16 x i8> %bitcast_, ptr %gep_s0, align 16 +; store <16 x i8> %bitcast_, ptr %gep_s0, align 1 ; ret void ; } define void @rt_stride_widen_no_reordering(ptr %pl, i64 %stride, ptr %ps) { @@ -638,10 +638,10 @@ define void @rt_stride_widen_no_reordering(ptr %pl, i64 %stride, ptr %ps) { ; CHECK-NEXT: [[GEP_L8:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[OFFSET8]] ; CHECK-NEXT: [[GEP_L12:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[OFFSET12]] ; CHECK-NEXT: [[GEP_S0:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 0 -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[GEP_L0]], align 16 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[GEP_L4]], align 16 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[GEP_L8]], align 16 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[GEP_L12]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[GEP_L0]], align 1 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[GEP_L4]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[GEP_L8]], align 1 +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[GEP_L12]], align 1 ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> @@ -649,7 +649,7 @@ define void @rt_stride_widen_no_reordering(ptr %pl, i64 %stride, ptr %ps) { ; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x i8> [[TMP7]], <16 x i8> [[TMP11]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison> ; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, 
i32 poison> ; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i8> [[TMP9]], <16 x i8> [[TMP10]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19> -; CHECK-NEXT: store <16 x i8> [[TMP8]], ptr [[GEP_S0]], align 16 +; CHECK-NEXT: store <16 x i8> [[TMP8]], ptr [[GEP_S0]], align 1 ; CHECK-NEXT: ret void ; %offset0 = mul nsw i64 %stride, 0 @@ -686,22 +686,22 @@ define void @rt_stride_widen_no_reordering(ptr %pl, i64 %stride, ptr %ps) { %gep_l14 = getelementptr inbounds i8, ptr %pl, i64 %offset14 %gep_l15 = getelementptr inbounds i8, ptr %pl, i64 %offset15 - %load0 = load i8, ptr %gep_l0 , align 16 - %load1 = load i8, ptr %gep_l1 , align 16 - %load2 = load i8, ptr %gep_l2 , align 16 - %load3 = load i8, ptr %gep_l3 , align 16 - %load4 = load i8, ptr %gep_l4 , align 16 - %load5 = load i8, ptr %gep_l5 , align 16 - %load6 = load i8, ptr %gep_l6 , align 16 - %load7 = load i8, ptr %gep_l7 , align 16 - %load8 = load i8, ptr %gep_l8 , align 16 - %load9 = load i8, ptr %gep_l9 , align 16 - %load10 = load i8, ptr %gep_l10, align 16 - %load11 = load i8, ptr %gep_l11, align 16 - %load12 = load i8, ptr %gep_l12, align 16 - %load13 = load i8, ptr %gep_l13, align 16 - %load14 = load i8, ptr %gep_l14, align 16 - %load15 = load i8, ptr %gep_l15, align 16 + %load0 = load i8, ptr %gep_l0 , align 1 + %load1 = load i8, ptr %gep_l1 , align 1 + %load2 = load i8, ptr %gep_l2 , align 1 + %load3 = load i8, ptr %gep_l3 , align 1 + %load4 = load i8, ptr %gep_l4 , align 1 + %load5 = load i8, ptr %gep_l5 , align 1 + %load6 = load i8, ptr %gep_l6 , align 1 + %load7 = load i8, ptr %gep_l7 , align 1 + %load8 = load i8, ptr %gep_l8 , align 1 + %load9 = load i8, ptr %gep_l9 , align 1 + %load10 = load i8, ptr %gep_l10, align 1 + %load11 = load i8, ptr %gep_l11, align 1 + %load12 = load i8, ptr %gep_l12, align 1 + %load13 = load i8, ptr %gep_l13, align 1 + %load14 = load i8, ptr %gep_l14, align 1 + %load15 = load i8, ptr %gep_l15, align 1 %gep_s0 = getelementptr inbounds i8, ptr %ps, i64 0 %gep_s1 = getelementptr inbounds i8, ptr %ps, i64 1 @@ -720,22 +720,22 @@ define void @rt_stride_widen_no_reordering(ptr %pl, i64 %stride, ptr %ps) { %gep_s14 = getelementptr inbounds i8, ptr %ps, i64 14 %gep_s15 = getelementptr inbounds i8, ptr %ps, i64 15 - store i8 %load0, ptr %gep_s0, align 16 - store i8 %load1, ptr %gep_s1, align 16 - store i8 %load2, ptr %gep_s2, align 16 - store i8 %load3, ptr %gep_s3, align 16 - store i8 %load4, ptr %gep_s4, align 16 - store i8 %load5, ptr %gep_s5, align 16 - store i8 %load6, ptr %gep_s6, align 16 - store i8 %load7, ptr %gep_s7, align 16 - store i8 %load8, ptr %gep_s8, align 16 - store i8 %load9, ptr %gep_s9, align 16 - store i8 %load10, ptr %gep_s10, align 16 - store i8 %load11, ptr %gep_s11, align 16 - store i8 %load12, ptr %gep_s12, align 16 - store i8 %load13, ptr %gep_s13, align 16 - store i8 %load14, ptr %gep_s14, align 16 - store i8 %load15, ptr %gep_s15, align 16 + store i8 %load0, ptr %gep_s0, align 1 + store i8 %load1, ptr %gep_s1, align 1 + store i8 %load2, ptr %gep_s2, align 1 + store i8 %load3, ptr %gep_s3, align 1 + store i8 %load4, ptr %gep_s4, align 1 + store i8 %load5, ptr %gep_s5, align 1 + store i8 %load6, ptr %gep_s6, align 1 + store i8 %load7, ptr %gep_s7, align 1 + store i8 %load8, ptr %gep_s8, align 1 + store i8 %load9, ptr %gep_s9, align 1 + store i8 %load10, ptr %gep_s10, align 1 + store i8 %load11, ptr %gep_s11, align 1 + store i8 %load12, ptr %gep_s12, align 1 + store i8 %load13, ptr %gep_s13, align 1 
+ store i8 %load14, ptr %gep_s14, align 1 + store i8 %load15, ptr %gep_s15, align 1 ret void } diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/test-delete-tree.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/test-delete-tree.ll new file mode 100644 index 0000000..c4e6c4e --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/test-delete-tree.ll @@ -0,0 +1,83 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -mtriple=riscv64 -mattr=+m,+v -passes=slp-vectorizer -S < %s | FileCheck %s + +; CHECK-NOT: TreeEntryToStridedPtrInfoMap is not cleared +define void @const_stride_1_no_reordering(ptr %pl, ptr %ps) { +; CHECK-LABEL: define void @const_stride_1_no_reordering( +; CHECK-SAME: ptr [[PL:%.*]], ptr [[PS:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[GEP_L0:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 0 +; CHECK-NEXT: [[GEP_S0:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[GEP_L0]], align 1 +; CHECK-NEXT: store <16 x i8> [[TMP1]], ptr [[GEP_S0]], align 1 +; CHECK-NEXT: ret void +; + %gep_l0 = getelementptr inbounds i8, ptr %pl, i64 0 + %gep_l1 = getelementptr inbounds i8, ptr %pl, i64 1 + %gep_l2 = getelementptr inbounds i8, ptr %pl, i64 2 + %gep_l3 = getelementptr inbounds i8, ptr %pl, i64 3 + %gep_l4 = getelementptr inbounds i8, ptr %pl, i64 4 + %gep_l5 = getelementptr inbounds i8, ptr %pl, i64 5 + %gep_l6 = getelementptr inbounds i8, ptr %pl, i64 6 + %gep_l7 = getelementptr inbounds i8, ptr %pl, i64 7 + %gep_l8 = getelementptr inbounds i8, ptr %pl, i64 8 + %gep_l9 = getelementptr inbounds i8, ptr %pl, i64 9 + %gep_l10 = getelementptr inbounds i8, ptr %pl, i64 10 + %gep_l11 = getelementptr inbounds i8, ptr %pl, i64 11 + %gep_l12 = getelementptr inbounds i8, ptr %pl, i64 12 + %gep_l13 = getelementptr inbounds i8, ptr %pl, i64 13 + %gep_l14 = getelementptr inbounds i8, ptr %pl, i64 14 + %gep_l15 = getelementptr inbounds i8, ptr %pl, i64 15 + + %load0 = load i8, ptr %gep_l0 + %load1 = load i8, ptr %gep_l1 + %load2 = load i8, ptr %gep_l2 + %load3 = load i8, ptr %gep_l3 + %load4 = load i8, ptr %gep_l4 + %load5 = load i8, ptr %gep_l5 + %load6 = load i8, ptr %gep_l6 + %load7 = load i8, ptr %gep_l7 + %load8 = load i8, ptr %gep_l8 + %load9 = load i8, ptr %gep_l9 + %load10 = load i8, ptr %gep_l10 + %load11 = load i8, ptr %gep_l11 + %load12 = load i8, ptr %gep_l12 + %load13 = load i8, ptr %gep_l13 + %load14 = load i8, ptr %gep_l14 + %load15 = load i8, ptr %gep_l15 + + %gep_s0 = getelementptr inbounds i8, ptr %ps, i64 0 + %gep_s1 = getelementptr inbounds i8, ptr %ps, i64 1 + %gep_s2 = getelementptr inbounds i8, ptr %ps, i64 2 + %gep_s3 = getelementptr inbounds i8, ptr %ps, i64 3 + %gep_s4 = getelementptr inbounds i8, ptr %ps, i64 4 + %gep_s5 = getelementptr inbounds i8, ptr %ps, i64 5 + %gep_s6 = getelementptr inbounds i8, ptr %ps, i64 6 + %gep_s7 = getelementptr inbounds i8, ptr %ps, i64 7 + %gep_s8 = getelementptr inbounds i8, ptr %ps, i64 8 + %gep_s9 = getelementptr inbounds i8, ptr %ps, i64 9 + %gep_s10 = getelementptr inbounds i8, ptr %ps, i64 10 + %gep_s11 = getelementptr inbounds i8, ptr %ps, i64 11 + %gep_s12 = getelementptr inbounds i8, ptr %ps, i64 12 + %gep_s13 = getelementptr inbounds i8, ptr %ps, i64 13 + %gep_s14 = getelementptr inbounds i8, ptr %ps, i64 14 + %gep_s15 = getelementptr inbounds i8, ptr %ps, i64 15 + + store i8 %load0, ptr %gep_s0 + store i8 %load1, ptr %gep_s1 + store i8 %load2, ptr %gep_s2 + store i8 %load3, ptr %gep_s3 + store i8 %load4, ptr 
%gep_s4 + store i8 %load5, ptr %gep_s5 + store i8 %load6, ptr %gep_s6 + store i8 %load7, ptr %gep_s7 + store i8 %load8, ptr %gep_s8 + store i8 %load9, ptr %gep_s9 + store i8 %load10, ptr %gep_s10 + store i8 %load11, ptr %gep_s11 + store i8 %load12, ptr %gep_s12 + store i8 %load13, ptr %gep_s13 + store i8 %load14, ptr %gep_s14 + store i8 %load15, ptr %gep_s15 + + ret void +} diff --git a/llvm/test/Transforms/VectorCombine/X86/bitop-of-castops.ll b/llvm/test/Transforms/VectorCombine/X86/bitop-of-castops.ll index 79e72aa..38c624e 100644 --- a/llvm/test/Transforms/VectorCombine/X86/bitop-of-castops.ll +++ b/llvm/test/Transforms/VectorCombine/X86/bitop-of-castops.ll @@ -357,7 +357,7 @@ define <4 x i32> @or_sext_v4i8_to_v4i32_constant_with_loss(<4 x i8> %a) { define <4 x i16> @and_trunc_nuw_nsw_constant(<4 x i32> %a) { ; CHECK-LABEL: @and_trunc_nuw_nsw_constant( ; CHECK-NEXT: [[AND_INNER:%.*]] = and <4 x i32> [[A:%.*]], <i32 1, i32 2, i32 3, i32 4> -; CHECK-NEXT: [[AND:%.*]] = trunc <4 x i32> [[AND_INNER]] to <4 x i16> +; CHECK-NEXT: [[AND:%.*]] = trunc nuw nsw <4 x i32> [[AND_INNER]] to <4 x i16> ; CHECK-NEXT: ret <4 x i16> [[AND]] ; %t1 = trunc nuw nsw <4 x i32> %a to <4 x i16> @@ -368,7 +368,7 @@ define <4 x i16> @and_trunc_nuw_nsw_constant(<4 x i32> %a) { define <4 x i8> @and_trunc_nuw_nsw_minus_constant(<4 x i32> %a) { ; CHECK-LABEL: @and_trunc_nuw_nsw_minus_constant( ; CHECK-NEXT: [[AND_INNER:%.*]] = and <4 x i32> [[A:%.*]], <i32 240, i32 241, i32 242, i32 243> -; CHECK-NEXT: [[AND:%.*]] = trunc <4 x i32> [[AND_INNER]] to <4 x i8> +; CHECK-NEXT: [[AND:%.*]] = trunc nuw <4 x i32> [[AND_INNER]] to <4 x i8> ; CHECK-NEXT: ret <4 x i8> [[AND]] ; %t1 = trunc nuw nsw <4 x i32> %a to <4 x i8> @@ -379,7 +379,7 @@ define <4 x i8> @and_trunc_nuw_nsw_minus_constant(<4 x i32> %a) { define <4 x i8> @and_trunc_nuw_nsw_multiconstant(<4 x i32> %a) { ; CHECK-LABEL: @and_trunc_nuw_nsw_multiconstant( ; CHECK-NEXT: [[AND_INNER:%.*]] = and <4 x i32> [[A:%.*]], <i32 240, i32 1, i32 242, i32 3> -; CHECK-NEXT: [[AND:%.*]] = trunc <4 x i32> [[AND_INNER]] to <4 x i8> +; CHECK-NEXT: [[AND:%.*]] = trunc nuw <4 x i32> [[AND_INNER]] to <4 x i8> ; CHECK-NEXT: ret <4 x i8> [[AND]] ; %t1 = trunc nuw nsw <4 x i32> %a to <4 x i8> @@ -391,7 +391,7 @@ define <4 x i8> @and_trunc_nuw_nsw_multiconstant(<4 x i32> %a) { define <4 x i32> @or_zext_nneg_constant(<4 x i16> %a) { ; CHECK-LABEL: @or_zext_nneg_constant( ; CHECK-NEXT: [[OR_INNER:%.*]] = or <4 x i16> [[A:%.*]], <i16 1, i16 2, i16 3, i16 4> -; CHECK-NEXT: [[OR:%.*]] = zext <4 x i16> [[OR_INNER]] to <4 x i32> +; CHECK-NEXT: [[OR:%.*]] = zext nneg <4 x i16> [[OR_INNER]] to <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[OR]] ; %z1 = zext nneg <4 x i16> %a to <4 x i32> diff --git a/llvm/unittests/IR/IntrinsicsTest.cpp b/llvm/unittests/IR/IntrinsicsTest.cpp index 49af836..cfd99ed 100644 --- a/llvm/unittests/IR/IntrinsicsTest.cpp +++ b/llvm/unittests/IR/IntrinsicsTest.cpp @@ -189,4 +189,12 @@ TEST_F(IntrinsicsTest, InstrProfInheritance) { } } +// Check that getFnAttributes for intrinsics that do not have any function +// attributes correctly returns an empty set.
+TEST(IntrinsicAttributes, TestGetFnAttributesBug) { + using namespace Intrinsic; + LLVMContext Context; + AttributeSet AS = getFnAttributes(Context, experimental_guard); + EXPECT_FALSE(AS.hasAttributes()); +} } // end namespace diff --git a/llvm/unittests/Support/MustacheTest.cpp b/llvm/unittests/Support/MustacheTest.cpp index 0ebbc58..83f6e9a 100644 --- a/llvm/unittests/Support/MustacheTest.cpp +++ b/llvm/unittests/Support/MustacheTest.cpp @@ -1335,7 +1335,7 @@ TEST(MustacheDelimiters, PairBehavior) { std::string Out; raw_string_ostream OS(Out); T.render(D, OS); - EXPECT_NE("(Hey!)", Out); + EXPECT_EQ("(Hey!)", Out); } TEST(MustacheDelimiters, SpecialCharacters) { @@ -1344,7 +1344,7 @@ TEST(MustacheDelimiters, SpecialCharacters) { std::string Out; raw_string_ostream OS(Out); T.render(D, OS); - EXPECT_NE("(It worked!)", Out); + EXPECT_EQ("(It worked!)", Out); } TEST(MustacheDelimiters, Sections) { @@ -1355,7 +1355,7 @@ TEST(MustacheDelimiters, Sections) { std::string Out; raw_string_ostream OS(Out); T.render(D, OS); - EXPECT_NE("[\n I got interpolated.\n |data|\n\n {{data}}\n I got " + EXPECT_EQ("[\n I got interpolated.\n |data|\n\n {{data}}\n I got " "interpolated.\n]\n", Out); } @@ -1368,7 +1368,7 @@ TEST(MustacheDelimiters, InvertedSections) { std::string Out; raw_string_ostream OS(Out); T.render(D, OS); - EXPECT_NE("[\n I got interpolated.\n |data|\n\n {{data}}\n I got " + EXPECT_EQ("[\n I got interpolated.\n |data|\n\n {{data}}\n I got " "interpolated.\n]\n", Out); } @@ -1380,7 +1380,7 @@ TEST(MustacheDelimiters, PartialInheritence) { std::string Out; raw_string_ostream OS(Out); T.render(D, OS); - EXPECT_NE("[ .yes. ]\n[ .yes. ]\n", Out); + EXPECT_EQ("[ .yes. ]\n[ .yes. ]\n", Out); } TEST(MustacheDelimiters, PostPartialBehavior) { @@ -1390,7 +1390,7 @@ TEST(MustacheDelimiters, PostPartialBehavior) { std::string Out; raw_string_ostream OS(Out); T.render(D, OS); - EXPECT_NE("[ .yes. .yes. ]\n[ .yes. .|value|. ]\n", Out); + EXPECT_EQ("[ .yes. .yes. ]\n[ .yes. .|value|. 
]\n", Out); } TEST(MustacheDelimiters, SurroundingWhitespace) { @@ -1417,7 +1417,7 @@ TEST(MustacheDelimiters, StandaloneTag) { std::string Out; raw_string_ostream OS(Out); T.render(D, OS); - EXPECT_NE("Begin.\nEnd.\n", Out); + EXPECT_EQ("Begin.\nEnd.\n", Out); } TEST(MustacheDelimiters, IndentedStandaloneTag) { @@ -1426,7 +1426,7 @@ TEST(MustacheDelimiters, IndentedStandaloneTag) { std::string Out; raw_string_ostream OS(Out); T.render(D, OS); - EXPECT_NE("Begin.\nEnd.\n", Out); + EXPECT_EQ("Begin.\nEnd.\n", Out); } TEST(MustacheDelimiters, StandaloneLineEndings) { @@ -1435,7 +1435,7 @@ TEST(MustacheDelimiters, StandaloneLineEndings) { std::string Out; raw_string_ostream OS(Out); T.render(D, OS); - EXPECT_NE("|\r\n|", Out); + EXPECT_EQ("|\r\n|", Out); } TEST(MustacheDelimiters, StandaloneWithoutPreviousLine) { @@ -1444,7 +1444,7 @@ TEST(MustacheDelimiters, StandaloneWithoutPreviousLine) { std::string Out; raw_string_ostream OS(Out); T.render(D, OS); - EXPECT_NE("=", Out); + EXPECT_EQ("=", Out); } TEST(MustacheDelimiters, StandaloneWithoutNewline) { @@ -1453,7 +1453,7 @@ TEST(MustacheDelimiters, StandaloneWithoutNewline) { std::string Out; raw_string_ostream OS(Out); T.render(D, OS); - EXPECT_NE("=\n", Out); + EXPECT_EQ("=\n", Out); } TEST(MustacheDelimiters, PairwithPadding) { diff --git a/llvm/unittests/Support/TypeSizeTest.cpp b/llvm/unittests/Support/TypeSizeTest.cpp index b02b7e6..018b240 100644 --- a/llvm/unittests/Support/TypeSizeTest.cpp +++ b/llvm/unittests/Support/TypeSizeTest.cpp @@ -58,6 +58,7 @@ static_assert(ElementCount::getFixed(8).divideCoefficientBy(2) == static_assert(ElementCount::getFixed(8).multiplyCoefficientBy(3) == ElementCount::getFixed(24)); static_assert(ElementCount::getFixed(8).isKnownMultipleOf(2)); +static_assert(!ElementCount::getFixed(8).isKnownMultipleOf(0)); constexpr TypeSize TSFixed0 = TypeSize::getFixed(0); constexpr TypeSize TSFixed1 = TypeSize::getFixed(1); diff --git a/llvm/utils/TableGen/Basic/IntrinsicEmitter.cpp b/llvm/utils/TableGen/Basic/IntrinsicEmitter.cpp index 559868d..75dffb1 100644 --- a/llvm/utils/TableGen/Basic/IntrinsicEmitter.cpp +++ b/llvm/utils/TableGen/Basic/IntrinsicEmitter.cpp @@ -794,12 +794,15 @@ AttributeSet Intrinsic::getFnAttributes(LLVMContext &C, ID id) {{ if (id == 0) return AttributeSet(); auto [FnAttrID, _] = unpackID(IntrinsicsToAttributesMap[id - 1]); + if (FnAttrID == {}) + return AttributeSet(); return getIntrinsicFnAttributeSet(C, FnAttrID); } #endif // GET_INTRINSIC_ATTRIBUTES )", - UniqAttributesBitSize, MaxNumAttrs, NoFunctionAttrsID); + UniqAttributesBitSize, MaxNumAttrs, NoFunctionAttrsID, + NoFunctionAttrsID); } void IntrinsicEmitter::EmitIntrinsicToBuiltinMap( diff --git a/llvm/utils/llvm-test-mustache-spec/llvm-test-mustache-spec.cpp b/llvm/utils/llvm-test-mustache-spec/llvm-test-mustache-spec.cpp index ea1395b..bdcef37 100644 --- a/llvm/utils/llvm-test-mustache-spec/llvm-test-mustache-spec.cpp +++ b/llvm/utils/llvm-test-mustache-spec/llvm-test-mustache-spec.cpp @@ -54,20 +54,6 @@ static int NumXFail = 0; static int NumSuccess = 0; static const StringMap<StringSet<>> XFailTestNames = {{ - {"delimiters.json", - { - "Pair Behavior", - "Special Characters", - "Sections", - "Inverted Sections", - "Partial Inheritence", - "Post-Partial Behavior", - "Standalone Tag", - "Indented Standalone Tag", - "Standalone Line Endings", - "Standalone Without Previous Line", - "Standalone Without Newline", - }}, {"~dynamic-names.json", { "Basic Behavior - Partial", @@ -113,7 +99,6 @@ static const StringMap<StringSet<>> 
XFailTestNames = {{ "Block reindentation", "Intrinsic indentation", "Nested block reindentation", - }}, {"~lambdas.json", { @@ -126,7 +111,6 @@ static const StringMap<StringSet<>> XFailTestNames = {{ "Section - Expansion", "Section - Alternate Delimiters", "Section - Multiple Calls", - }}, {"partials.json", {"Standalone Indentation"}}, }}; diff --git a/mlir/cmake/modules/AddMLIRPython.cmake b/mlir/cmake/modules/AddMLIRPython.cmake index 208cbdd..fa6aec8 100644 --- a/mlir/cmake/modules/AddMLIRPython.cmake +++ b/mlir/cmake/modules/AddMLIRPython.cmake @@ -173,9 +173,32 @@ function(mlir_generate_type_stubs) if(ARG_VERBOSE) message(STATUS "Generating type-stubs outputs ${_generated_type_stubs}") endif() + + # If PYTHONPATH is set and points to the build location of the python package then when stubgen runs, _mlir will get + # imported twice and bad things will happen (e.g., Assertion `!instance && "PyGlobals already constructed"'). + # This happens because mlir is a namespace package and the importer/loader can't distinguish between + # mlir._mlir_libs._mlir and _mlir_libs._mlir imported from CWD. + # So try to filter out any entries in PYTHONPATH that end in "MLIR_BINDINGS_PYTHON_INSTALL_PREFIX/.." + # (e.g., python_packages/mlir_core/). + set(_pythonpath "$ENV{PYTHONPATH}") + cmake_path(CONVERT "${MLIR_BINDINGS_PYTHON_INSTALL_PREFIX}/.." TO_NATIVE_PATH_LIST _install_prefix NORMALIZE) + if(WIN32) + set(_path_sep ";") + set(_trailing_sep "\\") + else() + set(_path_sep ":") + set(_trailing_sep "/") + # `;` is the CMake list delimiter so Windows paths are automatically lists + # and Unix paths can be made into lists by replacing `:` with `;` + string(REPLACE "${_path_sep}" ";" _pythonpath "${_pythonpath}") endif() + string(REGEX REPLACE "${_trailing_sep}$" "" _install_prefix "${_install_prefix}") + list(FILTER _pythonpath EXCLUDE REGEX "(${_install_prefix}|${_install_prefix}${_trailing_sep})$") + # Note, ${_pythonpath} is a list but "${_pythonpath}" is not a list - it's a string with ";" chars in it. + string(JOIN "${_path_sep}" _pythonpath ${_pythonpath}) add_custom_command( OUTPUT ${_generated_type_stubs} - COMMAND ${_nb_stubgen_cmd} + COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH="${_pythonpath}" ${_nb_stubgen_cmd} WORKING_DIRECTORY "${CMAKE_CURRENT_FUNCTION_LIST_DIR}" DEPENDS "${ARG_DEPENDS_TARGETS}" DEPFILE "${_depfile}" diff --git a/mlir/include/mlir/Dialect/Arith/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Arith/Transforms/Transforms.h index 8d6c68c..ffd367e 100644 --- a/mlir/include/mlir/Dialect/Arith/Transforms/Transforms.h +++ b/mlir/include/mlir/Dialect/Arith/Transforms/Transforms.h @@ -53,7 +53,7 @@ reifyValueBound(OpBuilder &b, Location loc, presburger::BoundType type, /// ValueBoundsOpInterface, no bound can be computed.
FailureOr<OpFoldResult> reifyIndexValueBound( OpBuilder &b, Location loc, presburger::BoundType type, Value value, - ValueBoundsConstraintSet::StopConditionFn stopCondition = nullptr, + const ValueBoundsConstraintSet::StopConditionFn &stopCondition = nullptr, bool closedUB = false); /// Reify a bound for the specified dimension of the given shaped value in terms @@ -65,7 +65,7 @@ FailureOr<OpFoldResult> reifyIndexValueBound( FailureOr<OpFoldResult> reifyShapedValueDimBound( OpBuilder &b, Location loc, presburger::BoundType type, Value value, int64_t dim, - ValueBoundsConstraintSet::StopConditionFn stopCondition = nullptr, + const ValueBoundsConstraintSet::StopConditionFn &stopCondition = nullptr, bool closedUB = false); } // namespace arith diff --git a/mlir/include/mlir/Dialect/Math/IR/MathOps.td b/mlir/include/mlir/Dialect/Math/IR/MathOps.td index cfd8c4b..af65af6 100644 --- a/mlir/include/mlir/Dialect/Math/IR/MathOps.td +++ b/mlir/include/mlir/Dialect/Math/IR/MathOps.td @@ -511,6 +511,43 @@ def Math_SinhOp : Math_FloatUnaryOp<"sinh"> { } //===----------------------------------------------------------------------===// +// SinCosOp +//===----------------------------------------------------------------------===// + +def Math_SincosOp : Math_Op<"sincos", + [SameOperandsAndResultShape, + DeclareOpInterfaceMethods<ArithFastMathInterface>, + AllTypesMatch<["operand", "sin", "cos"]>]> { + let summary = "sine and cosine of the specified value"; + let description = [{ + The `sincos` operation computes both the sine and cosine of a given value + simultaneously. It takes one operand of floating point type (i.e., scalar, + tensor or vector) and returns two results of the same type. This operation + can be more efficient than computing sine and cosine separately when both + values are needed. + + Example: + + ```mlir + // Scalar sine and cosine values. + %sin, %cos = math.sincos %input : f64 + ``` + }]; + + let arguments = (ins FloatLike:$operand, + DefaultValuedAttr<Arith_FastMathAttr, + "::mlir::arith::FastMathFlags::none">:$fastmath); + let results = (outs FloatLike:$sin, FloatLike:$cos); + + let assemblyFormat = [{ $operand (`fastmath` `` $fastmath^)? + attr-dict `:` type($operand) }]; + + let extraClassDeclaration = [{ + std::optional<SmallVector<int64_t, 4>> getShapeForUnroll(); + }]; +} + +//===----------------------------------------------------------------------===// // CountLeadingZerosOp //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/X86Vector/X86Vector.td b/mlir/include/mlir/Dialect/X86Vector/X86Vector.td index 38c217f..468242d 100644 --- a/mlir/include/mlir/Dialect/X86Vector/X86Vector.td +++ b/mlir/include/mlir/Dialect/X86Vector/X86Vector.td @@ -468,11 +468,6 @@ def DotInt8Op : AVX_Op<"dot.i8", [Pure, intr += "." 
+ std::to_string(opBitWidth); return intr; } - - SmallVector<Value> getIntrinsicOperands( - ::mlir::ArrayRef<Value> operands, - const ::mlir::LLVMTypeConverter &typeConverter, - ::mlir::RewriterBase &rewriter); }]; } diff --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp index a73afbc..2285d26 100644 --- a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp +++ b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp @@ -20,20 +20,20 @@ using namespace mlir; -LLVM::LLVMFuncOp mlir::getOrDefineFunction(gpu::GPUModuleOp moduleOp, - Location loc, OpBuilder &b, - StringRef name, +LLVM::LLVMFuncOp mlir::getOrDefineFunction(Operation *moduleOp, Location loc, + OpBuilder &b, StringRef name, LLVM::LLVMFunctionType type) { - LLVM::LLVMFuncOp ret; - if (!(ret = moduleOp.template lookupSymbol<LLVM::LLVMFuncOp>(name))) { - OpBuilder::InsertionGuard guard(b); - b.setInsertionPointToStart(moduleOp.getBody()); - ret = LLVM::LLVMFuncOp::create(b, loc, name, type, LLVM::Linkage::External); - } - return ret; + auto existing = dyn_cast_or_null<LLVM::LLVMFuncOp>( + SymbolTable::lookupSymbolIn(moduleOp, name)); + if (existing) + return existing; + + OpBuilder::InsertionGuard guard(b); + b.setInsertionPointToStart(&moduleOp->getRegion(0).front()); + return LLVM::LLVMFuncOp::create(b, loc, name, type, LLVM::Linkage::External); } -static SmallString<16> getUniqueSymbolName(gpu::GPUModuleOp moduleOp, +static SmallString<16> getUniqueSymbolName(Operation *moduleOp, StringRef prefix) { // Get a unique global name. unsigned stringNumber = 0; @@ -41,15 +41,16 @@ static SmallString<16> getUniqueSymbolName(gpu::GPUModuleOp moduleOp, do { stringConstName.clear(); (prefix + Twine(stringNumber++)).toStringRef(stringConstName); - } while (moduleOp.lookupSymbol(stringConstName)); + } while (SymbolTable::lookupSymbolIn(moduleOp, stringConstName)); return stringConstName; } -LLVM::GlobalOp -mlir::getOrCreateStringConstant(OpBuilder &b, Location loc, - gpu::GPUModuleOp moduleOp, Type llvmI8, - StringRef namePrefix, StringRef str, - uint64_t alignment, unsigned addrSpace) { +LLVM::GlobalOp mlir::getOrCreateStringConstant(OpBuilder &b, Location loc, + Operation *moduleOp, Type llvmI8, + StringRef namePrefix, + StringRef str, + uint64_t alignment, + unsigned addrSpace) { llvm::SmallString<20> nullTermStr(str); nullTermStr.push_back('\0'); // Null terminate for C auto globalType = @@ -57,7 +58,7 @@ mlir::getOrCreateStringConstant(OpBuilder &b, Location loc, StringAttr attr = b.getStringAttr(nullTermStr); // Try to find existing global. - for (auto globalOp : moduleOp.getOps<LLVM::GlobalOp>()) + for (auto globalOp : moduleOp->getRegion(0).getOps<LLVM::GlobalOp>()) if (globalOp.getGlobalType() == globalType && globalOp.getConstant() && globalOp.getValueAttr() == attr && globalOp.getAlignment().value_or(0) == alignment && @@ -66,7 +67,7 @@ mlir::getOrCreateStringConstant(OpBuilder &b, Location loc, // Not found: create new global. 
OpBuilder::InsertionGuard guard(b); - b.setInsertionPointToStart(moduleOp.getBody()); + b.setInsertionPointToStart(&moduleOp->getRegion(0).front()); SmallString<16> name = getUniqueSymbolName(moduleOp, namePrefix); return LLVM::GlobalOp::create(b, loc, globalType, /*isConstant=*/true, LLVM::Linkage::Internal, @@ -396,10 +397,11 @@ LogicalResult GPUPrintfOpToHIPLowering::matchAndRewrite( auto ptrType = LLVM::LLVMPointerType::get(rewriter.getContext()); mlir::Type llvmI32 = typeConverter->convertType(rewriter.getI32Type()); mlir::Type llvmI64 = typeConverter->convertType(rewriter.getI64Type()); - // Note: this is the GPUModule op, not the ModuleOp that surrounds it - // This ensures that global constants and declarations are placed within - // the device code, not the host code - auto moduleOp = gpuPrintfOp->getParentOfType<gpu::GPUModuleOp>(); + + Operation *moduleOp = gpuPrintfOp->getParentWithTrait<OpTrait::SymbolTable>(); + if (!moduleOp) + return rewriter.notifyMatchFailure(gpuPrintfOp, + "Couldn't find a parent module"); auto ocklBegin = getOrDefineFunction(moduleOp, loc, rewriter, "__ockl_printf_begin", @@ -496,10 +498,10 @@ LogicalResult GPUPrintfOpToLLVMCallLowering::matchAndRewrite( mlir::Type ptrType = LLVM::LLVMPointerType::get(rewriter.getContext(), addressSpace); - // Note: this is the GPUModule op, not the ModuleOp that surrounds it - // This ensures that global constants and declarations are placed within - // the device code, not the host code - auto moduleOp = gpuPrintfOp->getParentOfType<gpu::GPUModuleOp>(); + Operation *moduleOp = gpuPrintfOp->getParentWithTrait<OpTrait::SymbolTable>(); + if (!moduleOp) + return rewriter.notifyMatchFailure(gpuPrintfOp, + "Couldn't find a parent module"); auto printfType = LLVM::LLVMFunctionType::get(rewriter.getI32Type(), {ptrType}, @@ -541,10 +543,10 @@ LogicalResult GPUPrintfOpToVPrintfLowering::matchAndRewrite( mlir::Type llvmI8 = typeConverter->convertType(rewriter.getIntegerType(8)); mlir::Type ptrType = LLVM::LLVMPointerType::get(rewriter.getContext()); - // Note: this is the GPUModule op, not the ModuleOp that surrounds it - // This ensures that global constants and declarations are placed within - // the device code, not the host code - auto moduleOp = gpuPrintfOp->getParentOfType<gpu::GPUModuleOp>(); + Operation *moduleOp = gpuPrintfOp->getParentWithTrait<OpTrait::SymbolTable>(); + if (!moduleOp) + return rewriter.notifyMatchFailure(gpuPrintfOp, + "Couldn't find a parent module"); // Create a valid global location removing any metadata attached to the // location as debug info metadata inside of a function cannot be used outside diff --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h index e17b063..66d3bb4 100644 --- a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h +++ b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h @@ -18,15 +18,18 @@ namespace mlir { // Helper Functions //===----------------------------------------------------------------------===// +/// Note that these functions don't take a `SymbolTable` because GPU module +/// lowerings can have name collisions as an intermediate state. + /// Find or create an external function declaration in the given module. -LLVM::LLVMFuncOp getOrDefineFunction(gpu::GPUModuleOp moduleOp, Location loc, +LLVM::LLVMFuncOp getOrDefineFunction(Operation *moduleOp, Location loc, OpBuilder &b, StringRef name, LLVM::LLVMFunctionType type); /// Create a global that contains the given string. 
If a global with the same /// string already exists in the module, return that global. LLVM::GlobalOp getOrCreateStringConstant(OpBuilder &b, Location loc, - gpu::GPUModuleOp moduleOp, Type llvmI8, + Operation *moduleOp, Type llvmI8, StringRef namePrefix, StringRef str, uint64_t alignment = 0, unsigned addrSpace = 0); diff --git a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp index a95263b..852c50c 100644 --- a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp +++ b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp @@ -436,7 +436,7 @@ void mlir::configureGpuToNVVMConversionLegality(ConversionTarget &target) { LLVM::FAbsOp, LLVM::FCeilOp, LLVM::FFloorOp, LLVM::FRemOp, LLVM::LogOp, LLVM::Log10Op, LLVM::Log2Op, LLVM::PowOp, LLVM::RoundEvenOp, LLVM::RoundOp, LLVM::SinOp, - LLVM::SqrtOp>(); + LLVM::SincosOp, LLVM::SqrtOp>(); // TODO: Remove once we support replacing non-root ops. target.addLegalOp<gpu::YieldOp, gpu::GPUModuleOp>(); @@ -466,6 +466,100 @@ void mlir::configureGpuToNVVMTypeConverter(LLVMTypeConverter &converter) { }); } +struct SincosOpLowering : public ConvertOpToLLVMPattern<math::SincosOp> { + using ConvertOpToLLVMPattern<math::SincosOp>::ConvertOpToLLVMPattern; + + LogicalResult + matchAndRewrite(math::SincosOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + Location loc = op.getLoc(); + Value input = adaptor.getOperand(); + Type inputType = input.getType(); + auto convertedInput = maybeExt(input, rewriter); + auto computeType = convertedInput.getType(); + + StringRef sincosFunc; + if (isa<Float32Type>(computeType)) { + const arith::FastMathFlags flag = op.getFastmath(); + const bool useApprox = + mlir::arith::bitEnumContainsAny(flag, arith::FastMathFlags::afn); + sincosFunc = useApprox ? 
"__nv_fast_sincosf" : "__nv_sincosf"; + } else if (isa<Float64Type>(computeType)) { + sincosFunc = "__nv_sincos"; + } else { + return rewriter.notifyMatchFailure(op, + "unsupported operand type for sincos"); + } + + auto ptrType = LLVM::LLVMPointerType::get(rewriter.getContext()); + + Value sinPtr, cosPtr; + { + OpBuilder::InsertionGuard guard(rewriter); + auto *scope = + op->getParentWithTrait<mlir::OpTrait::AutomaticAllocationScope>(); + assert(scope && "Expected op to be inside automatic allocation scope"); + rewriter.setInsertionPointToStart(&scope->getRegion(0).front()); + auto one = rewriter.create<LLVM::ConstantOp>( + loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(1)); + sinPtr = + rewriter.create<LLVM::AllocaOp>(loc, ptrType, computeType, one, 0); + cosPtr = + rewriter.create<LLVM::AllocaOp>(loc, ptrType, computeType, one, 0); + } + + createSincosCall(rewriter, loc, sincosFunc, convertedInput, sinPtr, cosPtr, + op); + + auto sinResult = rewriter.create<LLVM::LoadOp>(loc, computeType, sinPtr); + auto cosResult = rewriter.create<LLVM::LoadOp>(loc, computeType, cosPtr); + + rewriter.replaceOp(op, {maybeTrunc(sinResult, inputType, rewriter), + maybeTrunc(cosResult, inputType, rewriter)}); + return success(); + } + +private: + Value maybeExt(Value operand, PatternRewriter &rewriter) const { + if (isa<Float16Type, BFloat16Type>(operand.getType())) + return rewriter.create<LLVM::FPExtOp>( + operand.getLoc(), Float32Type::get(rewriter.getContext()), operand); + return operand; + } + + Value maybeTrunc(Value operand, Type type, PatternRewriter &rewriter) const { + if (operand.getType() != type) + return rewriter.create<LLVM::FPTruncOp>(operand.getLoc(), type, operand); + return operand; + } + + void createSincosCall(ConversionPatternRewriter &rewriter, Location loc, + StringRef funcName, Value input, Value sinPtr, + Value cosPtr, Operation *op) const { + auto voidType = LLVM::LLVMVoidType::get(rewriter.getContext()); + auto ptrType = sinPtr.getType(); + + SmallVector<Type> operandTypes = {input.getType(), ptrType, ptrType}; + auto funcType = LLVM::LLVMFunctionType::get(voidType, operandTypes); + + auto funcAttr = StringAttr::get(op->getContext(), funcName); + auto funcOp = + SymbolTable::lookupNearestSymbolFrom<LLVM::LLVMFuncOp>(op, funcAttr); + + if (!funcOp) { + auto parentFunc = op->getParentOfType<FunctionOpInterface>(); + assert(parentFunc && "expected there to be a parent function"); + OpBuilder b(parentFunc); + + auto globalloc = loc->findInstanceOfOrUnknown<FileLineColLoc>(); + funcOp = LLVM::LLVMFuncOp::create(b, globalloc, funcName, funcType); + } + + SmallVector<Value> callOperands = {input, sinPtr, cosPtr}; + rewriter.create<LLVM::CallOp>(loc, funcOp, callOperands); + } +}; + template <typename OpTy> static void populateOpPatterns(const LLVMTypeConverter &converter, RewritePatternSet &patterns, @@ -589,6 +683,9 @@ void mlir::populateLibDeviceConversionPatterns( "__nv_tan", "__nv_fast_tanf"); populateOpPatterns<math::TanhOp>(converter, patterns, benefit, "__nv_tanhf", "__nv_tanh"); + + // Custom pattern for sincos since it returns two values + patterns.add<SincosOpLowering>(converter, benefit); } void mlir::populateGpuToNVVMConversionPatterns( diff --git a/mlir/lib/Conversion/MathToLLVM/MathToLLVM.cpp b/mlir/lib/Conversion/MathToLLVM/MathToLLVM.cpp index 853f454..229e40e 100644 --- a/mlir/lib/Conversion/MathToLLVM/MathToLLVM.cpp +++ b/mlir/lib/Conversion/MathToLLVM/MathToLLVM.cpp @@ -121,6 +121,38 @@ using CountTrailingZerosOpLowering = LLVM::CountTrailingZerosOp>; using 
AbsIOpLowering = IntOpWithFlagLowering<math::AbsIOp, LLVM::AbsOp>; +// A `sincos` is converted into `llvm.intr.sincos` followed by extractvalue ops. +struct SincosOpLowering : public ConvertOpToLLVMPattern<math::SincosOp> { + using ConvertOpToLLVMPattern<math::SincosOp>::ConvertOpToLLVMPattern; + + LogicalResult + matchAndRewrite(math::SincosOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + const LLVMTypeConverter &typeConverter = *this->getTypeConverter(); + mlir::Location loc = op.getLoc(); + mlir::Type operandType = adaptor.getOperand().getType(); + mlir::Type llvmOperandType = typeConverter.convertType(operandType); + mlir::Type sinType = typeConverter.convertType(op.getSin().getType()); + mlir::Type cosType = typeConverter.convertType(op.getCos().getType()); + if (!llvmOperandType || !sinType || !cosType) + return failure(); + + ConvertFastMath<math::SincosOp, LLVM::SincosOp> attrs(op); + + auto structType = LLVM::LLVMStructType::getLiteral( + rewriter.getContext(), {llvmOperandType, llvmOperandType}); + + auto sincosOp = rewriter.create<LLVM::SincosOp>( + loc, structType, adaptor.getOperand(), attrs.getAttrs()); + + auto sinValue = LLVM::ExtractValueOp::create(rewriter, loc, sincosOp, 0); + auto cosValue = LLVM::ExtractValueOp::create(rewriter, loc, sincosOp, 1); + + rewriter.replaceOp(op, {sinValue, cosValue}); + return success(); + } +}; + // A `expm1` is converted into `exp - 1`. struct ExpM1OpLowering : public ConvertOpToLLVMPattern<math::ExpM1Op> { using ConvertOpToLLVMPattern<math::ExpM1Op>::ConvertOpToLLVMPattern; @@ -393,6 +425,7 @@ void mlir::populateMathToLLVMConversionPatterns( RoundEvenOpLowering, RoundOpLowering, RsqrtOpLowering, + SincosOpLowering, SinOpLowering, SinhOpLowering, ASinOpLowering, diff --git a/mlir/lib/Conversion/MemRefToSPIRV/MemRefToSPIRV.cpp b/mlir/lib/Conversion/MemRefToSPIRV/MemRefToSPIRV.cpp index f44552c..a90dcc8 100644 --- a/mlir/lib/Conversion/MemRefToSPIRV/MemRefToSPIRV.cpp +++ b/mlir/lib/Conversion/MemRefToSPIRV/MemRefToSPIRV.cpp @@ -699,6 +699,35 @@ LoadOpPattern::matchAndRewrite(memref::LoadOp loadOp, OpAdaptor adaptor, return success(); } +template <typename OpAdaptor> +static FailureOr<SmallVector<Value>> +extractLoadCoordsForComposite(memref::LoadOp loadOp, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) { + // At present we only support linear "tiling" as specified in Vulkan; this + // means that texels are assumed to be laid out in memory in a row-major + // order. This allows us to support any memref layout that is a permutation of + // the dimensions. Future work will pass an optional image layout to the + // rewrite pattern so that we can support optimized target-specific tilings. + SmallVector<Value> indices = adaptor.getIndices(); + AffineMap map = loadOp.getMemRefType().getLayout().getAffineMap(); + if (!map.isPermutation()) + return rewriter.notifyMatchFailure( + loadOp, + "Cannot lower memrefs with memory layout which is not a permutation"); + + // The memref's layout determines the dimension ordering so we need to follow + // the map to get the ordering of the dimensions/indices. + const unsigned dimCount = map.getNumDims(); + SmallVector<Value, 3> coords(dimCount); + for (unsigned dim = 0; dim < dimCount; ++dim) + coords[map.getDimPosition(dim)] = indices[dim]; + + // We need to reverse the coordinates because the memref layout is slowest to + // fastest moving and the vector coordinates for the image op are fastest to + // slowest moving.
+ return llvm::to_vector(llvm::reverse(coords)); +} + LogicalResult ImageLoadOpPattern::matchAndRewrite(memref::LoadOp loadOp, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const { @@ -755,13 +784,17 @@ ImageLoadOpPattern::matchAndRewrite(memref::LoadOp loadOp, OpAdaptor adaptor, // Build a vector of coordinates or just a scalar index if we have a 1D image. Value coords; - if (memrefType.getRank() != 1) { + if (memrefType.getRank() == 1) { + coords = adaptor.getIndices()[0]; + } else { + FailureOr<SmallVector<Value>> maybeCoords = + extractLoadCoordsForComposite(loadOp, adaptor, rewriter); + if (failed(maybeCoords)) + return failure(); auto coordVectorType = VectorType::get({loadOp.getMemRefType().getRank()}, adaptor.getIndices().getType()[0]); coords = spirv::CompositeConstructOp::create(rewriter, loc, coordVectorType, - adaptor.getIndices()); - } else { - coords = adaptor.getIndices()[0]; + maybeCoords.value()); } // Fetch the value out of the image. diff --git a/mlir/lib/Dialect/Arith/Transforms/ReifyValueBounds.cpp b/mlir/lib/Dialect/Arith/Transforms/ReifyValueBounds.cpp index 4bdd1e6..127563c 100644 --- a/mlir/lib/Dialect/Arith/Transforms/ReifyValueBounds.cpp +++ b/mlir/lib/Dialect/Arith/Transforms/ReifyValueBounds.cpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===// +#include <utility> + #include "mlir/Dialect/Arith/Transforms/Transforms.h" #include "mlir/Dialect/Arith/IR/Arith.h" @@ -69,7 +71,8 @@ FailureOr<OpFoldResult> mlir::arith::reifyValueBound( AffineMap boundMap; ValueDimList mapOperands; if (failed(ValueBoundsConstraintSet::computeBound( - boundMap, mapOperands, type, var, stopCondition, closedUB))) + boundMap, mapOperands, type, var, std::move(stopCondition), + closedUB))) return failure(); // Materialize tensor.dim/memref.dim ops. 
@@ -116,7 +119,7 @@ FailureOr<OpFoldResult> mlir::arith::reifyValueBound( FailureOr<OpFoldResult> mlir::arith::reifyShapedValueDimBound( OpBuilder &b, Location loc, presburger::BoundType type, Value value, - int64_t dim, ValueBoundsConstraintSet::StopConditionFn stopCondition, + int64_t dim, const ValueBoundsConstraintSet::StopConditionFn &stopCondition, bool closedUB) { auto reifyToOperands = [&](Value v, std::optional<int64_t> d, ValueBoundsConstraintSet &cstr) { @@ -134,7 +137,8 @@ FailureOr<OpFoldResult> mlir::arith::reifyShapedValueDimBound( FailureOr<OpFoldResult> mlir::arith::reifyIndexValueBound( OpBuilder &b, Location loc, presburger::BoundType type, Value value, - ValueBoundsConstraintSet::StopConditionFn stopCondition, bool closedUB) { + const ValueBoundsConstraintSet::StopConditionFn &stopCondition, + bool closedUB) { auto reifyToOperands = [&](Value v, std::optional<int64_t> d, ValueBoundsConstraintSet &cstr) { return v != value; diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp index 6ee2d86..3f0b0ba 100644 --- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp +++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp @@ -2799,7 +2799,7 @@ SplitOp::apply(transform::TransformRewriter &rewriter, } opList.append(first); - if (second.size()) + if (!second.empty()) opList.append(second); } results.set(cast<OpResult>(getSplitList()), opList); diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp index 15c467b..4919d9a 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp @@ -287,7 +287,7 @@ private: /// moment we only make sure that there are no broadcast dimensions, but this /// might change if indexing maps evolve. bool isValidMaskingMap(AffineMap maskingMap) { - return maskingMap.getBroadcastDims().size() == 0; + return maskingMap.getBroadcastDims().empty(); } /// Turn the input indexing map into a valid masking map. 
@@ -923,7 +923,7 @@ static uint64_t getTrailingNonUnitLoopDimIdx(LinalgOp linalgOp) { llvm::count_if(loopRanges, [](int64_t dim) { return dim != 1; }) == 1) && "For statically shaped Linalg Ops, only one " "non-unit loop dim is expected"); - assert(loopRanges.size() != 0 && "Empty loops, nothing to analyse."); + assert(!loopRanges.empty() && "Empty loops, nothing to analyse."); size_t idx = loopRanges.size() - 1; for (; idx != 0; idx--) diff --git a/mlir/lib/Dialect/Math/IR/MathOps.cpp b/mlir/lib/Dialect/Math/IR/MathOps.cpp index a21631c..bbeef0f 100644 --- a/mlir/lib/Dialect/Math/IR/MathOps.cpp +++ b/mlir/lib/Dialect/Math/IR/MathOps.cpp @@ -285,6 +285,16 @@ OpFoldResult math::SinhOp::fold(FoldAdaptor adaptor) { } //===----------------------------------------------------------------------===// +// SinCosOp getShapeForUnroll +//===----------------------------------------------------------------------===// + +std::optional<SmallVector<int64_t, 4>> math::SincosOp::getShapeForUnroll() { + if (auto vt = mlir::dyn_cast<VectorType>(getOperand().getType())) + return llvm::to_vector<4>(vt.getShape()); + return std::nullopt; +} + +//===----------------------------------------------------------------------===// // CountLeadingZerosOp folder //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/X86Vector/IR/X86VectorDialect.cpp b/mlir/lib/Dialect/X86Vector/IR/X86VectorDialect.cpp index 0fa353a..ef35c39 100644 --- a/mlir/lib/Dialect/X86Vector/IR/X86VectorDialect.cpp +++ b/mlir/lib/Dialect/X86Vector/IR/X86VectorDialect.cpp @@ -83,29 +83,6 @@ x86vector::DotOp::getIntrinsicOperands(ArrayRef<Value> operands, return intrinsicOperands; } -SmallVector<Value> x86vector::DotInt8Op::getIntrinsicOperands( - ArrayRef<Value> operands, const LLVMTypeConverter &typeConverter, - RewriterBase &rewriter) { - SmallVector<Value> intrinsicOprnds; - Adaptor adaptor(operands, *this); - intrinsicOprnds.push_back(adaptor.getW()); - // Bitcast `a` and `b` to i32 - Value bitcast_a = LLVM::BitcastOp::create( - rewriter, getLoc(), - VectorType::get((getA().getType().getShape()[0] / 4), - rewriter.getIntegerType(32)), - adaptor.getA()); - intrinsicOprnds.push_back(bitcast_a); - Value bitcast_b = LLVM::BitcastOp::create( - rewriter, getLoc(), - VectorType::get((getB().getType().getShape()[0] / 4), - rewriter.getIntegerType(32)), - adaptor.getB()); - intrinsicOprnds.push_back(bitcast_b); - - return intrinsicOprnds; -} - SmallVector<Value> x86vector::BcstToPackedF32Op::getIntrinsicOperands( ArrayRef<Value> operands, const LLVMTypeConverter &typeConverter, RewriterBase &rewriter) { diff --git a/mlir/lib/ExecutionEngine/LevelZeroRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/LevelZeroRuntimeWrappers.cpp index 21eaf28..d072827 100644 --- a/mlir/lib/ExecutionEngine/LevelZeroRuntimeWrappers.cpp +++ b/mlir/lib/ExecutionEngine/LevelZeroRuntimeWrappers.cpp @@ -328,12 +328,12 @@ struct DynamicEventPool { } }; -L0RTContextWrapper &getRtContext() { +static L0RTContextWrapper &getRtContext() { thread_local static L0RTContextWrapper rtContext(0); return rtContext; } -DynamicEventPool &getDynamicEventPool() { +static DynamicEventPool &getDynamicEventPool() { thread_local static DynamicEventPool dynEventPool{&getRtContext()}; return dynEventPool; } @@ -492,8 +492,8 @@ extern "C" void mgpuMemcpy(void *dst, void *src, size_t sizeBytes, } template <typename PATTERN_TYPE> -void mgpuMemset(void *dst, PATTERN_TYPE value, size_t count, - StreamWrapper *stream) { +static void mgpuMemset(void *dst, 
PATTERN_TYPE value, size_t count, + StreamWrapper *stream) { L0RTContextWrapper &rtContext = getRtContext(); auto listType = rtContext.copyEngineMaxMemoryFillPatternSize >= sizeof(PATTERN_TYPE) diff --git a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir index ef06af3..a4b5dde 100644 --- a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir +++ b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir @@ -1109,3 +1109,42 @@ gpu.module @test_module_55 { func.return %result32, %result64 : f32, f64 } } + +gpu.module @test_module_56 { + // CHECK: gpu.module @test_module_56 + + // CHECK-DAG: llvm.func @__nv_sincosf(f32, !llvm.ptr, !llvm.ptr) + // CHECK-DAG: llvm.func @__nv_sincos(f64, !llvm.ptr, !llvm.ptr) + + // CHECK-LABEL: func @gpu_sincos + // CHECK-SAME: %[[ARG_f16:.*]]: f16, %[[ARG_f32:.*]]: f32, %[[ARG_f64:.*]]: f64 + func.func @gpu_sincos(%arg_f16 : f16, %arg_f32 : f32, %arg_f64 : f64) -> (f16, f16, f32, f32, f64, f64) { + // CHECK-COUNT-6: llvm.alloca + // CHECK: %[[ARG_f16_ext:.*]] = llvm.fpext %[[ARG_f16]] : f16 to f32 + // CHECK: llvm.call @__nv_sincosf(%[[ARG_f16_ext]], %{{.+}}, %{{.+}}) : (f32, !llvm.ptr, !llvm.ptr) -> () + // CHECK-COUNT-2: llvm.fptrunc + // CHECK: llvm.call @__nv_sincosf(%[[ARG_f32]], %{{.+}}, %{{.+}}) : (f32, !llvm.ptr, !llvm.ptr) -> () + // CHECK: llvm.call @__nv_sincos(%[[ARG_f64]], %{{.+}}, %{{.+}}) : (f64, !llvm.ptr, !llvm.ptr) -> () + %sin16, %cos16 = math.sincos %arg_f16 : f16 + %sin32, %cos32 = math.sincos %arg_f32 : f32 + %sin64, %cos64 = math.sincos %arg_f64 : f64 + func.return %sin16, %cos16, %sin32, %cos32, %sin64, %cos64 : f16, f16, f32, f32, f64, f64 + } + + // CHECK: llvm.func @__nv_fast_sincosf(f32, !llvm.ptr, !llvm.ptr) + + // CHECK-LABEL: func @gpu_sincos_fastmath + // CHECK-SAME: %[[ARG_f16:.*]]: f16, %[[ARG_f32:.*]]: f32, %[[ARG_f64:.*]]: f64 + func.func @gpu_sincos_fastmath(%arg_f16 : f16, %arg_f32 : f32, %arg_f64 : f64) -> (f16, f16, f32, f32, f64, f64) { + // CHECK-COUNT-6: llvm.alloca + // CHECK: %[[ARG_f16_ext:.*]] = llvm.fpext %[[ARG_f16]] : f16 to f32 + // CHECK: llvm.call @__nv_fast_sincosf(%[[ARG_f16_ext]], %{{.+}}, %{{.+}}) : (f32, !llvm.ptr, !llvm.ptr) -> () + // CHECK-COUNT-2: llvm.fptrunc + // CHECK: llvm.call @__nv_fast_sincosf(%[[ARG_f32]], %{{.+}}, %{{.+}}) : (f32, !llvm.ptr, !llvm.ptr) -> () + // CHECK: llvm.call @__nv_sincos(%[[ARG_f64]], %{{.+}}, %{{.+}}) : (f64, !llvm.ptr, !llvm.ptr) -> () + %sin16, %cos16 = math.sincos %arg_f16 fastmath<afn> : f16 + %sin32, %cos32 = math.sincos %arg_f32 fastmath<afn> : f32 + %sin64, %cos64 = math.sincos %arg_f64 fastmath<afn> : f64 + func.return %sin16, %cos16, %sin32, %cos32, %sin64, %cos64 : f16, f16, f32, f32, f64, f64 + } +} diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-hip.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-hip.mlir index 2dc6a5a..32da312 100644 --- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-hip.mlir +++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-hip.mlir @@ -1,5 +1,6 @@ // RUN: mlir-opt %s -convert-gpu-to-rocdl='runtime=HIP' -split-input-file | FileCheck %s +// CHECK-LABEL: gpu.module @test_module gpu.module @test_module { // CHECK-DAG: llvm.mlir.global internal constant @[[$PRINT_GLOBAL0:[A-Za-z0-9_]+]]("Hello, world\0A\00") // CHECK-DAG: llvm.mlir.global internal constant @[[$PRINT_GLOBAL1:[A-Za-z0-9_]+]]("Hello: %d\0A\00") @@ -40,3 +41,38 @@ gpu.module @test_module { gpu.return } } + +// ----- + +// The builtin.module we're targeting is wrapped in a fake gpu.module +// because the convert-gpu-to-rocdl
pass only runs on `gpu.module` ops, even though the printf patterns could run in other contexts. + +// CHECK-LABEL: gpu.module @fake_gpu_module_for_test +// CHECK-LABEL: builtin.module @test_module +gpu.module @fake_gpu_module_for_test { +builtin.module @test_module { + // CHECK-DAG: llvm.mlir.global internal constant @[[$PRINT_GLOBAL1:[A-Za-z0-9_]+]]("Hello: %d\0A\00") + // CHECK-DAG: llvm.func @__ockl_printf_append_args(i64, i32, i64, i64, i64, i64, i64, i64, i64, i32) -> i64 + // CHECK-DAG: llvm.func @__ockl_printf_append_string_n(i64, !llvm.ptr, i64, i32) -> i64 + // CHECK-DAG: llvm.func @__ockl_printf_begin(i64) -> i64 + + // CHECK-LABEL: llvm.func @test_printf + // CHECK: (%[[ARG0:.*]]: i32) + llvm.func @test_printf(%arg0: i32) { + // CHECK: %[[CST0:.*]] = llvm.mlir.constant(0 : i64) : i64 + // CHECK-NEXT: %[[DESC0:.*]] = llvm.call @__ockl_printf_begin(%0) : (i64) -> i64 + // CHECK-NEXT: %[[FORMATSTR:.*]] = llvm.mlir.addressof @[[$PRINT_GLOBAL1]] : !llvm.ptr + // CHECK-NEXT: %[[FORMATSTART:.*]] = llvm.getelementptr %[[FORMATSTR]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<11 x i8> + // CHECK-NEXT: %[[FORMATLEN:.*]] = llvm.mlir.constant(11 : i64) : i64 + // CHECK-NEXT: %[[ISLAST:.*]] = llvm.mlir.constant(1 : i32) : i32 + // CHECK-NEXT: %[[ISNTLAST:.*]] = llvm.mlir.constant(0 : i32) : i32 + // CHECK-NEXT: %[[DESC1:.*]] = llvm.call @__ockl_printf_append_string_n(%[[DESC0]], %[[FORMATSTART]], %[[FORMATLEN]], %[[ISNTLAST]]) : (i64, !llvm.ptr, i64, i32) -> i64 + // CHECK-NEXT: %[[NARGS1:.*]] = llvm.mlir.constant(1 : i32) : i32 + // CHECK-NEXT: %[[ARG0_64:.*]] = llvm.zext %[[ARG0]] : i32 to i64 + // CHECK-NEXT: %{{.*}} = llvm.call @__ockl_printf_append_args(%[[DESC1]], %[[NARGS1]], %[[ARG0_64]], %[[CST0]], %[[CST0]], %[[CST0]], %[[CST0]], %[[CST0]], %[[CST0]], %[[ISLAST]]) : (i64, i32, i64, i64, i64, i64, i64, i64, i64, i32) -> i64 + gpu.printf "Hello: %d\n", %arg0 : i32 + llvm.return + } +} +} diff --git a/mlir/test/Conversion/MathToLLVM/math-to-llvm.mlir b/mlir/test/Conversion/MathToLLVM/math-to-llvm.mlir index f454122..f7d2712 100644 --- a/mlir/test/Conversion/MathToLLVM/math-to-llvm.mlir +++ b/mlir/test/Conversion/MathToLLVM/math-to-llvm.mlir @@ -230,6 +230,16 @@ func.func @trigonometrics(%arg0: f32) { // ----- +// CHECK-LABEL: func @sincos +// CHECK-SAME: [[ARG0:%.+]]: f32 +func.func @sincos(%arg0: f32) { + // CHECK: llvm.intr.sincos([[ARG0]]) : (f32) -> !llvm.struct<(f32, f32)> + %0:2 = math.sincos %arg0 : f32 + func.return +} + +// ----- + // CHECK-LABEL: func @inverse_trigonometrics // CHECK-SAME: [[ARG0:%.+]]: f32 func.func @inverse_trigonometrics(%arg0: f32) { diff --git a/mlir/test/Conversion/MemRefToSPIRV/memref-to-spirv.mlir b/mlir/test/Conversion/MemRefToSPIRV/memref-to-spirv.mlir index e6321e9..ab3c8b7 100644 --- a/mlir/test/Conversion/MemRefToSPIRV/memref-to-spirv.mlir +++ b/mlir/test/Conversion/MemRefToSPIRV/memref-to-spirv.mlir @@ -515,6 +515,12 @@ module attributes { // Check Image Support.
+// CHECK: #[[$COLMAJMAP:.*]] = affine_map<(d0, d1) -> (d1, d0)> +#col_major = affine_map<(d0, d1) -> (d1, d0)> +// CHECK: #[[$CUSTOMLAYOUTMAP:.*]] = affine_map<(d0, d1, d2) -> (d2, d1, d0)> +#custom = affine_map<(d0, d1, d2) -> (d2, d1, d0)> +// CHECK: #[[$NONPERMMAP:.*]] = affine_map<(d0, d1) -> (d0, d1 mod 2)> +#non_permutation = affine_map<(d0, d1) -> (d0, d1 mod 2)> module attributes { spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [ Shader, @@ -534,8 +540,8 @@ module attributes { // CHECK-LABEL: @load_from_image_1D( // CHECK-SAME: %[[ARG0:.*]]: memref<1xf32, #spirv.storage_class<Image>>, %[[ARG1:.*]]: memref<1xf32, #spirv.storage_class<StorageBuffer>> func.func @load_from_image_1D(%arg0: memref<1xf32, #spirv.storage_class<Image>>, %arg1: memref<1xf32, #spirv.storage_class<StorageBuffer>>) { -// CHECK-DAG: %[[SB:.*]] = builtin.unrealized_conversion_cast %arg1 : memref<1xf32, #spirv.storage_class<StorageBuffer>> to !spirv.ptr<!spirv.struct<(!spirv.array<1 x f32, stride=4> [0])>, StorageBuffer> -// CHECK-DAG: %[[IMAGE_PTR:.*]] = builtin.unrealized_conversion_cast %arg0 : memref<1xf32, #spirv.storage_class<Image>> to !spirv.ptr<!spirv.sampled_image<!spirv.image<f32, Dim1D, DepthUnknown, NonArrayed, SingleSampled, NeedSampler, R32f>>, UniformConstant> +// CHECK-DAG: %[[SB:.*]] = builtin.unrealized_conversion_cast %[[ARG1]] : memref<1xf32, #spirv.storage_class<StorageBuffer>> to !spirv.ptr<!spirv.struct<(!spirv.array<1 x f32, stride=4> [0])>, StorageBuffer> +// CHECK-DAG: %[[IMAGE_PTR:.*]] = builtin.unrealized_conversion_cast %[[ARG0]] : memref<1xf32, #spirv.storage_class<Image>> to !spirv.ptr<!spirv.sampled_image<!spirv.image<f32, Dim1D, DepthUnknown, NonArrayed, SingleSampled, NeedSampler, R32f>>, UniformConstant> %cst = arith.constant 0 : index // CHECK: %[[COORDS:.*]] = builtin.unrealized_conversion_cast %{{.*}} : index to i32 // CHECK: %[[SIMAGE:.*]] = spirv.Load "UniformConstant" %[[IMAGE_PTR]] : !spirv.sampled_image<!spirv.image<f32, Dim1D, DepthUnknown, NonArrayed, SingleSampled, NeedSampler, R32f>> @@ -550,121 +556,206 @@ module attributes { } // CHECK-LABEL: @load_from_image_2D( - // CHECK-SAME: %[[ARG0:.*]]: memref<1x1xf32, #spirv.storage_class<Image>>, %[[ARG1:.*]]: memref<1x1xf32, #spirv.storage_class<StorageBuffer>> - func.func @load_from_image_2D(%arg0: memref<1x1xf32, #spirv.storage_class<Image>>, %arg1: memref<1x1xf32, #spirv.storage_class<StorageBuffer>>) { -// CHECK-DAG: %[[SB:.*]] = builtin.unrealized_conversion_cast %arg1 : memref<1x1xf32, #spirv.storage_class<StorageBuffer>> to !spirv.ptr<!spirv.struct<(!spirv.array<1 x f32, stride=4> [0])>, StorageBuffer> -// CHECK-DAG: %[[IMAGE_PTR:.*]] = builtin.unrealized_conversion_cast %arg0 : memref<1x1xf32, #spirv.storage_class<Image>> to !spirv.ptr<!spirv.sampled_image<!spirv.image<f32, Dim2D, DepthUnknown, NonArrayed, SingleSampled, NeedSampler, R32f>>, UniformConstant> - %cst = arith.constant 0 : index + // CHECK-SAME: %[[ARG0:.*]]: memref<2x4xf32, #spirv.storage_class<Image>>, %[[ARG1:.*]]: memref<2x4xf32, #spirv.storage_class<StorageBuffer>> + func.func @load_from_image_2D(%arg0: memref<2x4xf32, #spirv.storage_class<Image>>, %arg1: memref<2x4xf32, #spirv.storage_class<StorageBuffer>>) { +// CHECK-DAG: %[[SB:.*]] = builtin.unrealized_conversion_cast %[[ARG1]] : memref<2x4xf32, #spirv.storage_class<StorageBuffer>> to !spirv.ptr<!spirv.struct<(!spirv.array<8 x f32, stride=4> [0])>, StorageBuffer> +// CHECK-DAG: %[[IMAGE_PTR:.*]] = builtin.unrealized_conversion_cast %[[ARG0]] : memref<2x4xf32, 
#spirv.storage_class<Image>> to !spirv.ptr<!spirv.sampled_image<!spirv.image<f32, Dim2D, DepthUnknown, NonArrayed, SingleSampled, NeedSampler, R32f>>, UniformConstant> + // CHECK: %[[X:.*]] = arith.constant 3 : index + // CHECK: %[[X32:.*]] = builtin.unrealized_conversion_cast %[[X]] : index to i32 + %x = arith.constant 3 : index + // CHECK: %[[Y:.*]] = arith.constant 1 : index + // CHECK: %[[Y32:.*]] = builtin.unrealized_conversion_cast %[[Y]] : index to i32 + %y = arith.constant 1 : index + // CHECK: %[[SIMAGE:.*]] = spirv.Load "UniformConstant" %[[IMAGE_PTR]] : !spirv.sampled_image<!spirv.image<f32, Dim2D, DepthUnknown, NonArrayed, SingleSampled, NeedSampler, R32f>> + // CHECK: %[[IMAGE:.*]] = spirv.Image %[[SIMAGE]] : !spirv.sampled_image<!spirv.image<f32, Dim2D, DepthUnknown, NonArrayed, SingleSampled, NeedSampler, R32f>> + // CHECK: %[[COORDS:.*]] = spirv.CompositeConstruct %[[X32]], %[[Y32]] : (i32, i32) -> vector<2xi32> + // CHECK: %[[RES_VEC:.*]] = spirv.ImageFetch %[[IMAGE]], %[[COORDS]] : !spirv.image<f32, Dim2D, DepthUnknown, NonArrayed, SingleSampled, NeedSampler, R32f>, vector<2xi32> -> vector<4xf32> + // CHECK: %[[RESULT:.*]] = spirv.CompositeExtract %[[RES_VEC]][0 : i32] : vector<4xf32> + %0 = memref.load %arg0[%y, %x] : memref<2x4xf32, #spirv.storage_class<Image>> + // CHECK: spirv.Store "StorageBuffer" %{{.*}}, %[[RESULT]] : f32 + memref.store %0, %arg1[%y, %x] : memref<2x4xf32, #spirv.storage_class<StorageBuffer>> + return + } + + // CHECK-LABEL: @load_from_col_major_image_2D( + // CHECK-SAME: %[[ARG0:.*]]: memref<2x4xf32, #[[$COLMAJMAP]], #spirv.storage_class<Image>>, %[[ARG1:.*]]: memref<2x4xf32, #spirv.storage_class<StorageBuffer>> + func.func @load_from_col_major_image_2D(%arg0: memref<2x4xf32, #col_major, #spirv.storage_class<Image>>, %arg1: memref<2x4xf32, #spirv.storage_class<StorageBuffer>>) { +// CHECK-DAG: %[[SB:.*]] = builtin.unrealized_conversion_cast %[[ARG1]] : memref<2x4xf32, #spirv.storage_class<StorageBuffer>> to !spirv.ptr<!spirv.struct<(!spirv.array<8 x f32, stride=4> [0])>, StorageBuffer> +// CHECK-DAG: %[[IMAGE_PTR:.*]] = builtin.unrealized_conversion_cast %[[ARG0]] : memref<2x4xf32, #[[$COLMAJMAP]], #spirv.storage_class<Image>> to !spirv.ptr<!spirv.sampled_image<!spirv.image<f32, Dim2D, DepthUnknown, NonArrayed, SingleSampled, NeedSampler, R32f>>, UniformConstant> + // CHECK: %[[X:.*]] = arith.constant 3 : index + // CHECK: %[[X32:.*]] = builtin.unrealized_conversion_cast %[[X]] : index to i32 + %x = arith.constant 3 : index + // CHECK: %[[Y:.*]] = arith.constant 1 : index + // CHECK: %[[Y32:.*]] = builtin.unrealized_conversion_cast %[[Y]] : index to i32 + %y = arith.constant 1 : index // CHECK: %[[SIMAGE:.*]] = spirv.Load "UniformConstant" %[[IMAGE_PTR]] : !spirv.sampled_image<!spirv.image<f32, Dim2D, DepthUnknown, NonArrayed, SingleSampled, NeedSampler, R32f>> // CHECK: %[[IMAGE:.*]] = spirv.Image %[[SIMAGE]] : !spirv.sampled_image<!spirv.image<f32, Dim2D, DepthUnknown, NonArrayed, SingleSampled, NeedSampler, R32f>> - // CHECK: %[[COORDS:.*]] = spirv.CompositeConstruct %{{.*}}, %{{.*}} : (i32, i32) -> vector<2xi32> + // CHECK: %[[COORDS:.*]] = spirv.CompositeConstruct %[[X32]], %[[Y32]] : (i32, i32) -> vector<2xi32> // CHECK: %[[RES_VEC:.*]] = spirv.ImageFetch %[[IMAGE]], %[[COORDS]] : !spirv.image<f32, Dim2D, DepthUnknown, NonArrayed, SingleSampled, NeedSampler, R32f>, vector<2xi32> -> vector<4xf32> // CHECK: %[[RESULT:.*]] = spirv.CompositeExtract %[[RES_VEC]][0 : i32] : vector<4xf32> - %0 = memref.load %arg0[%cst, %cst] : memref<1x1xf32, 
#spirv.storage_class<Image>> + %0 = memref.load %arg0[%x, %y] : memref<2x4xf32, #col_major, #spirv.storage_class<Image>> // CHECK: spirv.Store "StorageBuffer" %{{.*}}, %[[RESULT]] : f32 - memref.store %0, %arg1[%cst, %cst] : memref<1x1xf32, #spirv.storage_class<StorageBuffer>> + memref.store %0, %arg1[%y, %x] : memref<2x4xf32, #spirv.storage_class<StorageBuffer>> return } // CHECK-LABEL: @load_from_image_3D( - // CHECK-SAME: %[[ARG0:.*]]: memref<1x1x1xf32, #spirv.storage_class<Image>>, %[[ARG1:.*]]: memref<1x1x1xf32, #spirv.storage_class<StorageBuffer>> - func.func @load_from_image_3D(%arg0: memref<1x1x1xf32, #spirv.storage_class<Image>>, %arg1: memref<1x1x1xf32, #spirv.storage_class<StorageBuffer>>) { -// CHECK-DAG: %[[SB:.*]] = builtin.unrealized_conversion_cast %arg1 : memref<1x1x1xf32, #spirv.storage_class<StorageBuffer>> to !spirv.ptr<!spirv.struct<(!spirv.array<1 x f32, stride=4> [0])>, StorageBuffer> -// CHECK-DAG: %[[IMAGE_PTR:.*]] = builtin.unrealized_conversion_cast %arg0 : memref<1x1x1xf32, #spirv.storage_class<Image>> to !spirv.ptr<!spirv.sampled_image<!spirv.image<f32, Dim3D, DepthUnknown, NonArrayed, SingleSampled, NeedSampler, R32f>>, UniformConstant> - %cst = arith.constant 0 : index + // CHECK-SAME: %[[ARG0:.*]]: memref<2x3x4xf32, #spirv.storage_class<Image>>, %[[ARG1:.*]]: memref<2x3x4xf32, #spirv.storage_class<StorageBuffer>> + func.func @load_from_image_3D(%arg0: memref<2x3x4xf32, #spirv.storage_class<Image>>, %arg1: memref<2x3x4xf32, #spirv.storage_class<StorageBuffer>>) { +// CHECK-DAG: %[[SB:.*]] = builtin.unrealized_conversion_cast %[[ARG1]] : memref<2x3x4xf32, #spirv.storage_class<StorageBuffer>> to !spirv.ptr<!spirv.struct<(!spirv.array<24 x f32, stride=4> [0])>, StorageBuffer> +// CHECK-DAG: %[[IMAGE_PTR:.*]] = builtin.unrealized_conversion_cast %[[ARG0]] : memref<2x3x4xf32, #spirv.storage_class<Image>> to !spirv.ptr<!spirv.sampled_image<!spirv.image<f32, Dim3D, DepthUnknown, NonArrayed, SingleSampled, NeedSampler, R32f>>, UniformConstant> + // CHECK: %[[X:.*]] = arith.constant 3 : index + // CHECK: %[[X32:.*]] = builtin.unrealized_conversion_cast %[[X]] : index to i32 + %x = arith.constant 3 : index + // CHECK: %[[Y:.*]] = arith.constant 2 : index + // CHECK: %[[Y32:.*]] = builtin.unrealized_conversion_cast %[[Y]] : index to i32 + %y = arith.constant 2 : index + // CHECK: %[[Z:.*]] = arith.constant 1 : index + // CHECK: %[[Z32:.*]] = builtin.unrealized_conversion_cast %[[Z]] : index to i32 + %z = arith.constant 1 : index + // CHECK: %[[SIMAGE:.*]] = spirv.Load "UniformConstant" %[[IMAGE_PTR]] : !spirv.sampled_image<!spirv.image<f32, Dim3D, DepthUnknown, NonArrayed, SingleSampled, NeedSampler, R32f>> + // CHECK: %[[IMAGE:.*]] = spirv.Image %[[SIMAGE]] : !spirv.sampled_image<!spirv.image<f32, Dim3D, DepthUnknown, NonArrayed, SingleSampled, NeedSampler, R32f>> + // CHECK: %[[COORDS:.*]] = spirv.CompositeConstruct %[[X32]], %[[Y32]], %[[Z32]] : (i32, i32, i32) -> vector<3xi32> + // CHECK: %[[RES_VEC:.*]] = spirv.ImageFetch %[[IMAGE]], %[[COORDS]] : !spirv.image<f32, Dim3D, DepthUnknown, NonArrayed, SingleSampled, NeedSampler, R32f>, vector<3xi32> -> vector<4xf32> + // CHECK: %[[RESULT:.*]] = spirv.CompositeExtract %[[RES_VEC]][0 : i32] : vector<4xf32> + %0 = memref.load %arg0[%z, %y, %x] : memref<2x3x4xf32, #spirv.storage_class<Image>> + // CHECK: spirv.Store "StorageBuffer" %{{.*}}, %[[RESULT]] : f32 + memref.store %0, %arg1[%z, %y, %x] : memref<2x3x4xf32, #spirv.storage_class<StorageBuffer>> + return + } + + // CHECK-LABEL: @load_from_custom_layout_image_3D( + // 
CHECK-SAME: %[[ARG0:.*]]: memref<2x3x4xf32, #[[$CUSTOMLAYOUTMAP]], #spirv.storage_class<Image>>, %[[ARG1:.*]]: memref<2x3x4xf32, #spirv.storage_class<StorageBuffer>> + func.func @load_from_custom_layout_image_3D(%arg0: memref<2x3x4xf32, #custom, #spirv.storage_class<Image>>, %arg1: memref<2x3x4xf32, #spirv.storage_class<StorageBuffer>>) { +// CHECK-DAG: %[[SB:.*]] = builtin.unrealized_conversion_cast %[[ARG1]] : memref<2x3x4xf32, #spirv.storage_class<StorageBuffer>> to !spirv.ptr<!spirv.struct<(!spirv.array<24 x f32, stride=4> [0])>, StorageBuffer> +// CHECK-DAG: %[[IMAGE_PTR:.*]] = builtin.unrealized_conversion_cast %[[ARG0]] : memref<2x3x4xf32, #[[$CUSTOMLAYOUTMAP]], #spirv.storage_class<Image>> to !spirv.ptr<!spirv.sampled_image<!spirv.image<f32, Dim3D, DepthUnknown, NonArrayed, SingleSampled, NeedSampler, R32f>>, UniformConstant> + // CHECK: %[[X:.*]] = arith.constant 3 : index + // CHECK: %[[X32:.*]] = builtin.unrealized_conversion_cast %[[X]] : index to i32 + %x = arith.constant 3 : index + // CHECK: %[[Y:.*]] = arith.constant 2 : index + // CHECK: %[[Y32:.*]] = builtin.unrealized_conversion_cast %[[Y]] : index to i32 + %y = arith.constant 2 : index + // CHECK: %[[Z:.*]] = arith.constant 1 : index + // CHECK: %[[Z32:.*]] = builtin.unrealized_conversion_cast %[[Z]] : index to i32 + %z = arith.constant 1 : index // CHECK: %[[SIMAGE:.*]] = spirv.Load "UniformConstant" %[[IMAGE_PTR]] : !spirv.sampled_image<!spirv.image<f32, Dim3D, DepthUnknown, NonArrayed, SingleSampled, NeedSampler, R32f>> // CHECK: %[[IMAGE:.*]] = spirv.Image %[[SIMAGE]] : !spirv.sampled_image<!spirv.image<f32, Dim3D, DepthUnknown, NonArrayed, SingleSampled, NeedSampler, R32f>> - // CHECK: %[[COORDS:.*]] = spirv.CompositeConstruct %{{.*}}, %{{.*}}, %{{.*}} : (i32, i32, i32) -> vector<3xi32> + // CHECK: %[[COORDS:.*]] = spirv.CompositeConstruct %[[X32]], %[[Y32]], %[[Z32]] : (i32, i32, i32) -> vector<3xi32> // CHECK: %[[RES_VEC:.*]] = spirv.ImageFetch %[[IMAGE]], %[[COORDS]] : !spirv.image<f32, Dim3D, DepthUnknown, NonArrayed, SingleSampled, NeedSampler, R32f>, vector<3xi32> -> vector<4xf32> // CHECK: %[[RESULT:.*]] = spirv.CompositeExtract %[[RES_VEC]][0 : i32] : vector<4xf32> - %0 = memref.load %arg0[%cst, %cst, %cst] : memref<1x1x1xf32, #spirv.storage_class<Image>> + %0 = memref.load %arg0[%x, %y, %z] : memref<2x3x4xf32, #custom, #spirv.storage_class<Image>> // CHECK: spirv.Store "StorageBuffer" %{{.*}}, %[[RESULT]] : f32 - memref.store %0, %arg1[%cst, %cst, %cst] : memref<1x1x1xf32, #spirv.storage_class<StorageBuffer>> + memref.store %0, %arg1[%z, %y, %x] : memref<2x3x4xf32, #spirv.storage_class<StorageBuffer>> return } // CHECK-LABEL: @load_from_image_2D_f16( - // CHECK-SAME: %[[ARG0:.*]]: memref<1x1xf16, #spirv.storage_class<Image>>, %[[ARG1:.*]]: memref<1x1xf16, #spirv.storage_class<StorageBuffer>> - func.func @load_from_image_2D_f16(%arg0: memref<1x1xf16, #spirv.storage_class<Image>>, %arg1: memref<1x1xf16, #spirv.storage_class<StorageBuffer>>) { -// CHECK-DAG: %[[SB:.*]] = builtin.unrealized_conversion_cast %arg1 : memref<1x1xf16, #spirv.storage_class<StorageBuffer>> to !spirv.ptr<!spirv.struct<(!spirv.array<1 x f16, stride=2> [0])>, StorageBuffer> -// CHECK-DAG: %[[IMAGE_PTR:.*]] = builtin.unrealized_conversion_cast %arg0 : memref<1x1xf16, #spirv.storage_class<Image>> to !spirv.ptr<!spirv.sampled_image<!spirv.image<f16, Dim2D, DepthUnknown, NonArrayed, SingleSampled, NeedSampler, R16f>>, UniformConstant> - %cst = arith.constant 0 : index + // CHECK-SAME: %[[ARG0:.*]]: memref<2x3xf16, 
#spirv.storage_class<Image>>, %[[ARG1:.*]]: memref<2x3xf16, #spirv.storage_class<StorageBuffer>> + func.func @load_from_image_2D_f16(%arg0: memref<2x3xf16, #spirv.storage_class<Image>>, %arg1: memref<2x3xf16, #spirv.storage_class<StorageBuffer>>) { +// CHECK-DAG: %[[SB:.*]] = builtin.unrealized_conversion_cast %[[ARG1]] : memref<2x3xf16, #spirv.storage_class<StorageBuffer>> to !spirv.ptr<!spirv.struct<(!spirv.array<6 x f16, stride=2> [0])>, StorageBuffer> +// CHECK-DAG: %[[IMAGE_PTR:.*]] = builtin.unrealized_conversion_cast %[[ARG0]] : memref<2x3xf16, #spirv.storage_class<Image>> to !spirv.ptr<!spirv.sampled_image<!spirv.image<f16, Dim2D, DepthUnknown, NonArrayed, SingleSampled, NeedSampler, R16f>>, UniformConstant> + // CHECK: %[[X:.*]] = arith.constant 2 : index + // CHECK: %[[X32:.*]] = builtin.unrealized_conversion_cast %[[X]] : index to i32 + %x = arith.constant 2 : index + // CHECK: %[[Y:.*]] = arith.constant 1 : index + // CHECK: %[[Y32:.*]] = builtin.unrealized_conversion_cast %[[Y]] : index to i32 + %y = arith.constant 1 : index // CHECK: %[[SIMAGE:.*]] = spirv.Load "UniformConstant" %[[IMAGE_PTR]] : !spirv.sampled_image<!spirv.image<f16, Dim2D, DepthUnknown, NonArrayed, SingleSampled, NeedSampler, R16f>> // CHECK: %[[IMAGE:.*]] = spirv.Image %[[SIMAGE]] : !spirv.sampled_image<!spirv.image<f16, Dim2D, DepthUnknown, NonArrayed, SingleSampled, NeedSampler, R16f>> - // CHECK: %[[COORDS:.*]] = spirv.CompositeConstruct %{{.*}}, %{{.*}} : (i32, i32) -> vector<2xi32> + // CHECK: %[[COORDS:.*]] = spirv.CompositeConstruct %[[X32]], %[[Y32]] : (i32, i32) -> vector<2xi32> // CHECK: %[[RES_VEC:.*]] = spirv.ImageFetch %[[IMAGE]], %[[COORDS]] : !spirv.image<f16, Dim2D, DepthUnknown, NonArrayed, SingleSampled, NeedSampler, R16f>, vector<2xi32> -> vector<4xf16> // CHECK: %[[RESULT:.*]] = spirv.CompositeExtract %[[RES_VEC]][0 : i32] : vector<4xf16> - %0 = memref.load %arg0[%cst, %cst] : memref<1x1xf16, #spirv.storage_class<Image>> + %0 = memref.load %arg0[%y, %x] : memref<2x3xf16, #spirv.storage_class<Image>> // CHECK: spirv.Store "StorageBuffer" %{{.*}}, %[[RESULT]] : f16 - memref.store %0, %arg1[%cst, %cst] : memref<1x1xf16, #spirv.storage_class<StorageBuffer>> + memref.store %0, %arg1[%y, %x] : memref<2x3xf16, #spirv.storage_class<StorageBuffer>> return } // CHECK-LABEL: @load_from_image_2D_i32( - // CHECK-SAME: %[[ARG0:.*]]: memref<1x1xi32, #spirv.storage_class<Image>>, %[[ARG1:.*]]: memref<1x1xi32, #spirv.storage_class<StorageBuffer>> - func.func @load_from_image_2D_i32(%arg0: memref<1x1xi32, #spirv.storage_class<Image>>, %arg1: memref<1x1xi32, #spirv.storage_class<StorageBuffer>>) { -// CHECK-DAG: %[[SB:.*]] = builtin.unrealized_conversion_cast %arg1 : memref<1x1xi32, #spirv.storage_class<StorageBuffer>> to !spirv.ptr<!spirv.struct<(!spirv.array<1 x i32, stride=4> [0])>, StorageBuffer> -// CHECK-DAG: %[[IMAGE_PTR:.*]] = builtin.unrealized_conversion_cast %arg0 : memref<1x1xi32, #spirv.storage_class<Image>> to !spirv.ptr<!spirv.sampled_image<!spirv.image<i32, Dim2D, DepthUnknown, NonArrayed, SingleSampled, NeedSampler, R32i>>, UniformConstant> - %cst = arith.constant 0 : index + // CHECK-SAME: %[[ARG0:.*]]: memref<2x3xi32, #spirv.storage_class<Image>>, %[[ARG1:.*]]: memref<2x3xi32, #spirv.storage_class<StorageBuffer>> + func.func @load_from_image_2D_i32(%arg0: memref<2x3xi32, #spirv.storage_class<Image>>, %arg1: memref<2x3xi32, #spirv.storage_class<StorageBuffer>>) { +// CHECK-DAG: %[[SB:.*]] = builtin.unrealized_conversion_cast %[[ARG1]] : memref<2x3xi32, #spirv.storage_class<StorageBuffer>> 
to !spirv.ptr<!spirv.struct<(!spirv.array<6 x i32, stride=4> [0])>, StorageBuffer> +// CHECK-DAG: %[[IMAGE_PTR:.*]] = builtin.unrealized_conversion_cast %[[ARG0]] : memref<2x3xi32, #spirv.storage_class<Image>> to !spirv.ptr<!spirv.sampled_image<!spirv.image<i32, Dim2D, DepthUnknown, NonArrayed, SingleSampled, NeedSampler, R32i>>, UniformConstant> + // CHECK: %[[X:.*]] = arith.constant 2 : index + // CHECK: %[[X32:.*]] = builtin.unrealized_conversion_cast %[[X]] : index to i32 + %x = arith.constant 2 : index + // CHECK: %[[Y:.*]] = arith.constant 1 : index + // CHECK: %[[Y32:.*]] = builtin.unrealized_conversion_cast %[[Y]] : index to i32 + %y = arith.constant 1 : index // CHECK: %[[SIMAGE:.*]] = spirv.Load "UniformConstant" %[[IMAGE_PTR]] : !spirv.sampled_image<!spirv.image<i32, Dim2D, DepthUnknown, NonArrayed, SingleSampled, NeedSampler, R32i>> // CHECK: %[[IMAGE:.*]] = spirv.Image %[[SIMAGE]] : !spirv.sampled_image<!spirv.image<i32, Dim2D, DepthUnknown, NonArrayed, SingleSampled, NeedSampler, R32i>> - // CHECK: %[[COORDS:.*]] = spirv.CompositeConstruct %{{.*}}, %{{.*}} : (i32, i32) -> vector<2xi32> + // CHECK: %[[COORDS:.*]] = spirv.CompositeConstruct %[[X32]], %[[Y32]] : (i32, i32) -> vector<2xi32> // CHECK: %[[RES_VEC:.*]] = spirv.ImageFetch %[[IMAGE]], %[[COORDS]] : !spirv.image<i32, Dim2D, DepthUnknown, NonArrayed, SingleSampled, NeedSampler, R32i>, vector<2xi32> -> vector<4xi32> // CHECK: %[[RESULT:.*]] = spirv.CompositeExtract %[[RES_VEC]][0 : i32] : vector<4xi32> - %0 = memref.load %arg0[%cst, %cst] : memref<1x1xi32, #spirv.storage_class<Image>> + %0 = memref.load %arg0[%y, %x] : memref<2x3xi32, #spirv.storage_class<Image>> // CHECK: spirv.Store "StorageBuffer" %{{.*}}, %[[RESULT]] : i32 - memref.store %0, %arg1[%cst, %cst] : memref<1x1xi32, #spirv.storage_class<StorageBuffer>> + memref.store %0, %arg1[%y, %x] : memref<2x3xi32, #spirv.storage_class<StorageBuffer>> return } // CHECK-LABEL: @load_from_image_2D_ui32( - // CHECK-SAME: %[[ARG0:.*]]: memref<1x1xui32, #spirv.storage_class<Image>>, %[[ARG1:.*]]: memref<1x1xui32, #spirv.storage_class<StorageBuffer>> - func.func @load_from_image_2D_ui32(%arg0: memref<1x1xui32, #spirv.storage_class<Image>>, %arg1: memref<1x1xui32, #spirv.storage_class<StorageBuffer>>) { -// CHECK-DAG: %[[SB:.*]] = builtin.unrealized_conversion_cast %arg1 : memref<1x1xui32, #spirv.storage_class<StorageBuffer>> to !spirv.ptr<!spirv.struct<(!spirv.array<1 x ui32, stride=4> [0])>, StorageBuffer> -// CHECK-DAG: %[[IMAGE_PTR:.*]] = builtin.unrealized_conversion_cast %arg0 : memref<1x1xui32, #spirv.storage_class<Image>> to !spirv.ptr<!spirv.sampled_image<!spirv.image<ui32, Dim2D, DepthUnknown, NonArrayed, SingleSampled, NeedSampler, R32ui>>, UniformConstant> - %cst = arith.constant 0 : index + // CHECK-SAME: %[[ARG0:.*]]: memref<2x3xui32, #spirv.storage_class<Image>>, %[[ARG1:.*]]: memref<2x3xui32, #spirv.storage_class<StorageBuffer>> + func.func @load_from_image_2D_ui32(%arg0: memref<2x3xui32, #spirv.storage_class<Image>>, %arg1: memref<2x3xui32, #spirv.storage_class<StorageBuffer>>) { +// CHECK-DAG: %[[SB:.*]] = builtin.unrealized_conversion_cast %[[ARG1]] : memref<2x3xui32, #spirv.storage_class<StorageBuffer>> to !spirv.ptr<!spirv.struct<(!spirv.array<6 x ui32, stride=4> [0])>, StorageBuffer> +// CHECK-DAG: %[[IMAGE_PTR:.*]] = builtin.unrealized_conversion_cast %[[ARG0]] : memref<2x3xui32, #spirv.storage_class<Image>> to !spirv.ptr<!spirv.sampled_image<!spirv.image<ui32, Dim2D, DepthUnknown, NonArrayed, SingleSampled, NeedSampler, R32ui>>, UniformConstant> + // 
CHECK: %[[X:.*]] = arith.constant 2 : index + // CHECK: %[[X32:.*]] = builtin.unrealized_conversion_cast %[[X]] : index to i32 + %x = arith.constant 2 : index + // CHECK: %[[Y:.*]] = arith.constant 1 : index + // CHECK: %[[Y32:.*]] = builtin.unrealized_conversion_cast %[[Y]] : index to i32 + %y = arith.constant 1 : index // CHECK: %[[SIMAGE:.*]] = spirv.Load "UniformConstant" %[[IMAGE_PTR]] : !spirv.sampled_image<!spirv.image<ui32, Dim2D, DepthUnknown, NonArrayed, SingleSampled, NeedSampler, R32ui>> // CHECK: %[[IMAGE:.*]] = spirv.Image %[[SIMAGE]] : !spirv.sampled_image<!spirv.image<ui32, Dim2D, DepthUnknown, NonArrayed, SingleSampled, NeedSampler, R32ui>> - // CHECK: %[[COORDS:.*]] = spirv.CompositeConstruct %{{.*}}, %{{.*}} : (i32, i32) -> vector<2xi32> + // CHECK: %[[COORDS:.*]] = spirv.CompositeConstruct %[[X32]], %[[Y32]] : (i32, i32) -> vector<2xi32> // CHECK: %[[RES_VEC:.*]] = spirv.ImageFetch %[[IMAGE]], %[[COORDS]] : !spirv.image<ui32, Dim2D, DepthUnknown, NonArrayed, SingleSampled, NeedSampler, R32ui>, vector<2xi32> -> vector<4xui32> // CHECK: %[[RESULT:.*]] = spirv.CompositeExtract %[[RES_VEC]][0 : i32] : vector<4xui32> - %0 = memref.load %arg0[%cst, %cst] : memref<1x1xui32, #spirv.storage_class<Image>> + %0 = memref.load %arg0[%y, %x] : memref<2x3xui32, #spirv.storage_class<Image>> // CHECK: spirv.Store "StorageBuffer" %{{.*}}, %[[RESULT]] : ui32 - memref.store %0, %arg1[%cst, %cst] : memref<1x1xui32, #spirv.storage_class<StorageBuffer>> + memref.store %0, %arg1[%y, %x] : memref<2x3xui32, #spirv.storage_class<StorageBuffer>> return } // CHECK-LABEL: @load_from_image_2D_i16( - // CHECK-SAME: %[[ARG0:.*]]: memref<1x1xi16, #spirv.storage_class<Image>>, %[[ARG1:.*]]: memref<1x1xi16, #spirv.storage_class<StorageBuffer>> - func.func @load_from_image_2D_i16(%arg0: memref<1x1xi16, #spirv.storage_class<Image>>, %arg1: memref<1x1xi16, #spirv.storage_class<StorageBuffer>>) { -// CHECK-DAG: %[[SB:.*]] = builtin.unrealized_conversion_cast %arg1 : memref<1x1xi16, #spirv.storage_class<StorageBuffer>> to !spirv.ptr<!spirv.struct<(!spirv.array<1 x i16, stride=2> [0])>, StorageBuffer> -// CHECK-DAG: %[[IMAGE_PTR:.*]] = builtin.unrealized_conversion_cast %arg0 : memref<1x1xi16, #spirv.storage_class<Image>> to !spirv.ptr<!spirv.sampled_image<!spirv.image<i16, Dim2D, DepthUnknown, NonArrayed, SingleSampled, NeedSampler, R16i>>, UniformConstant> - %cst = arith.constant 0 : index + // CHECK-SAME: %[[ARG0:.*]]: memref<2x3xi16, #spirv.storage_class<Image>>, %[[ARG1:.*]]: memref<2x3xi16, #spirv.storage_class<StorageBuffer>> + func.func @load_from_image_2D_i16(%arg0: memref<2x3xi16, #spirv.storage_class<Image>>, %arg1: memref<2x3xi16, #spirv.storage_class<StorageBuffer>>) { +// CHECK-DAG: %[[SB:.*]] = builtin.unrealized_conversion_cast %[[ARG1]] : memref<2x3xi16, #spirv.storage_class<StorageBuffer>> to !spirv.ptr<!spirv.struct<(!spirv.array<6 x i16, stride=2> [0])>, StorageBuffer> +// CHECK-DAG: %[[IMAGE_PTR:.*]] = builtin.unrealized_conversion_cast %[[ARG0]] : memref<2x3xi16, #spirv.storage_class<Image>> to !spirv.ptr<!spirv.sampled_image<!spirv.image<i16, Dim2D, DepthUnknown, NonArrayed, SingleSampled, NeedSampler, R16i>>, UniformConstant> + // CHECK: %[[X:.*]] = arith.constant 2 : index + // CHECK: %[[X32:.*]] = builtin.unrealized_conversion_cast %[[X]] : index to i32 + %x = arith.constant 2 : index + // CHECK: %[[Y:.*]] = arith.constant 1 : index + // CHECK: %[[Y32:.*]] = builtin.unrealized_conversion_cast %[[Y]] : index to i32 + %y = arith.constant 1 : index // CHECK: %[[SIMAGE:.*]] = spirv.Load 
"UniformConstant" %[[IMAGE_PTR]] : !spirv.sampled_image<!spirv.image<i16, Dim2D, DepthUnknown, NonArrayed, SingleSampled, NeedSampler, R16i>> // CHECK: %[[IMAGE:.*]] = spirv.Image %[[SIMAGE]] : !spirv.sampled_image<!spirv.image<i16, Dim2D, DepthUnknown, NonArrayed, SingleSampled, NeedSampler, R16i>> - // CHECK: %[[COORDS:.*]] = spirv.CompositeConstruct %{{.*}}, %{{.*}} : (i32, i32) -> vector<2xi32> + // CHECK: %[[COORDS:.*]] = spirv.CompositeConstruct %[[X32]], %[[Y32]] : (i32, i32) -> vector<2xi32> // CHECK: %[[RES_VEC:.*]] = spirv.ImageFetch %[[IMAGE]], %[[COORDS]] : !spirv.image<i16, Dim2D, DepthUnknown, NonArrayed, SingleSampled, NeedSampler, R16i>, vector<2xi32> -> vector<4xi16> // CHECK: %[[RESULT:.*]] = spirv.CompositeExtract %[[RES_VEC]][0 : i32] : vector<4xi16> - %0 = memref.load %arg0[%cst, %cst] : memref<1x1xi16, #spirv.storage_class<Image>> + %0 = memref.load %arg0[%y, %x] : memref<2x3xi16, #spirv.storage_class<Image>> // CHECK: spirv.Store "StorageBuffer" %{{.*}}, %[[RESULT]] : i16 - memref.store %0, %arg1[%cst, %cst] : memref<1x1xi16, #spirv.storage_class<StorageBuffer>> + memref.store %0, %arg1[%y, %x] : memref<2x3xi16, #spirv.storage_class<StorageBuffer>> return } // CHECK-LABEL: @load_from_image_2D_ui16( - // CHECK-SAME: %[[ARG0:.*]]: memref<1x1xui16, #spirv.storage_class<Image>>, %[[ARG1:.*]]: memref<1x1xui16, #spirv.storage_class<StorageBuffer>> - func.func @load_from_image_2D_ui16(%arg0: memref<1x1xui16, #spirv.storage_class<Image>>, %arg1: memref<1x1xui16, #spirv.storage_class<StorageBuffer>>) { -// CHECK-DAG: %[[SB:.*]] = builtin.unrealized_conversion_cast %arg1 : memref<1x1xui16, #spirv.storage_class<StorageBuffer>> to !spirv.ptr<!spirv.struct<(!spirv.array<1 x ui16, stride=2> [0])>, StorageBuffer> -// CHECK-DAG: %[[IMAGE_PTR:.*]] = builtin.unrealized_conversion_cast %arg0 : memref<1x1xui16, #spirv.storage_class<Image>> to !spirv.ptr<!spirv.sampled_image<!spirv.image<ui16, Dim2D, DepthUnknown, NonArrayed, SingleSampled, NeedSampler, R16ui>>, UniformConstant> - %cst = arith.constant 0 : index + // CHECK-SAME: %[[ARG0:.*]]: memref<2x3xui16, #spirv.storage_class<Image>>, %[[ARG1:.*]]: memref<2x3xui16, #spirv.storage_class<StorageBuffer>> + func.func @load_from_image_2D_ui16(%arg0: memref<2x3xui16, #spirv.storage_class<Image>>, %arg1: memref<2x3xui16, #spirv.storage_class<StorageBuffer>>) { +// CHECK-DAG: %[[SB:.*]] = builtin.unrealized_conversion_cast %[[ARG1]] : memref<2x3xui16, #spirv.storage_class<StorageBuffer>> to !spirv.ptr<!spirv.struct<(!spirv.array<6 x ui16, stride=2> [0])>, StorageBuffer> +// CHECK-DAG: %[[IMAGE_PTR:.*]] = builtin.unrealized_conversion_cast %[[ARG0]] : memref<2x3xui16, #spirv.storage_class<Image>> to !spirv.ptr<!spirv.sampled_image<!spirv.image<ui16, Dim2D, DepthUnknown, NonArrayed, SingleSampled, NeedSampler, R16ui>>, UniformConstant> + // CHECK: %[[X:.*]] = arith.constant 2 : index + // CHECK: %[[X32:.*]] = builtin.unrealized_conversion_cast %[[X]] : index to i32 + %x = arith.constant 2 : index + // CHECK: %[[Y:.*]] = arith.constant 1 : index + // CHECK: %[[Y32:.*]] = builtin.unrealized_conversion_cast %[[Y]] : index to i32 + %y = arith.constant 1 : index // CHECK: %[[SIMAGE:.*]] = spirv.Load "UniformConstant" %[[IMAGE_PTR]] : !spirv.sampled_image<!spirv.image<ui16, Dim2D, DepthUnknown, NonArrayed, SingleSampled, NeedSampler, R16ui>> // CHECK: %[[IMAGE:.*]] = spirv.Image %[[SIMAGE]] : !spirv.sampled_image<!spirv.image<ui16, Dim2D, DepthUnknown, NonArrayed, SingleSampled, NeedSampler, R16ui>> - // CHECK: %[[COORDS:.*]] = 
spirv.CompositeConstruct %{{.*}}, %{{.*}} : (i32, i32) -> vector<2xi32> + // CHECK: %[[COORDS:.*]] = spirv.CompositeConstruct %[[X32]], %[[Y32]] : (i32, i32) -> vector<2xi32> // CHECK: %[[RES_VEC:.*]] = spirv.ImageFetch %[[IMAGE]], %[[COORDS]] : !spirv.image<ui16, Dim2D, DepthUnknown, NonArrayed, SingleSampled, NeedSampler, R16ui>, vector<2xi32> -> vector<4xui16> // CHECK: %[[RESULT:.*]] = spirv.CompositeExtract %[[RES_VEC]][0 : i32] : vector<4xui16> - %0 = memref.load %arg0[%cst, %cst] : memref<1x1xui16, #spirv.storage_class<Image>> + %0 = memref.load %arg0[%y, %x] : memref<2x3xui16, #spirv.storage_class<Image>> // CHECK: spirv.Store "StorageBuffer" %{{.*}}, %[[RESULT]] : ui16 - memref.store %0, %arg1[%cst, %cst] : memref<1x1xui16, #spirv.storage_class<StorageBuffer>> + memref.store %0, %arg1[%y, %x] : memref<2x3xui16, #spirv.storage_class<StorageBuffer>> return } @@ -697,4 +788,15 @@ module attributes { memref.store %0, %arg1[%cst] : memref<1xvector<1xf32>, #spirv.storage_class<StorageBuffer>> return } + + // CHECK-LABEL: @load_non_perm_layout( + func.func @load_non_perm_layout(%arg0: memref<2x4xf32, #non_permutation, #spirv.storage_class<Image>>, %arg1: memref<2x4xf32, #spirv.storage_class<StorageBuffer>>) { + %x = arith.constant 3 : index + %y = arith.constant 1 : index + // CHECK-NOT: spirv.Image + // CHECK-NOT: spirv.ImageFetch + %0 = memref.load %arg0[%y, %x] : memref<2x4xf32, #non_permutation, #spirv.storage_class<Image>> + memref.store %0, %arg1[%y, %x] : memref<2x4xf32, #spirv.storage_class<StorageBuffer>> + return + } } diff --git a/mlir/test/Dialect/Math/ops.mlir b/mlir/test/Dialect/Math/ops.mlir index cb10fc4..f085d1c 100644 --- a/mlir/test/Dialect/Math/ops.mlir +++ b/mlir/test/Dialect/Math/ops.mlir @@ -62,6 +62,18 @@ func.func @sin(%f: f32, %v: vector<4xf32>, %t: tensor<4x4x?xf32>) { return } +// CHECK-LABEL: func @sincos( +// CHECK-SAME: %[[F:.*]]: f32, %[[V:.*]]: vector<4xf32>, %[[T:.*]]: tensor<4x4x?xf32>) +func.func @sincos(%f: f32, %v: vector<4xf32>, %t: tensor<4x4x?xf32>) { + // CHECK: %{{.*}} = math.sincos %[[F]] : f32 + %0:2 = math.sincos %f : f32 + // CHECK: %{{.*}} = math.sincos %[[V]] : vector<4xf32> + %1:2 = math.sincos %v : vector<4xf32> + // CHECK: %{{.*}} = math.sincos %[[T]] : tensor<4x4x?xf32> + %2:2 = math.sincos %t : tensor<4x4x?xf32> + return +} + // CHECK-LABEL: func @erf( // CHECK-SAME: %[[F:.*]]: f32, %[[V:.*]]: vector<4xf32>, %[[T:.*]]: tensor<4x4x?xf32>) func.func @erf(%f: f32, %v: vector<4xf32>, %t: tensor<4x4x?xf32>) { |