rebaseusers/fmayer/sprscudo-allow-to-resize-allocation-ring-buffer

Created using spr 1.3.4
author: Florian Mayer <fmayer@google.com> 2024-02-23 11:31:14 -0800
committer: Florian Mayer <fmayer@google.com> 2024-02-23 11:31:14 -0800
commit: 886b4bc97b0ed5a5e041a0117a584182fc7989c1 (patch)
tree: 43cdc0e15e12c298c09251dda38e834e7e778049 /llvm
parent: af8afe08ee20a04b2ccb363cac66aa02cfaecd02 (diff)
parent: 8d536f83545f071948888983e2db25ce23a8302d (diff)
download: llvm-886b4bc97b0ed5a5e041a0117a584182fc7989c1.zip
llvm-886b4bc97b0ed5a5e041a0117a584182fc7989c1.tar.gz
llvm-886b4bc97b0ed5a5e041a0117a584182fc7989c1.tar.bz2
305 files changed, 29984 insertions, 7650 deletions
diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt
index 98cef00..f5f7d3f 100644
--- a/llvm/CMakeLists.txt
+++ b/llvm/CMakeLists.txt
@@ -168,7 +168,20 @@ foreach(proj IN LISTS LLVM_ENABLE_RUNTIMES)
   endif()
 endforeach()
 
-if ("libc" IN_LIST LLVM_ENABLE_RUNTIMES)
+set(NEED_LIBC_HDRGEN FALSE)
+if("libc" IN_LIST LLVM_ENABLE_RUNTIMES)
+  set(NEED_LIBC_HDRGEN TRUE)
+else()
+  foreach(_name ${LLVM_RUNTIME_TARGETS})
+    if("libc" IN_LIST RUNTIMES_${_name}_LLVM_ENABLE_RUNTIMES)
+      set(NEED_LIBC_HDRGEN TRUE)
+      if("${_name}" STREQUAL "amdgcn-amd-amdhsa" OR "${_name}" STREQUAL "nvptx64-nvidia-cuda")
+        set(LLVM_LIBC_GPU_BUILD ON)
+      endif()
+    endif()
+  endforeach()
+endif()
+if(NEED_LIBC_HDRGEN)
   # To build the libc runtime, we need to be able to build few libc build
   # tools from the "libc" project. So, we add it to the list of enabled
   # projects.
@@ -177,6 +190,7 @@ if ("libc" IN_LIST LLVM_ENABLE_RUNTIMES)
     list(APPEND LLVM_ENABLE_PROJECTS "libc")
   endif()
 endif()
+unset(NEED_LIBC_HDRGEN)
 
 # LLVM_ENABLE_PROJECTS_USED is `ON` if the user has ever used the
 # `LLVM_ENABLE_PROJECTS` CMake cache variable.  This exists for
diff --git a/llvm/cmake/modules/HandleLLVMOptions.cmake b/llvm/cmake/modules/HandleLLVMOptions.cmake
index 486df22..08ff49d 100644
--- a/llvm/cmake/modules/HandleLLVMOptions.cmake
+++ b/llvm/cmake/modules/HandleLLVMOptions.cmake
@@ -120,7 +120,19 @@ if( LLVM_ENABLE_ASSERTIONS )
   endif()
 endif()
 
+# If we are targeting a GPU architecture in a runtimes build we want to ignore
+# all the standard flag handling.
+if("${LLVM_RUNTIMES_TARGET}" MATCHES "^amdgcn" OR
+   "${LLVM_RUNTIMES_TARGET}" MATCHES "^nvptx64")
+  return()
+endif()
+
 if(LLVM_ENABLE_EXPENSIVE_CHECKS)
+  # When LLVM_ENABLE_EXPENSIVE_CHECKS is ON, LLVM will intercept errors
+  # using assert(). An explicit check is performed here.
+  if (NOT LLVM_ENABLE_ASSERTIONS)
+    message(FATAL_ERROR "LLVM_ENABLE_EXPENSIVE_CHECKS requires LLVM_ENABLE_ASSERTIONS \"ON\".")
+  endif()
   add_compile_definitions(EXPENSIVE_CHECKS)
 
   # In some libstdc++ versions, std::min_element is not constexpr when
diff --git a/llvm/docs/CommandGuide/llvm-readelf.rst b/llvm/docs/CommandGuide/llvm-readelf.rst
index 6ee4a5d..675628f 100644
--- a/llvm/docs/CommandGuide/llvm-readelf.rst
+++ b/llvm/docs/CommandGuide/llvm-readelf.rst
@@ -38,6 +38,11 @@ OPTIONS
  Display the contents of the basic block address map section(s), which contain the
  address of each function, along with the relative offset of each basic block.
 
+.. option:: --decompress, -z
+
+  Dump decompressed section content when used with ``-x`` or ``-p``.
+  If the section(s) are not compressed, they are displayed as is.
+
 .. option:: --demangle, -C
 
  Display demangled symbol names in the output.
diff --git a/llvm/docs/CommandGuide/llvm-readobj.rst b/llvm/docs/CommandGuide/llvm-readobj.rst
index cb9232e..6d78a03 100644
--- a/llvm/docs/CommandGuide/llvm-readobj.rst
+++ b/llvm/docs/CommandGuide/llvm-readobj.rst
@@ -56,6 +56,11 @@ file formats.
 
  Display the address-significance table.
 
+.. option:: --decompress, -z
+
+  Dump decompressed section content when used with ``-x`` or ``-p``.
+  If the section(s) are not compressed, they are displayed as is.
+
 .. option:: --expand-relocs
 
  When used with :option:`--relocs`, display each relocation in an expanded
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index fd2e3aa..19ca9f6 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -61,10 +61,13 @@ run by the parser after parsing input assembly and by the optimizer
 before it outputs bitcode. The violations pointed out by the verifier
 pass indicate bugs in transformation passes or input to the parser.
 
+Syntax
+======
+
 .. _identifiers:
 
 Identifiers
-===========
+-----------
 
 LLVM identifiers come in two basic types: global and local. Global
 identifiers (functions, global variables) begin with the ``'@'``
@@ -140,6 +143,34 @@ It also shows a convention that we follow in this document. When
 demonstrating instructions, we will follow an instruction with a comment
 that defines the type and name of value produced.
 
+.. _strings:
+
+String constants
+----------------
+
+Strings in LLVM programs are delimited by ``"`` characters. Within a
+string, all bytes are treated literally with the exception of ``\``
+characters, which start escapes, and the first ``"`` character, which
+ends the string.
+
+There are two kinds of escapes.
+
+* ``\\`` represents a single ``\`` character.
+
+* ``\`` followed by two hexadecimal characters (0-9, a-f, or A-F)
+  represents the byte with the given value (e.g. \x00 represents a
+  null byte).
+
+To represent a ``"`` character, use ``\22``. (``\"`` will end the string
+with a trailing ``\``.)
+
+Newlines do not terminate string constants; strings can span multiple
+lines.
+
+The interpretation of string constants (e.g. their character encoding)
+depends on context.
+
+
 High Level Structure
 ====================
 
@@ -16718,6 +16749,7 @@ an operation is greater than the maximum value, the result is set (or
 "clamped") to this maximum. If it is below the minimum, it is clamped to this
 minimum.
 
+.. _int_sadd_sat:
 
 '``llvm.sadd.sat.*``' Intrinsics
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -16767,6 +16799,8 @@ Examples
       %res = call i4 @llvm.sadd.sat.i4(i4 -4, i4 -5)  ; %res = -8
 
 
+.. _int_uadd_sat:
+
 '``llvm.uadd.sat.*``' Intrinsics
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
@@ -16814,6 +16848,8 @@ Examples
       %res = call i4 @llvm.uadd.sat.i4(i4 8, i4 8)  ; %res = 15
 
 
+.. _int_ssub_sat:
+
 '``llvm.ssub.sat.*``' Intrinsics
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
@@ -16862,6 +16898,8 @@ Examples
       %res = call i4 @llvm.ssub.sat.i4(i4 4, i4 -5)  ; %res = 7
 
 
+.. _int_usub_sat:
+
 '``llvm.usub.sat.*``' Intrinsics
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
@@ -23579,6 +23617,202 @@ Examples:
       %also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> poison
 
 
+.. _int_vp_sadd_sat:
+
+'``llvm.vp.sadd.sat.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+      declare <16 x i32>  @llvm.vp.sadd.sat.v16i32 (<16 x i32> <left_op> <16 x i32> <right_op>, <16 x i1> <mask>, i32 <vector_length>)
+      declare <vscale x 4 x i32>  @llvm.vp.sadd.sat.nxv4i32 (<vscale x 4 x i32> <left_op>, <vscale x 4 x i32> <right_op>, <vscale x 4 x i1> <mask>, i32 <vector_length>)
+      declare <256 x i64>  @llvm.vp.sadd.sat.v256i64 (<256 x i64> <left_op>, <256 x i64> <right_op>, <256 x i1> <mask>, i32 <vector_length>)
+
+Overview:
+"""""""""
+
+Predicated signed saturating addition of two vectors of integers.
+
+
+Arguments:
+""""""""""
+
+The first two operands and the result have the same vector of integer type. The
+third operand is the vector mask and has the same number of elements as the
+result vector type. The fourth operand is the explicit vector length of the
+operation.
+
+Semantics:
+""""""""""
+
+The '``llvm.vp.sadd.sat``' intrinsic performs sadd.sat (:ref:`sadd.sat <int_sadd_sat>`)
+of the first and second vector operands on each enabled lane. The result on
+disabled lanes is a :ref:`poison value <poisonvalues>`.
+
+
+Examples:
+"""""""""
+
+.. code-block:: llvm
+
+      %r = call <4 x i32> @llvm.vp.sadd.sat.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> %mask, i32 %evl)
+      ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r
+
+      %t = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %a, <4 x i32> %b)
+      %also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> poison
+
+
+.. _int_vp_uadd_sat:
+
+'``llvm.vp.uadd.sat.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+      declare <16 x i32>  @llvm.vp.uadd.sat.v16i32 (<16 x i32> <left_op> <16 x i32> <right_op>, <16 x i1> <mask>, i32 <vector_length>)
+      declare <vscale x 4 x i32>  @llvm.vp.uadd.sat.nxv4i32 (<vscale x 4 x i32> <left_op>, <vscale x 4 x i32> <right_op>, <vscale x 4 x i1> <mask>, i32 <vector_length>)
+      declare <256 x i64>  @llvm.vp.uadd.sat.v256i64 (<256 x i64> <left_op>, <256 x i64> <right_op>, <256 x i1> <mask>, i32 <vector_length>)
+
+Overview:
+"""""""""
+
+Predicated unsigned saturating addition of two vectors of integers.
+
+
+Arguments:
+""""""""""
+
+The first two operands and the result have the same vector of integer type. The
+third operand is the vector mask and has the same number of elements as the
+result vector type. The fourth operand is the explicit vector length of the
+operation.
+
+Semantics:
+""""""""""
+
+The '``llvm.vp.uadd.sat``' intrinsic performs uadd.sat (:ref:`uadd.sat <int_uadd_sat>`)
+of the first and second vector operands on each enabled lane. The result on
+disabled lanes is a :ref:`poison value <poisonvalues>`.
+
+
+Examples:
+"""""""""
+
+.. code-block:: llvm
+
+      %r = call <4 x i32> @llvm.vp.uadd.sat.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> %mask, i32 %evl)
+      ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r
+
+      %t = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> %a, <4 x i32> %b)
+      %also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> poison
+
+
+.. _int_vp_ssub_sat:
+
+'``llvm.vp.ssub.sat.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+      declare <16 x i32>  @llvm.vp.ssub.sat.v16i32 (<16 x i32> <left_op> <16 x i32> <right_op>, <16 x i1> <mask>, i32 <vector_length>)
+      declare <vscale x 4 x i32>  @llvm.vp.ssub.sat.nxv4i32 (<vscale x 4 x i32> <left_op>, <vscale x 4 x i32> <right_op>, <vscale x 4 x i1> <mask>, i32 <vector_length>)
+      declare <256 x i64>  @llvm.vp.ssub.sat.v256i64 (<256 x i64> <left_op>, <256 x i64> <right_op>, <256 x i1> <mask>, i32 <vector_length>)
+
+Overview:
+"""""""""
+
+Predicated signed saturating subtraction of two vectors of integers.
+
+
+Arguments:
+""""""""""
+
+The first two operands and the result have the same vector of integer type. The
+third operand is the vector mask and has the same number of elements as the
+result vector type. The fourth operand is the explicit vector length of the
+operation.
+
+Semantics:
+""""""""""
+
+The '``llvm.vp.ssub.sat``' intrinsic performs ssub.sat (:ref:`ssub.sat <int_ssub_sat>`)
+of the first and second vector operands on each enabled lane. The result on
+disabled lanes is a :ref:`poison value <poisonvalues>`.
+
+
+Examples:
+"""""""""
+
+.. code-block:: llvm
+
+      %r = call <4 x i32> @llvm.vp.ssub.sat.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> %mask, i32 %evl)
+      ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r
+
+      %t = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %a, <4 x i32> %b)
+      %also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> poison
+
+
+.. _int_vp_usub_sat:
+
+'``llvm.vp.usub.sat.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+      declare <16 x i32>  @llvm.vp.usub.sat.v16i32 (<16 x i32> <left_op> <16 x i32> <right_op>, <16 x i1> <mask>, i32 <vector_length>)
+      declare <vscale x 4 x i32>  @llvm.vp.usub.sat.nxv4i32 (<vscale x 4 x i32> <left_op>, <vscale x 4 x i32> <right_op>, <vscale x 4 x i1> <mask>, i32 <vector_length>)
+      declare <256 x i64>  @llvm.vp.usub.sat.v256i64 (<256 x i64> <left_op>, <256 x i64> <right_op>, <256 x i1> <mask>, i32 <vector_length>)
+
+Overview:
+"""""""""
+
+Predicated unsigned saturating subtraction of two vectors of integers.
+
+
+Arguments:
+""""""""""
+
+The first two operands and the result have the same vector of integer type. The
+third operand is the vector mask and has the same number of elements as the
+result vector type. The fourth operand is the explicit vector length of the
+operation.
+
+Semantics:
+""""""""""
+
+The '``llvm.vp.usub.sat``' intrinsic performs usub.sat (:ref:`usub.sat <int_usub_sat>`)
+of the first and second vector operands on each enabled lane. The result on
+disabled lanes is a :ref:`poison value <poisonvalues>`.
+
+
+Examples:
+"""""""""
+
+.. code-block:: llvm
+
+      %r = call <4 x i32> @llvm.vp.usub.sat.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> %mask, i32 %evl)
+      ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r
+
+      %t = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %a, <4 x i32> %b)
+      %also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> poison
+
+
 .. _int_vp_fshl:
 
 '``llvm.vp.fshl.*``' Intrinsics
diff --git a/llvm/include/llvm/ADT/iterator_range.h b/llvm/include/llvm/ADT/iterator_range.h
index 2dc2279..7d288ea 100644
--- a/llvm/include/llvm/ADT/iterator_range.h
+++ b/llvm/include/llvm/ADT/iterator_range.h
@@ -43,8 +43,8 @@ class iterator_range {
   IteratorT begin_iterator, end_iterator;
 
 public:
-#if __GNUC__ == 7
-  // Be careful no to break gcc-7 on the mlir target.
+#if __GNUC__ == 7 || (__GNUC__ == 8 && __GNUC_MINOR__ < 4)
+  // Be careful no to break gcc-7 and gcc-8 < 8.4 on the mlir target.
   // See https://github.com/llvm/llvm-project/issues/63843
   template <typename Container>
 #else
diff --git a/llvm/include/llvm/CodeGen/MachinePassManager.h b/llvm/include/llvm/CodeGen/MachinePassManager.h
index a0ad7d7..7713c55 100644
--- a/llvm/include/llvm/CodeGen/MachinePassManager.h
+++ b/llvm/include/llvm/CodeGen/MachinePassManager.h
@@ -25,17 +25,18 @@
 
 #include "llvm/ADT/FunctionExtras.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/IR/PassManager.h"
+#include "llvm/IR/PassManagerInternal.h"
 #include "llvm/Support/Error.h"
 
-#include <map>
-
 namespace llvm {
 class Module;
 class Function;
 class MachineFunction;
 
 extern template class AnalysisManager<MachineFunction>;
+using MachineFunctionAnalysisManager = AnalysisManager<MachineFunction>;
 
 /// A CRTP mix-in that provides informational APIs needed for machine passes.
 ///
@@ -46,217 +47,247 @@ struct MachinePassInfoMixin : public PassInfoMixin<DerivedT> {
   // TODO: Add MachineFunctionProperties support.
 };
 
-/// An AnalysisManager<MachineFunction> that also exposes IR analysis results.
-class MachineFunctionAnalysisManager : public AnalysisManager<MachineFunction> {
-public:
-  using Base = AnalysisManager<MachineFunction>;
+namespace detail {
+struct MachinePassConcept
+    : PassConcept<MachineFunction, MachineFunctionAnalysisManager> {
+  virtual MachineFunctionProperties getRequiredProperties() const = 0;
+  virtual MachineFunctionProperties getSetProperties() const = 0;
+  virtual MachineFunctionProperties getClearedProperties() const = 0;
+};
 
-  MachineFunctionAnalysisManager() : FAM(nullptr), MAM(nullptr) {}
-  MachineFunctionAnalysisManager(FunctionAnalysisManager &FAM,
-                                 ModuleAnalysisManager &MAM)
-      : FAM(&FAM), MAM(&MAM) {}
-  MachineFunctionAnalysisManager(MachineFunctionAnalysisManager &&) = default;
-  MachineFunctionAnalysisManager &
-  operator=(MachineFunctionAnalysisManager &&) = default;
+template <typename PassT> struct MachinePassModel : MachinePassConcept {
+  explicit MachinePassModel(PassT Pass) : Pass(std::move(Pass)) {}
+  // We have to explicitly define all the special member functions because MSVC
+  // refuses to generate them.
+  MachinePassModel(const MachinePassModel &Arg) : Pass(Arg.Pass) {}
+  MachinePassModel(MachinePassModel &&Arg) : Pass(std::move(Arg.Pass)) {}
 
-  /// Get the result of an analysis pass for a Function.
-  ///
-  /// Runs the analysis if a cached result is not available.
-  template <typename PassT> typename PassT::Result &getResult(Function &F) {
-    return FAM->getResult<PassT>(F);
+  friend void swap(MachinePassModel &LHS, MachinePassModel &RHS) {
+    using std::swap;
+    swap(LHS.Pass, RHS.Pass);
   }
 
-  /// Get the cached result of an analysis pass for a Function.
-  ///
-  /// This method never runs the analysis.
-  ///
-  /// \returns null if there is no cached result.
-  template <typename PassT>
-  typename PassT::Result *getCachedResult(Function &F) {
-    return FAM->getCachedResult<PassT>(F);
+  MachinePassModel &operator=(MachinePassModel RHS) {
+    swap(*this, RHS);
+    return *this;
   }
 
-  /// Get the result of an analysis pass for a Module.
-  ///
-  /// Runs the analysis if a cached result is not available.
-  template <typename PassT> typename PassT::Result &getResult(Module &M) {
-    return MAM->getResult<PassT>(M);
+  PreservedAnalyses run(MachineFunction &IR,
+                        MachineFunctionAnalysisManager &AM) override {
+    return Pass.run(IR, AM);
   }
 
-  /// Get the cached result of an analysis pass for a Module.
-  ///
-  /// This method never runs the analysis.
-  ///
-  /// \returns null if there is no cached result.
-  template <typename PassT> typename PassT::Result *getCachedResult(Module &M) {
-    return MAM->getCachedResult<PassT>(M);
+  void printPipeline(
+      raw_ostream &OS,
+      function_ref<StringRef(StringRef)> MapClassName2PassName) override {
+    Pass.printPipeline(OS, MapClassName2PassName);
   }
 
-  /// Get the result of an analysis pass for a MachineFunction.
-  ///
-  /// Runs the analysis if a cached result is not available.
-  using Base::getResult;
+  StringRef name() const override { return PassT::name(); }
 
-  /// Get the cached result of an analysis pass for a MachineFunction.
-  ///
-  /// This method never runs the analysis.
-  ///
-  /// returns null if there is no cached result.
-  using Base::getCachedResult;
-
-  // FIXME: Add LoopAnalysisManager or CGSCCAnalysisManager if needed.
-  FunctionAnalysisManager *FAM;
-  ModuleAnalysisManager *MAM;
-};
+  template <typename T>
+  using has_required_t = decltype(std::declval<T &>().isRequired());
+  template <typename T>
+  static std::enable_if_t<is_detected<has_required_t, T>::value, bool>
+  passIsRequiredImpl() {
+    return T::isRequired();
+  }
+  template <typename T>
+  static std::enable_if_t<!is_detected<has_required_t, T>::value, bool>
+  passIsRequiredImpl() {
+    return false;
+  }
+  bool isRequired() const override { return passIsRequiredImpl<PassT>(); }
+
+  template <typename T>
+  using has_get_required_properties_t =
+      decltype(std::declval<T &>().getRequiredProperties());
+  template <typename T>
+  static std::enable_if_t<is_detected<has_get_required_properties_t, T>::value,
+                          MachineFunctionProperties>
+  getRequiredPropertiesImpl() {
+    return PassT::getRequiredProperties();
+  }
+  template <typename T>
+  static std::enable_if_t<!is_detected<has_get_required_properties_t, T>::value,
+                          MachineFunctionProperties>
+  getRequiredPropertiesImpl() {
+    return MachineFunctionProperties();
+  }
+  MachineFunctionProperties getRequiredProperties() const override {
+    return getRequiredPropertiesImpl<PassT>();
+  }
 
-extern template class PassManager<MachineFunction>;
+  template <typename T>
+  using has_get_set_properties_t =
+      decltype(std::declval<T &>().getSetProperties());
+  template <typename T>
+  static std::enable_if_t<is_detected<has_get_set_properties_t, T>::value,
+                          MachineFunctionProperties>
+  getSetPropertiesImpl() {
+    return PassT::getSetProperties();
+  }
+  template <typename T>
+  static std::enable_if_t<!is_detected<has_get_set_properties_t, T>::value,
+                          MachineFunctionProperties>
+  getSetPropertiesImpl() {
+    return MachineFunctionProperties();
+  }
+  MachineFunctionProperties getSetProperties() const override {
+    return getSetPropertiesImpl<PassT>();
+  }
 
-/// MachineFunctionPassManager adds/removes below features to/from the base
-/// PassManager template instantiation.
-///
-/// - Support passes that implement doInitialization/doFinalization. This is for
-///   machine function passes to work on module level constructs. One such pass
-///   is AsmPrinter.
-///
-/// - Support machine module pass which runs over the module (for example,
-///   MachineOutliner). A machine module pass needs to define the method:
-///
-///   ```Error run(Module &, MachineFunctionAnalysisManager &)```
-///
-///   FIXME: machine module passes still need to define the usual machine
-///          function pass interface, namely,
-///          `PreservedAnalyses run(MachineFunction &,
-///                                 MachineFunctionAnalysisManager &)`
-///          But this interface wouldn't be executed. It is just a placeholder
-///          to satisfy the pass manager type-erased inteface. This
-///          special-casing of machine module pass is due to its limited use
-///          cases and the unnecessary complexity it may bring to the machine
-///          pass manager.
-///
-/// - The base class `run` method is replaced by an alternative `run` method.
-///   See details below.
-///
-/// - Support codegening in the SCC order. Users include interprocedural
-///   register allocation (IPRA).
-class MachineFunctionPassManager
-    : public PassManager<MachineFunction, MachineFunctionAnalysisManager> {
-  using Base = PassManager<MachineFunction, MachineFunctionAnalysisManager>;
+  template <typename T>
+  using has_get_cleared_properties_t =
+      decltype(std::declval<T &>().getClearedProperties());
+  template <typename T>
+  static std::enable_if_t<is_detected<has_get_cleared_properties_t, T>::value,
+                          MachineFunctionProperties>
+  getClearedPropertiesImpl() {
+    return PassT::getClearedProperties();
+  }
+  template <typename T>
+  static std::enable_if_t<!is_detected<has_get_cleared_properties_t, T>::value,
+                          MachineFunctionProperties>
+  getClearedPropertiesImpl() {
+    return MachineFunctionProperties();
+  }
+  MachineFunctionProperties getClearedProperties() const override {
+    return getClearedPropertiesImpl<PassT>();
+  }
 
+  PassT Pass;
+};
+} // namespace detail
+
+using MachineFunctionAnalysisManagerModuleProxy =
+    InnerAnalysisManagerProxy<MachineFunctionAnalysisManager, Module>;
+
+template <>
+bool MachineFunctionAnalysisManagerModuleProxy::Result::invalidate(
+    Module &M, const PreservedAnalyses &PA,
+    ModuleAnalysisManager::Invalidator &Inv);
+extern template class InnerAnalysisManagerProxy<MachineFunctionAnalysisManager,
+                                                Module>;
+
+extern template class OuterAnalysisManagerProxy<ModuleAnalysisManager,
+                                                MachineFunction>;
+/// Provide the \c ModuleAnalysisManager to \c Function proxy.
+using ModuleAnalysisManagerMachineFunctionProxy =
+    OuterAnalysisManagerProxy<ModuleAnalysisManager, MachineFunction>;
+
+class FunctionAnalysisManagerMachineFunctionProxy
+    : public AnalysisInfoMixin<FunctionAnalysisManagerMachineFunctionProxy> {
 public:
-  MachineFunctionPassManager(bool RequireCodeGenSCCOrder = false,
-                             bool VerifyMachineFunction = false)
-      : RequireCodeGenSCCOrder(RequireCodeGenSCCOrder),
-        VerifyMachineFunction(VerifyMachineFunction) {}
-  MachineFunctionPassManager(MachineFunctionPassManager &&) = default;
-  MachineFunctionPassManager &
-  operator=(MachineFunctionPassManager &&) = default;
-
-  /// Run machine passes for a Module.
+  class Result {
+  public:
+    explicit Result(FunctionAnalysisManager &FAM) : FAM(&FAM) {}
+
+    Result(Result &&Arg) : FAM(std::move(Arg.FAM)) {
+      // We have to null out the analysis manager in the moved-from state
+      // because we are taking ownership of the responsibilty to clear the
+      // analysis state.
+      Arg.FAM = nullptr;
+    }
+
+    ~Result() {
+      // FAM is cleared in a moved from state where there is nothing to do.
+      if (!FAM)
+        return;
+
+      // Clear out the analysis manager if we're being destroyed -- it means we
+      // didn't even see an invalidate call when we got invalidated.
+      FAM->clear();
+    }
+
+    Result &operator=(Result &&RHS) {
+      FAM = RHS.FAM;
+      // We have to null out the analysis manager in the moved-from state
+      // because we are taking ownership of the responsibilty to clear the
+      // analysis state.
+      RHS.FAM = nullptr;
+      return *this;
+    }
+
+    /// Accessor for the analysis manager.
+    FunctionAnalysisManager &getManager() { return *FAM; }
+
+    /// Handler for invalidation of the outer IR unit, \c IRUnitT.
+    ///
+    /// If the proxy analysis itself is not preserved, we assume that the set of
+    /// inner IR objects contained in IRUnit may have changed.  In this case,
+    /// we have to call \c clear() on the inner analysis manager, as it may now
+    /// have stale pointers to its inner IR objects.
+    ///
+    /// Regardless of whether the proxy analysis is marked as preserved, all of
+    /// the analyses in the inner analysis manager are potentially invalidated
+    /// based on the set of preserved analyses.
+    bool invalidate(MachineFunction &IR, const PreservedAnalyses &PA,
+                    MachineFunctionAnalysisManager::Invalidator &Inv);
+
+  private:
+    FunctionAnalysisManager *FAM;
+  };
+
+  explicit FunctionAnalysisManagerMachineFunctionProxy(
+      FunctionAnalysisManager &FAM)
+      : FAM(&FAM) {}
+
+  /// Run the analysis pass and create our proxy result object.
   ///
-  /// The intended use is to start the codegen pipeline for a Module. The base
-  /// class's `run` method is deliberately hidden by this due to the observation
-  /// that we don't yet have the use cases of compositing two instances of
-  /// machine pass managers, or compositing machine pass managers with other
-  /// types of pass managers.
-  Error run(Module &M, MachineFunctionAnalysisManager &MFAM);
-
-  template <typename PassT> void addPass(PassT &&Pass) {
-    Base::addPass(std::forward<PassT>(Pass));
-    PassConceptT *P = Passes.back().get();
-    addDoInitialization<PassT>(P);
-    addDoFinalization<PassT>(P);
-
-    // Add machine module pass.
-    addRunOnModule<PassT>(P);
+  /// This doesn't do any interesting work; it is primarily used to insert our
+  /// proxy result object into the outer analysis cache so that we can proxy
+  /// invalidation to the inner analysis manager.
+  Result run(MachineFunction &, MachineFunctionAnalysisManager &) {
+    return Result(*FAM);
   }
 
-private:
-  template <typename PassT>
-  using has_init_t = decltype(std::declval<PassT &>().doInitialization(
-      std::declval<Module &>(),
-      std::declval<MachineFunctionAnalysisManager &>()));
-
-  template <typename PassT>
-  std::enable_if_t<!is_detected<has_init_t, PassT>::value>
-  addDoInitialization(PassConceptT *Pass) {}
-
-  template <typename PassT>
-  std::enable_if_t<is_detected<has_init_t, PassT>::value>
-  addDoInitialization(PassConceptT *Pass) {
-    using PassModelT = detail::PassModel<MachineFunction, PassT,
-                                         MachineFunctionAnalysisManager>;
-    auto *P = static_cast<PassModelT *>(Pass);
-    InitializationFuncs.emplace_back(
-        [=](Module &M, MachineFunctionAnalysisManager &MFAM) {
-          return P->Pass.doInitialization(M, MFAM);
-        });
-  }
+  static AnalysisKey Key;
 
-  template <typename PassT>
-  using has_fini_t = decltype(std::declval<PassT &>().doFinalization(
-      std::declval<Module &>(),
-      std::declval<MachineFunctionAnalysisManager &>()));
-
-  template <typename PassT>
-  std::enable_if_t<!is_detected<has_fini_t, PassT>::value>
-  addDoFinalization(PassConceptT *Pass) {}
-
-  template <typename PassT>
-  std::enable_if_t<is_detected<has_fini_t, PassT>::value>
-  addDoFinalization(PassConceptT *Pass) {
-    using PassModelT = detail::PassModel<MachineFunction, PassT,
-                                         MachineFunctionAnalysisManager>;
-    auto *P = static_cast<PassModelT *>(Pass);
-    FinalizationFuncs.emplace_back(
-        [=](Module &M, MachineFunctionAnalysisManager &MFAM) {
-          return P->Pass.doFinalization(M, MFAM);
-        });
-  }
+private:
+  FunctionAnalysisManager *FAM;
+};
 
-  template <typename PassT>
-  using is_machine_module_pass_t = decltype(std::declval<PassT &>().run(
-      std::declval<Module &>(),
-      std::declval<MachineFunctionAnalysisManager &>()));
-
-  template <typename PassT>
-  using is_machine_function_pass_t = decltype(std::declval<PassT &>().run(
-      std::declval<MachineFunction &>(),
-      std::declval<MachineFunctionAnalysisManager &>()));
-
-  template <typename PassT>
-  std::enable_if_t<!is_detected<is_machine_module_pass_t, PassT>::value>
-  addRunOnModule(PassConceptT *Pass) {}
-
-  template <typename PassT>
-  std::enable_if_t<is_detected<is_machine_module_pass_t, PassT>::value>
-  addRunOnModule(PassConceptT *Pass) {
-    static_assert(is_detected<is_machine_function_pass_t, PassT>::value,
-                  "machine module pass needs to define machine function pass "
-                  "api. sorry.");
-
-    using PassModelT = detail::PassModel<MachineFunction, PassT,
-                                         MachineFunctionAnalysisManager>;
-    auto *P = static_cast<PassModelT *>(Pass);
-    MachineModulePasses.emplace(
-        Passes.size() - 1,
-        [=](Module &M, MachineFunctionAnalysisManager &MFAM) {
-          return P->Pass.run(M, MFAM);
-        });
-  }
+class ModuleToMachineFunctionPassAdaptor
+    : public PassInfoMixin<ModuleToMachineFunctionPassAdaptor> {
+  using MachinePassConcept = detail::MachinePassConcept;
 
-  using FuncTy = Error(Module &, MachineFunctionAnalysisManager &);
-  SmallVector<llvm::unique_function<FuncTy>, 4> InitializationFuncs;
-  SmallVector<llvm::unique_function<FuncTy>, 4> FinalizationFuncs;
+public:
+  explicit ModuleToMachineFunctionPassAdaptor(
+      std::unique_ptr<MachinePassConcept> Pass)
+      : Pass(std::move(Pass)) {}
 
-  using PassIndex = decltype(Passes)::size_type;
-  std::map<PassIndex, llvm::unique_function<FuncTy>> MachineModulePasses;
+  /// Runs the function pass across every function in the module.
+  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+  void printPipeline(raw_ostream &OS,
+                     function_ref<StringRef(StringRef)> MapClassName2PassName);
 
-  // Run codegen in the SCC order.
-  bool RequireCodeGenSCCOrder;
+  static bool isRequired() { return true; }
 
-  bool VerifyMachineFunction;
+private:
+  std::unique_ptr<MachinePassConcept> Pass;
 };
 
+template <typename MachineFunctionPassT>
+ModuleToMachineFunctionPassAdaptor
+createModuleToMachineFunctionPassAdaptor(MachineFunctionPassT &&Pass) {
+  using PassModelT = detail::MachinePassModel<MachineFunctionPassT>;
+  // Do not use make_unique, it causes too many template instantiations,
+  // causing terrible compile times.
+  return ModuleToMachineFunctionPassAdaptor(
+      std::unique_ptr<detail::MachinePassConcept>(
+          new PassModelT(std::forward<MachineFunctionPassT>(Pass))));
+}
+
+template <>
+PreservedAnalyses
+PassManager<MachineFunction>::run(MachineFunction &,
+                                  AnalysisManager<MachineFunction> &);
+extern template class PassManager<MachineFunction>;
+
+/// Convenience typedef for a pass manager over functions.
+using MachineFunctionPassManager = PassManager<MachineFunction>;
+
 } // end namespace llvm
 
 #endif // LLVM_CODEGEN_MACHINEPASSMANAGER_H
diff --git a/llvm/include/llvm/CodeGen/SDPatternMatch.h b/llvm/include/llvm/CodeGen/SDPatternMatch.h
new file mode 100644
index 0000000..412bf42
--- /dev/null
+++ b/llvm/include/llvm/CodeGen/SDPatternMatch.h
@@ -0,0 +1,694 @@
+//==--------------- llvm/CodeGen/SDPatternMatch.h ---------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// Contains matchers for matching SelectionDAG nodes and values.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_SDPATTERNMATCH_H
+#define LLVM_CODEGEN_SDPATTERNMATCH_H
+
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/TargetLowering.h"
+
+namespace llvm {
+namespace SDPatternMatch {
+
+/// MatchContext can repurpose existing patterns to behave differently under
+/// a certain context. For instance, `m_Opc(ISD::ADD)` matches plain ADD nodes
+/// in normal circumstances, but matches VP_ADD nodes under a custom
+/// VPMatchContext. This design is meant to facilitate code / pattern reusing.
+class BasicMatchContext {
+  const SelectionDAG *DAG;
+  const TargetLowering *TLI;
+
+public:
+  explicit BasicMatchContext(const SelectionDAG *DAG)
+      : DAG(DAG), TLI(DAG ? &DAG->getTargetLoweringInfo() : nullptr) {}
+
+  explicit BasicMatchContext(const TargetLowering *TLI)
+      : DAG(nullptr), TLI(TLI) {}
+
+  // A valid MatchContext has to implement the following functions.
+
+  const SelectionDAG *getDAG() const { return DAG; }
+
+  const TargetLowering *getTLI() const { return TLI; }
+
+  /// Return true if N effectively has opcode Opcode.
+  bool match(SDValue N, unsigned Opcode) const {
+    return N->getOpcode() == Opcode;
+  }
+};
+
+template <typename Pattern, typename MatchContext>
+[[nodiscard]] bool sd_context_match(SDValue N, const MatchContext &Ctx,
+                                    Pattern &&P) {
+  return P.match(Ctx, N);
+}
+
+template <typename Pattern, typename MatchContext>
+[[nodiscard]] bool sd_context_match(SDNode *N, const MatchContext &Ctx,
+                                    Pattern &&P) {
+  return sd_context_match(SDValue(N, 0), Ctx, P);
+}
+
+template <typename Pattern>
+[[nodiscard]] bool sd_match(SDNode *N, const SelectionDAG *DAG, Pattern &&P) {
+  return sd_context_match(N, BasicMatchContext(DAG), P);
+}
+
+template <typename Pattern>
+[[nodiscard]] bool sd_match(SDValue N, const SelectionDAG *DAG, Pattern &&P) {
+  return sd_context_match(N, BasicMatchContext(DAG), P);
+}
+
+template <typename Pattern>
+[[nodiscard]] bool sd_match(SDNode *N, Pattern &&P) {
+  return sd_match(N, nullptr, P);
+}
+
+template <typename Pattern>
+[[nodiscard]] bool sd_match(SDValue N, Pattern &&P) {
+  return sd_match(N, nullptr, P);
+}
+
+// === Utilities ===
+struct Value_match {
+  SDValue MatchVal;
+
+  Value_match() = default;
+
+  explicit Value_match(SDValue Match) : MatchVal(Match) {}
+
+  template <typename MatchContext> bool match(const MatchContext &, SDValue N) {
+    if (MatchVal)
+      return MatchVal == N;
+    return N.getNode();
+  }
+};
+
+/// Match any valid SDValue.
+inline Value_match m_Value() { return Value_match(); }
+
+inline Value_match m_Specific(SDValue N) {
+  assert(N);
+  return Value_match(N);
+}
+
+struct DeferredValue_match {
+  SDValue &MatchVal;
+
+  explicit DeferredValue_match(SDValue &Match) : MatchVal(Match) {}
+
+  template <typename MatchContext> bool match(const MatchContext &, SDValue N) {
+    return N == MatchVal;
+  }
+};
+
+/// Similar to m_Specific, but the specific value to match is determined by
+/// another sub-pattern in the same sd_match() expression. For instance,
+/// We cannot match `(add V, V)` with `m_Add(m_Value(X), m_Specific(X))` since
+/// `X` is not initialized at the time it got copied into `m_Specific`. Instead,
+/// we should use `m_Add(m_Value(X), m_Deferred(X))`.
+inline DeferredValue_match m_Deferred(SDValue &V) {
+  return DeferredValue_match(V);
+}
+
+struct Opcode_match {
+  unsigned Opcode;
+
+  explicit Opcode_match(unsigned Opc) : Opcode(Opc) {}
+
+  template <typename MatchContext>
+  bool match(const MatchContext &Ctx, SDValue N) {
+    return Ctx.match(N, Opcode);
+  }
+};
+
+inline Opcode_match m_Opc(unsigned Opcode) { return Opcode_match(Opcode); }
+
+template <unsigned NumUses, typename Pattern> struct NUses_match {
+  Pattern P;
+
+  explicit NUses_match(const Pattern &P) : P(P) {}
+
+  template <typename MatchContext>
+  bool match(const MatchContext &Ctx, SDValue N) {
+    // SDNode::hasNUsesOfValue is pretty expensive when the SDNode produces
+    // multiple results, hence we check the subsequent pattern here before
+    // checking the number of value users.
+    return P.match(Ctx, N) && N->hasNUsesOfValue(NumUses, N.getResNo());
+  }
+};
+
+template <typename Pattern>
+inline NUses_match<1, Pattern> m_OneUse(const Pattern &P) {
+  return NUses_match<1, Pattern>(P);
+}
+template <unsigned N, typename Pattern>
+inline NUses_match<N, Pattern> m_NUses(const Pattern &P) {
+  return NUses_match<N, Pattern>(P);
+}
+
+inline NUses_match<1, Value_match> m_OneUse() {
+  return NUses_match<1, Value_match>(m_Value());
+}
+template <unsigned N> inline NUses_match<N, Value_match> m_NUses() {
+  return NUses_match<N, Value_match>(m_Value());
+}
+
+struct Value_bind {
+  SDValue &BindVal;
+
+  explicit Value_bind(SDValue &N) : BindVal(N) {}
+
+  template <typename MatchContext> bool match(const MatchContext &, SDValue N) {
+    BindVal = N;
+    return true;
+  }
+};
+
+inline Value_bind m_Value(SDValue &N) { return Value_bind(N); }
+
+template <typename Pattern, typename PredFuncT> struct TLI_pred_match {
+  Pattern P;
+  PredFuncT PredFunc;
+
+  TLI_pred_match(const PredFuncT &Pred, const Pattern &P)
+      : P(P), PredFunc(Pred) {}
+
+  template <typename MatchContext>
+  bool match(const MatchContext &Ctx, SDValue N) {
+    assert(Ctx.getTLI() && "TargetLowering is required for this pattern.");
+    return PredFunc(*Ctx.getTLI(), N) && P.match(Ctx, N);
+  }
+};
+
+// Explicit deduction guide.
+template <typename PredFuncT, typename Pattern>
+TLI_pred_match(const PredFuncT &Pred, const Pattern &P)
+    -> TLI_pred_match<Pattern, PredFuncT>;
+
+/// Match legal SDNodes based on the information provided by TargetLowering.
+template <typename Pattern> inline auto m_LegalOp(const Pattern &P) {
+  return TLI_pred_match{[](const TargetLowering &TLI, SDValue N) {
+                          return TLI.isOperationLegal(N->getOpcode(),
+                                                      N.getValueType());
+                        },
+                        P};
+}
+
+/// Switch to a different MatchContext for subsequent patterns.
+template <typename NewMatchContext, typename Pattern> struct SwitchContext {
+  const NewMatchContext &Ctx;
+  Pattern P;
+
+  template <typename OrigMatchContext>
+  bool match(const OrigMatchContext &, SDValue N) {
+    return P.match(Ctx, N);
+  }
+};
+
+template <typename MatchContext, typename Pattern>
+inline SwitchContext<MatchContext, Pattern> m_Context(const MatchContext &Ctx,
+                                                      Pattern &&P) {
+  return SwitchContext<MatchContext, Pattern>{Ctx, std::move(P)};
+}
+
+// === Value type ===
+struct ValueType_bind {
+  EVT &BindVT;
+
+  explicit ValueType_bind(EVT &Bind) : BindVT(Bind) {}
+
+  template <typename MatchContext> bool match(const MatchContext &, SDValue N) {
+    BindVT = N.getValueType();
+    return true;
+  }
+};
+
+/// Retreive the ValueType of the current SDValue.
+inline ValueType_bind m_VT(EVT &VT) { return ValueType_bind(VT); }
+
+template <typename Pattern, typename PredFuncT> struct ValueType_match {
+  PredFuncT PredFunc;
+  Pattern P;
+
+  ValueType_match(const PredFuncT &Pred, const Pattern &P)
+      : PredFunc(Pred), P(P) {}
+
+  template <typename MatchContext>
+  bool match(const MatchContext &Ctx, SDValue N) {
+    return PredFunc(N.getValueType()) && P.match(Ctx, N);
+  }
+};
+
+// Explicit deduction guide.
+template <typename PredFuncT, typename Pattern>
+ValueType_match(const PredFuncT &Pred, const Pattern &P)
+    -> ValueType_match<Pattern, PredFuncT>;
+
+/// Match a specific ValueType.
+template <typename Pattern>
+inline auto m_SpecificVT(EVT RefVT, const Pattern &P) {
+  return ValueType_match{[=](EVT VT) { return VT == RefVT; }, P};
+}
+inline auto m_SpecificVT(EVT RefVT) {
+  return ValueType_match{[=](EVT VT) { return VT == RefVT; }, m_Value()};
+}
+
+inline auto m_Glue() { return m_SpecificVT(MVT::Glue); }
+inline auto m_OtherVT() { return m_SpecificVT(MVT::Other); }
+
+/// Match any integer ValueTypes.
+template <typename Pattern> inline auto m_IntegerVT(const Pattern &P) {
+  return ValueType_match{[](EVT VT) { return VT.isInteger(); }, P};
+}
+inline auto m_IntegerVT() {
+  return ValueType_match{[](EVT VT) { return VT.isInteger(); }, m_Value()};
+}
+
+/// Match any floating point ValueTypes.
+template <typename Pattern> inline auto m_FloatingPointVT(const Pattern &P) {
+  return ValueType_match{[](EVT VT) { return VT.isFloatingPoint(); }, P};
+}
+inline auto m_FloatingPointVT() {
+  return ValueType_match{[](EVT VT) { return VT.isFloatingPoint(); },
+                         m_Value()};
+}
+
+/// Match any vector ValueTypes.
+template <typename Pattern> inline auto m_VectorVT(const Pattern &P) {
+  return ValueType_match{[](EVT VT) { return VT.isVector(); }, P};
+}
+inline auto m_VectorVT() {
+  return ValueType_match{[](EVT VT) { return VT.isVector(); }, m_Value()};
+}
+
+/// Match fixed-length vector ValueTypes.
+template <typename Pattern> inline auto m_FixedVectorVT(const Pattern &P) {
+  return ValueType_match{[](EVT VT) { return VT.isFixedLengthVector(); }, P};
+}
+inline auto m_FixedVectorVT() {
+  return ValueType_match{[](EVT VT) { return VT.isFixedLengthVector(); },
+                         m_Value()};
+}
+
+/// Match scalable vector ValueTypes.
+template <typename Pattern> inline auto m_ScalableVectorVT(const Pattern &P) {
+  return ValueType_match{[](EVT VT) { return VT.isScalableVector(); }, P};
+}
+inline auto m_ScalableVectorVT() {
+  return ValueType_match{[](EVT VT) { return VT.isScalableVector(); },
+                         m_Value()};
+}
+
+/// Match legal ValueTypes based on the information provided by TargetLowering.
+template <typename Pattern> inline auto m_LegalType(const Pattern &P) {
+  return TLI_pred_match{[](const TargetLowering &TLI, SDValue N) {
+                          return TLI.isTypeLegal(N.getValueType());
+                        },
+                        P};
+}
+
+// === Patterns combinators ===
+template <typename... Preds> struct And {
+  template <typename MatchContext> bool match(const MatchContext &, SDValue N) {
+    return true;
+  }
+};
+
+template <typename Pred, typename... Preds>
+struct And<Pred, Preds...> : And<Preds...> {
+  Pred P;
+  And(Pred &&p, Preds &&...preds)
+      : And<Preds...>(std::forward<Preds>(preds)...), P(std::forward<Pred>(p)) {
+  }
+
+  template <typename MatchContext>
+  bool match(const MatchContext &Ctx, SDValue N) {
+    return P.match(Ctx, N) && And<Preds...>::match(Ctx, N);
+  }
+};
+
+template <typename... Preds> struct Or {
+  template <typename MatchContext> bool match(const MatchContext &, SDValue N) {
+    return false;
+  }
+};
+
+template <typename Pred, typename... Preds>
+struct Or<Pred, Preds...> : Or<Preds...> {
+  Pred P;
+  Or(Pred &&p, Preds &&...preds)
+      : Or<Preds...>(std::forward<Preds>(preds)...), P(std::forward<Pred>(p)) {}
+
+  template <typename MatchContext>
+  bool match(const MatchContext &Ctx, SDValue N) {
+    return P.match(Ctx, N) || Or<Preds...>::match(Ctx, N);
+  }
+};
+
+template <typename... Preds> And<Preds...> m_AllOf(Preds &&...preds) {
+  return And<Preds...>(std::forward<Preds>(preds)...);
+}
+
+template <typename... Preds> Or<Preds...> m_AnyOf(Preds &&...preds) {
+  return Or<Preds...>(std::forward<Preds>(preds)...);
+}
+
+// === Generic node matching ===
+template <unsigned OpIdx, typename... OpndPreds> struct Operands_match {
+  template <typename MatchContext>
+  bool match(const MatchContext &Ctx, SDValue N) {
+    // Returns false if there are more operands than predicates;
+    return N->getNumOperands() == OpIdx;
+  }
+};
+
+template <unsigned OpIdx, typename OpndPred, typename... OpndPreds>
+struct Operands_match<OpIdx, OpndPred, OpndPreds...>
+    : Operands_match<OpIdx + 1, OpndPreds...> {
+  OpndPred P;
+
+  Operands_match(OpndPred &&p, OpndPreds &&...preds)
+      : Operands_match<OpIdx + 1, OpndPreds...>(
+            std::forward<OpndPreds>(preds)...),
+        P(std::forward<OpndPred>(p)) {}
+
+  template <typename MatchContext>
+  bool match(const MatchContext &Ctx, SDValue N) {
+    if (OpIdx < N->getNumOperands())
+      return P.match(Ctx, N->getOperand(OpIdx)) &&
+             Operands_match<OpIdx + 1, OpndPreds...>::match(Ctx, N);
+
+    // This is the case where there are more predicates than operands.
+    return false;
+  }
+};
+
+template <typename... OpndPreds>
+auto m_Node(unsigned Opcode, OpndPreds &&...preds) {
+  return m_AllOf(m_Opc(Opcode), Operands_match<0, OpndPreds...>(
+                                    std::forward<OpndPreds>(preds)...));
+}
+
+/// Provide number of operands that are not chain or glue, as well as the first
+/// index of such operand.
+template <bool ExcludeChain> struct EffectiveOperands {
+  unsigned Size = 0;
+  unsigned FirstIndex = 0;
+
+  explicit EffectiveOperands(SDValue N) {
+    const unsigned TotalNumOps = N->getNumOperands();
+    FirstIndex = TotalNumOps;
+    for (unsigned I = 0; I < TotalNumOps; ++I) {
+      // Count the number of non-chain and non-glue nodes (we ignore chain
+      // and glue by default) and retreive the operand index offset.
+      EVT VT = N->getOperand(I).getValueType();
+      if (VT != MVT::Glue && VT != MVT::Other) {
+        ++Size;
+        if (FirstIndex == TotalNumOps)
+          FirstIndex = I;
+      }
+    }
+  }
+};
+
+template <> struct EffectiveOperands<false> {
+  unsigned Size = 0;
+  unsigned FirstIndex = 0;
+
+  explicit EffectiveOperands(SDValue N) : Size(N->getNumOperands()) {}
+};
+
+// === Binary operations ===
+template <typename LHS_P, typename RHS_P, bool Commutable = false,
+          bool ExcludeChain = false>
+struct BinaryOpc_match {
+  unsigned Opcode;
+  LHS_P LHS;
+  RHS_P RHS;
+
+  BinaryOpc_match(unsigned Opc, const LHS_P &L, const RHS_P &R)
+      : Opcode(Opc), LHS(L), RHS(R) {}
+
+  template <typename MatchContext>
+  bool match(const MatchContext &Ctx, SDValue N) {
+    if (sd_context_match(N, Ctx, m_Opc(Opcode))) {
+      EffectiveOperands<ExcludeChain> EO(N);
+      assert(EO.Size == 2);
+      return (LHS.match(Ctx, N->getOperand(EO.FirstIndex)) &&
+              RHS.match(Ctx, N->getOperand(EO.FirstIndex + 1))) ||
+             (Commutable && LHS.match(Ctx, N->getOperand(EO.FirstIndex + 1)) &&
+              RHS.match(Ctx, N->getOperand(EO.FirstIndex)));
+    }
+
+    return false;
+  }
+};
+
+template <typename LHS, typename RHS>
+inline BinaryOpc_match<LHS, RHS, false> m_BinOp(unsigned Opc, const LHS &L,
+                                                const RHS &R) {
+  return BinaryOpc_match<LHS, RHS, false>(Opc, L, R);
+}
+template <typename LHS, typename RHS>
+inline BinaryOpc_match<LHS, RHS, true> m_c_BinOp(unsigned Opc, const LHS &L,
+                                                 const RHS &R) {
+  return BinaryOpc_match<LHS, RHS, true>(Opc, L, R);
+}
+
+template <typename LHS, typename RHS>
+inline BinaryOpc_match<LHS, RHS, false, true>
+m_ChainedBinOp(unsigned Opc, const LHS &L, const RHS &R) {
+  return BinaryOpc_match<LHS, RHS, false, true>(Opc, L, R);
+}
+template <typename LHS, typename RHS>
+inline BinaryOpc_match<LHS, RHS, true, true>
+m_c_ChainedBinOp(unsigned Opc, const LHS &L, const RHS &R) {
+  return BinaryOpc_match<LHS, RHS, true, true>(Opc, L, R);
+}
+
+// Common binary operations
+template <typename LHS, typename RHS>
+inline BinaryOpc_match<LHS, RHS, true> m_Add(const LHS &L, const RHS &R) {
+  return BinaryOpc_match<LHS, RHS, true>(ISD::ADD, L, R);
+}
+
+template <typename LHS, typename RHS>
+inline BinaryOpc_match<LHS, RHS, false> m_Sub(const LHS &L, const RHS &R) {
+  return BinaryOpc_match<LHS, RHS, false>(ISD::SUB, L, R);
+}
+
+template <typename LHS, typename RHS>
+inline BinaryOpc_match<LHS, RHS, true> m_Mul(const LHS &L, const RHS &R) {
+  return BinaryOpc_match<LHS, RHS, true>(ISD::MUL, L, R);
+}
+
+template <typename LHS, typename RHS>
+inline BinaryOpc_match<LHS, RHS, false> m_UDiv(const LHS &L, const RHS &R) {
+  return BinaryOpc_match<LHS, RHS, false>(ISD::UDIV, L, R);
+}
+template <typename LHS, typename RHS>
+inline BinaryOpc_match<LHS, RHS, false> m_SDiv(const LHS &L, const RHS &R) {
+  return BinaryOpc_match<LHS, RHS, false>(ISD::SDIV, L, R);
+}
+
+template <typename LHS, typename RHS>
+inline BinaryOpc_match<LHS, RHS, false> m_URem(const LHS &L, const RHS &R) {
+  return BinaryOpc_match<LHS, RHS, false>(ISD::UREM, L, R);
+}
+template <typename LHS, typename RHS>
+inline BinaryOpc_match<LHS, RHS, false> m_SRem(const LHS &L, const RHS &R) {
+  return BinaryOpc_match<LHS, RHS, false>(ISD::SREM, L, R);
+}
+
+template <typename LHS, typename RHS>
+inline BinaryOpc_match<LHS, RHS, false> m_Shl(const LHS &L, const RHS &R) {
+  return BinaryOpc_match<LHS, RHS, false>(ISD::SHL, L, R);
+}
+
+template <typename LHS, typename RHS>
+inline BinaryOpc_match<LHS, RHS, false> m_Sra(const LHS &L, const RHS &R) {
+  return BinaryOpc_match<LHS, RHS, false>(ISD::SRA, L, R);
+}
+template <typename LHS, typename RHS>
+inline BinaryOpc_match<LHS, RHS, false> m_Srl(const LHS &L, const RHS &R) {
+  return BinaryOpc_match<LHS, RHS, false>(ISD::SRL, L, R);
+}
+
+template <typename LHS, typename RHS>
+inline BinaryOpc_match<LHS, RHS, true> m_FAdd(const LHS &L, const RHS &R) {
+  return BinaryOpc_match<LHS, RHS, true>(ISD::FADD, L, R);
+}
+
+template <typename LHS, typename RHS>
+inline BinaryOpc_match<LHS, RHS, false> m_FSub(const LHS &L, const RHS &R) {
+  return BinaryOpc_match<LHS, RHS, false>(ISD::FSUB, L, R);
+}
+
+template <typename LHS, typename RHS>
+inline BinaryOpc_match<LHS, RHS, true> m_FMul(const LHS &L, const RHS &R) {
+  return BinaryOpc_match<LHS, RHS, true>(ISD::FMUL, L, R);
+}
+
+template <typename LHS, typename RHS>
+inline BinaryOpc_match<LHS, RHS, false> m_FDiv(const LHS &L, const RHS &R) {
+  return BinaryOpc_match<LHS, RHS, false>(ISD::FDIV, L, R);
+}
+
+template <typename LHS, typename RHS>
+inline BinaryOpc_match<LHS, RHS, false> m_FRem(const LHS &L, const RHS &R) {
+  return BinaryOpc_match<LHS, RHS, false>(ISD::FREM, L, R);
+}
+
+// === Unary operations ===
+template <typename Opnd_P, bool ExcludeChain = false> struct UnaryOpc_match {
+  unsigned Opcode;
+  Opnd_P Opnd;
+
+  UnaryOpc_match(unsigned Opc, const Opnd_P &Op) : Opcode(Opc), Opnd(Op) {}
+
+  template <typename MatchContext>
+  bool match(const MatchContext &Ctx, SDValue N) {
+    if (sd_context_match(N, Ctx, m_Opc(Opcode))) {
+      EffectiveOperands<ExcludeChain> EO(N);
+      assert(EO.Size == 1);
+      return Opnd.match(Ctx, N->getOperand(EO.FirstIndex));
+    }
+
+    return false;
+  }
+};
+
+template <typename Opnd>
+inline UnaryOpc_match<Opnd> m_UnaryOp(unsigned Opc, const Opnd &Op) {
+  return UnaryOpc_match<Opnd>(Opc, Op);
+}
+template <typename Opnd>
+inline UnaryOpc_match<Opnd, true> m_ChainedUnaryOp(unsigned Opc,
+                                                   const Opnd &Op) {
+  return UnaryOpc_match<Opnd, true>(Opc, Op);
+}
+
+template <typename Opnd> inline UnaryOpc_match<Opnd> m_ZExt(const Opnd &Op) {
+  return UnaryOpc_match<Opnd>(ISD::ZERO_EXTEND, Op);
+}
+
+template <typename Opnd> inline UnaryOpc_match<Opnd> m_SExt(const Opnd &Op) {
+  return UnaryOpc_match<Opnd>(ISD::SIGN_EXTEND, Op);
+}
+
+template <typename Opnd> inline UnaryOpc_match<Opnd> m_AnyExt(const Opnd &Op) {
+  return UnaryOpc_match<Opnd>(ISD::ANY_EXTEND, Op);
+}
+
+template <typename Opnd> inline UnaryOpc_match<Opnd> m_Trunc(const Opnd &Op) {
+  return UnaryOpc_match<Opnd>(ISD::TRUNCATE, Op);
+}
+
+// === Constants ===
+struct ConstantInt_match {
+  APInt *BindVal;
+
+  explicit ConstantInt_match(APInt *V) : BindVal(V) {}
+
+  template <typename MatchContext> bool match(const MatchContext &, SDValue N) {
+    // The logics here are similar to that in
+    // SelectionDAG::isConstantIntBuildVectorOrConstantInt, but the latter also
+    // treats GlobalAddressSDNode as a constant, which is difficult to turn into
+    // APInt.
+    if (auto *C = dyn_cast_or_null<ConstantSDNode>(N.getNode())) {
+      if (BindVal)
+        *BindVal = C->getAPIntValue();
+      return true;
+    }
+
+    APInt Discard;
+    return ISD::isConstantSplatVector(N.getNode(),
+                                      BindVal ? *BindVal : Discard);
+  }
+};
+/// Match any interger constants or splat of an integer constant.
+inline ConstantInt_match m_ConstInt() { return ConstantInt_match(nullptr); }
+/// Match any interger constants or splat of an integer constant; return the
+/// specific constant or constant splat value.
+inline ConstantInt_match m_ConstInt(APInt &V) { return ConstantInt_match(&V); }
+
+struct SpecificInt_match {
+  APInt IntVal;
+
+  explicit SpecificInt_match(APInt APV) : IntVal(std::move(APV)) {}
+
+  template <typename MatchContext>
+  bool match(const MatchContext &Ctx, SDValue N) {
+    APInt ConstInt;
+    if (sd_context_match(N, Ctx, m_ConstInt(ConstInt)))
+      return APInt::isSameValue(IntVal, ConstInt);
+    return false;
+  }
+};
+
+/// Match a specific integer constant or constant splat value.
+inline SpecificInt_match m_SpecificInt(APInt V) {
+  return SpecificInt_match(std::move(V));
+}
+inline SpecificInt_match m_SpecificInt(uint64_t V) {
+  return SpecificInt_match(APInt(64, V));
+}
+
+inline SpecificInt_match m_Zero() { return m_SpecificInt(0U); }
+inline SpecificInt_match m_AllOnes() { return m_SpecificInt(~0U); }
+
+/// Match true boolean value based on the information provided by
+/// TargetLowering.
+inline auto m_True() {
+  return TLI_pred_match{
+      [](const TargetLowering &TLI, SDValue N) {
+        APInt ConstVal;
+        if (sd_match(N, m_ConstInt(ConstVal)))
+          switch (TLI.getBooleanContents(N.getValueType())) {
+          case TargetLowering::ZeroOrOneBooleanContent:
+            return ConstVal.isOne();
+          case TargetLowering::ZeroOrNegativeOneBooleanContent:
+            return ConstVal.isAllOnes();
+          case TargetLowering::UndefinedBooleanContent:
+            return (ConstVal & 0x01) == 1;
+          }
+
+        return false;
+      },
+      m_Value()};
+}
+/// Match false boolean value based on the information provided by
+/// TargetLowering.
+inline auto m_False() {
+  return TLI_pred_match{
+      [](const TargetLowering &TLI, SDValue N) {
+        APInt ConstVal;
+        if (sd_match(N, m_ConstInt(ConstVal)))
+          switch (TLI.getBooleanContents(N.getValueType())) {
+          case TargetLowering::ZeroOrOneBooleanContent:
+          case TargetLowering::ZeroOrNegativeOneBooleanContent:
+            return ConstVal.isZero();
+          case TargetLowering::UndefinedBooleanContent:
+            return (ConstVal & 0x01) == 0;
+          }
+
+        return false;
+      },
+      m_Value()};
+}
+} // namespace SDPatternMatch
+} // namespace llvm
+#endif
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index 7bb12d8..2fc1cea 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1479,46 +1479,11 @@ public:
   SDValue getStridedLoadVP(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType,
                            EVT VT, const SDLoc &DL, SDValue Chain, SDValue Ptr,
                            SDValue Offset, SDValue Stride, SDValue Mask,
-                           SDValue EVL, MachinePointerInfo PtrInfo, EVT MemVT,
-                           Align Alignment, MachineMemOperand::Flags MMOFlags,
-                           const AAMDNodes &AAInfo,
-                           const MDNode *Ranges = nullptr,
-                           bool IsExpanding = false);
-  inline SDValue getStridedLoadVP(
-      ISD::MemIndexedMode AM, ISD::LoadExtType ExtType, EVT VT, const SDLoc &DL,
-      SDValue Chain, SDValue Ptr, SDValue Offset, SDValue Stride, SDValue Mask,
-      SDValue EVL, MachinePointerInfo PtrInfo, EVT MemVT,
-      MaybeAlign Alignment = MaybeAlign(),
-      MachineMemOperand::Flags MMOFlags = MachineMemOperand::MONone,
-      const AAMDNodes &AAInfo = AAMDNodes(), const MDNode *Ranges = nullptr,
-      bool IsExpanding = false) {
-    // Ensures that codegen never sees a None Alignment.
-    return getStridedLoadVP(AM, ExtType, VT, DL, Chain, Ptr, Offset, Stride,
-                            Mask, EVL, PtrInfo, MemVT,
-                            Alignment.value_or(getEVTAlign(MemVT)), MMOFlags,
-                            AAInfo, Ranges, IsExpanding);
-  }
-  SDValue getStridedLoadVP(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType,
-                           EVT VT, const SDLoc &DL, SDValue Chain, SDValue Ptr,
-                           SDValue Offset, SDValue Stride, SDValue Mask,
                            SDValue EVL, EVT MemVT, MachineMemOperand *MMO,
                            bool IsExpanding = false);
   SDValue getStridedLoadVP(EVT VT, const SDLoc &DL, SDValue Chain, SDValue Ptr,
                            SDValue Stride, SDValue Mask, SDValue EVL,
-                           MachinePointerInfo PtrInfo, MaybeAlign Alignment,
-                           MachineMemOperand::Flags MMOFlags,
-                           const AAMDNodes &AAInfo,
-                           const MDNode *Ranges = nullptr,
-                           bool IsExpanding = false);
-  SDValue getStridedLoadVP(EVT VT, const SDLoc &DL, SDValue Chain, SDValue Ptr,
-                           SDValue Stride, SDValue Mask, SDValue EVL,
                            MachineMemOperand *MMO, bool IsExpanding = false);
-  SDValue
-  getExtStridedLoadVP(ISD::LoadExtType ExtType, const SDLoc &DL, EVT VT,
-                      SDValue Chain, SDValue Ptr, SDValue Stride, SDValue Mask,
-                      SDValue EVL, MachinePointerInfo PtrInfo, EVT MemVT,
-                      MaybeAlign Alignment, MachineMemOperand::Flags MMOFlags,
-                      const AAMDNodes &AAInfo, bool IsExpanding = false);
   SDValue getExtStridedLoadVP(ISD::LoadExtType ExtType, const SDLoc &DL, EVT VT,
                               SDValue Chain, SDValue Ptr, SDValue Stride,
                               SDValue Mask, SDValue EVL, EVT MemVT,
@@ -1534,13 +1499,6 @@ public:
                             bool IsCompressing = false);
   SDValue getTruncStridedStoreVP(SDValue Chain, const SDLoc &DL, SDValue Val,
                                  SDValue Ptr, SDValue Stride, SDValue Mask,
-                                 SDValue EVL, MachinePointerInfo PtrInfo,
-                                 EVT SVT, Align Alignment,
-                                 MachineMemOperand::Flags MMOFlags,
-                                 const AAMDNodes &AAInfo,
-                                 bool IsCompressing = false);
-  SDValue getTruncStridedStoreVP(SDValue Chain, const SDLoc &DL, SDValue Val,
-                                 SDValue Ptr, SDValue Stride, SDValue Mask,
                                  SDValue EVL, EVT SVT, MachineMemOperand *MMO,
                                  bool IsCompressing = false);
   SDValue getIndexedStridedStoreVP(SDValue OrigStore, const SDLoc &DL,
diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h
index a26c44b..d368c7e 100644
--- a/llvm/include/llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h
+++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h
@@ -562,6 +562,17 @@ public:
     uint64_t getEntryOffset() const { return EntryOffset; }
   };
 
+  /// Offsets for the start of various important tables from the start of the
+  /// section.
+  struct DWARFDebugNamesOffsets {
+    uint64_t CUsBase;
+    uint64_t BucketsBase;
+    uint64_t HashesBase;
+    uint64_t StringOffsetsBase;
+    uint64_t EntryOffsetsBase;
+    uint64_t EntriesBase;
+  };
+
   /// Represents a single accelerator table within the DWARF v5 .debug_names
   /// section.
   class NameIndex {
@@ -572,12 +583,7 @@ public:
     // Base of the whole unit and of various important tables, as offsets from
     // the start of the section.
     uint64_t Base;
-    uint64_t CUsBase;
-    uint64_t BucketsBase;
-    uint64_t HashesBase;
-    uint64_t StringOffsetsBase;
-    uint64_t EntryOffsetsBase;
-    uint64_t EntriesBase;
+    DWARFDebugNamesOffsets Offsets;
 
     void dumpCUs(ScopedPrinter &W) const;
     void dumpLocalTUs(ScopedPrinter &W) const;
@@ -638,7 +644,7 @@ public:
     /// Returns the Entry at the relative `Offset` from the start of the Entry
     /// pool.
     Expected<Entry> getEntryAtRelativeOffset(uint64_t Offset) const {
-      auto OffsetFromSection = Offset + this->EntriesBase;
+      auto OffsetFromSection = Offset + this->Offsets.EntriesBase;
       return getEntry(&OffsetFromSection);
     }
 
@@ -793,6 +799,12 @@ public:
   const NameIndex *getCUNameIndex(uint64_t CUOffset);
 };
 
+/// Calculates the starting offsets for various sections within the
+/// .debug_names section.
+void findDebugNamesOffsets(DWARFDebugNames::DWARFDebugNamesOffsets &Offsets,
+                           uint64_t HdrSize, const dwarf::DwarfFormat Format,
+                           const DWARFDebugNames::Header &Hdr);
+
 /// If `Name` is the name of a templated function that includes template
 /// parameters, returns a substring of `Name` containing no template
 /// parameters.
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMP.td b/llvm/include/llvm/Frontend/OpenMP/OMP.td
index 1481328b..77d207f 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMP.td
+++ b/llvm/include/llvm/Frontend/OpenMP/OMP.td
@@ -710,16 +710,14 @@ def OMP_Requires : Directive<"requires"> {
 }
 def OMP_Nothing : Directive<"nothing"> {}
 def OMP_TargetData : Directive<"target data"> {
-  let allowedClauses = [
-    VersionedClause<OMPC_UseDevicePtr>,
-    VersionedClause<OMPC_UseDeviceAddr, 50>
-  ];
   let allowedOnceClauses = [
     VersionedClause<OMPC_Device>,
     VersionedClause<OMPC_If>
   ];
   let requiredClauses = [
-    VersionedClause<OMPC_Map>
+    VersionedClause<OMPC_Map>,
+    VersionedClause<OMPC_UseDevicePtr>,
+    VersionedClause<OMPC_UseDeviceAddr, 50>
   ];
 }
 def OMP_TargetEnterData : Directive<"target enter data"> {
diff --git a/llvm/include/llvm/IR/Attributes.td b/llvm/include/llvm/IR/Attributes.td
index 864f87f..d22eb76 100644
--- a/llvm/include/llvm/IR/Attributes.td
+++ b/llvm/include/llvm/IR/Attributes.td
@@ -339,14 +339,26 @@ def UseSampleProfile : StrBoolAttr<"use-sample-profile">;
 def DenormalFPMath : ComplexStrAttr<"denormal-fp-math", [FnAttr]>;
 def DenormalFPMathF32 : ComplexStrAttr<"denormal-fp-math-f32", [FnAttr]>;
 
+// Attribute compatiblity rules are generated to check the attribute of the
+// caller and callee and decide whether inlining should be allowed. CompatRule
+// and child classes are used for the rule generation. CompatRule takes only a
+// compare function which could be templated with the attribute type.
+// CompatRuleStrAttr takes the compare function and the string attribute for
+// checking compatibility for inline substitution.
 class CompatRule<string F> {
-  // The name of the function called to check the attribute of the caller and
-  // callee and decide whether inlining should be allowed. The function's
-  // signature must match "bool(const Function&, const Function &)", where the
-  // first parameter is the reference to the caller and the second parameter is
-  // the reference to the callee. It must return false if the attributes of the
-  // caller and callee are incompatible, and true otherwise.
+  // The function's signature must match "bool(const Function&, const
+  // Function&)", where the first parameter is the reference to the caller and
+  // the second parameter is the reference to the callee. It must return false
+  // if the attributes of the caller and callee are incompatible, and true
+  // otherwise.
   string CompatFunc = F;
+  string AttrName = "";
+}
+
+class CompatRuleStrAttr<string F, string Attr> : CompatRule<F> {
+  // The checker function is extended with an third argument as the function
+  // attribute string "bool(const Function&, const Function&, const StringRef&)".
+  string AttrName = Attr;
 }
 
 def : CompatRule<"isEqual<SanitizeAddressAttr>">;
@@ -359,7 +371,9 @@ def : CompatRule<"isEqual<ShadowCallStackAttr>">;
 def : CompatRule<"isEqual<UseSampleProfileAttr>">;
 def : CompatRule<"isEqual<NoProfileAttr>">;
 def : CompatRule<"checkDenormMode">;
-
+def : CompatRuleStrAttr<"isEqual", "sign-return-address">;
+def : CompatRuleStrAttr<"isEqual", "sign-return-address-key">;
+def : CompatRuleStrAttr<"isEqual", "branch-protection-pauth-lr">;
 
 class MergeRule<string F> {
   // The name of the function called to merge the attributes of the caller and
diff --git a/llvm/include/llvm/IR/Constants.h b/llvm/include/llvm/IR/Constants.h
index b5dcc7f..c0ac9a4 100644
--- a/llvm/include/llvm/IR/Constants.h
+++ b/llvm/include/llvm/IR/Constants.h
@@ -78,13 +78,20 @@ public:
 /// Class for constant integers.
 class ConstantInt final : public ConstantData {
   friend class Constant;
+  friend class ConstantVector;
 
   APInt Val;
 
-  ConstantInt(IntegerType *Ty, const APInt &V);
+  ConstantInt(Type *Ty, const APInt &V);
 
   void destroyConstantImpl();
 
+  /// Return a ConstantInt with the specified value and an implied Type. The
+  /// type is the vector type whose integer element type corresponds to the bit
+  /// width of the value.
+  static ConstantInt *get(LLVMContext &Context, ElementCount EC,
+                          const APInt &V);
+
 public:
   ConstantInt(const ConstantInt &) = delete;
 
@@ -136,7 +143,7 @@ public:
   /// Return the constant's value.
   inline const APInt &getValue() const { return Val; }
 
-  /// getBitWidth - Return the bitwidth of this constant.
+  /// getBitWidth - Return the scalar bitwidth of this constant.
   unsigned getBitWidth() const { return Val.getBitWidth(); }
 
   /// Return the constant as a 64-bit unsigned integer value after it
@@ -259,6 +266,7 @@ public:
 ///
 class ConstantFP final : public ConstantData {
   friend class Constant;
+  friend class ConstantVector;
 
   APFloat Val;
 
@@ -266,6 +274,12 @@ class ConstantFP final : public ConstantData {
 
   void destroyConstantImpl();
 
+  /// Return a ConstantFP with the specified value and an implied Type. The
+  /// type is the vector type whose element type has the same floating point
+  /// semantics as the value.
+  static ConstantFP *get(LLVMContext &Context, ElementCount EC,
+                         const APFloat &V);
+
 public:
   ConstantFP(const ConstantFP &) = delete;
 
diff --git a/llvm/include/llvm/IR/DebugProgramInstruction.h b/llvm/include/llvm/IR/DebugProgramInstruction.h
index 1fa6b6f..9708909 100644
--- a/llvm/include/llvm/IR/DebugProgramInstruction.h
+++ b/llvm/include/llvm/IR/DebugProgramInstruction.h
@@ -62,6 +62,8 @@ class BasicBlock;
 class MDNode;
 class Module;
 class DbgVariableIntrinsic;
+class DbgInfoIntrinsic;
+class DbgLabelInst;
 class DIAssignID;
 class DPMarker;
 class DPValue;
@@ -79,14 +81,14 @@ class raw_ostream;
 ///   deleteRecord
 ///   clone
 ///   isIdenticalToWhenDefined
-///   isEquivalentTo
 ///   both print methods
+///   createDebugIntrinsic
 class DbgRecord : public ilist_node<DbgRecord> {
 public:
   /// Marker that this DbgRecord is linked into.
   DPMarker *Marker = nullptr;
   /// Subclass discriminator.
-  enum Kind : uint8_t { ValueKind };
+  enum Kind : uint8_t { ValueKind, LabelKind };
 
 protected:
   DebugLoc DbgLoc;
@@ -104,9 +106,16 @@ public:
   void print(raw_ostream &O, bool IsForDebug = false) const;
   void print(raw_ostream &O, ModuleSlotTracker &MST, bool IsForDebug) const;
   bool isIdenticalToWhenDefined(const DbgRecord &R) const;
-  bool isEquivalentTo(const DbgRecord &R) const;
+  /// Convert this DbgRecord back into an appropriate llvm.dbg.* intrinsic.
+  /// \p InsertBefore Optional position to insert this intrinsic.
+  /// \returns A new llvm.dbg.* intrinsic representiung this DbgRecord.
+  DbgInfoIntrinsic *createDebugIntrinsic(Module *M,
+                                         Instruction *InsertBefore) const;
   ///@}
 
+  /// Same as isIdenticalToWhenDefined but checks DebugLoc too.
+  bool isEquivalentTo(const DbgRecord &R) const;
+
   Kind getRecordKind() const { return RecordKind; }
 
   void setMarker(DPMarker *M) { Marker = M; }
@@ -156,6 +165,38 @@ protected:
   ~DbgRecord() = default;
 };
 
+inline raw_ostream &operator<<(raw_ostream &OS, const DbgRecord &R) {
+  R.print(OS);
+  return OS;
+}
+
+/// Records a position in IR for a source label (DILabel). Corresponds to the
+/// llvm.dbg.label intrinsic.
+/// FIXME: Rename DbgLabelRecord when DPValue is renamed to DbgVariableRecord.
+class DPLabel : public DbgRecord {
+  DILabel *Label;
+
+public:
+  DPLabel(DILabel *Label, DebugLoc DL)
+      : DbgRecord(LabelKind, DL), Label(Label) {
+    assert(Label && "Unexpected nullptr");
+  }
+
+  DPLabel *clone() const;
+  void print(raw_ostream &O, bool IsForDebug = false) const;
+  void print(raw_ostream &ROS, ModuleSlotTracker &MST, bool IsForDebug) const;
+  DbgLabelInst *createDebugIntrinsic(Module *M,
+                                     Instruction *InsertBefore) const;
+
+  void setLabel(DILabel *NewLabel) { Label = NewLabel; }
+  DILabel *getLabel() const { return Label; }
+
+  /// Support type inquiry through isa, cast, and dyn_cast.
+  static bool classof(const DbgRecord *E) {
+    return E->getRecordKind() == LabelKind;
+  }
+};
+
 /// Record of a variable value-assignment, aka a non instruction representation
 /// of the dbg.value intrinsic.
 ///
@@ -510,11 +551,6 @@ inline raw_ostream &operator<<(raw_ostream &OS, const DPMarker &Marker) {
   return OS;
 }
 
-inline raw_ostream &operator<<(raw_ostream &OS, const DPValue &Value) {
-  Value.print(OS);
-  return OS;
-}
-
 /// Inline helper to return a range of DPValues attached to a marker. It needs
 /// to be inlined as it's frequently called, but also come after the declaration
 /// of DPMarker. Thus: it's pre-declared by users like Instruction, then an
diff --git a/llvm/include/llvm/IR/IntrinsicInst.h b/llvm/include/llvm/IR/IntrinsicInst.h
index b8d578d..fbaaef8 100644
--- a/llvm/include/llvm/IR/IntrinsicInst.h
+++ b/llvm/include/llvm/IR/IntrinsicInst.h
@@ -531,6 +531,9 @@ public:
 class DbgLabelInst : public DbgInfoIntrinsic {
 public:
   DILabel *getLabel() const { return cast<DILabel>(getRawLabel()); }
+  void setLabel(DILabel *NewLabel) {
+    setArgOperand(0, MetadataAsValue::get(getContext(), NewLabel));
+  }
 
   Metadata *getRawLabel() const {
     return cast<MetadataAsValue>(getArgOperand(0))->getMetadata();
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 8c0d4d5..d7c1ce1 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1933,6 +1933,26 @@ let IntrProperties = [IntrNoMem, IntrNoSync, IntrWillReturn] in {
                                LLVMMatchType<0>,
                                LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
                                llvm_i32_ty]>;
+  def int_vp_sadd_sat : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ],
+                             [ LLVMMatchType<0>,
+                               LLVMMatchType<0>,
+                               LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+                               llvm_i32_ty]>;
+  def int_vp_uadd_sat : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ],
+                             [ LLVMMatchType<0>,
+                               LLVMMatchType<0>,
+                               LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+                               llvm_i32_ty]>;
+  def int_vp_ssub_sat : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ],
+                             [ LLVMMatchType<0>,
+                               LLVMMatchType<0>,
+                               LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+                               llvm_i32_ty]>;
+  def int_vp_usub_sat : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ],
+                             [ LLVMMatchType<0>,
+                               LLVMMatchType<0>,
+                               LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+                               llvm_i32_ty]>;
 
   // Floating-point arithmetic
   def int_vp_fadd : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ],
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 921e5b95..6b045e4 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -1367,6 +1367,27 @@ let TargetPrefix = "aarch64" in {  // All intrinsics start with "llvm.aarch64.".
                  llvm_i32_ty,
                  llvm_i32_ty],
                 [IntrNoMem, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]>;
+  
+  class SVE2_1VectorArg_Pred_Intrinsic
+    : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
+                            [llvm_anyvector_ty],
+                            [IntrNoMem]>;
+
+  class SVE2_1VectorArgIndexed_Pred_Intrinsic
+    : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
+                            [llvm_anyvector_ty, llvm_i32_ty],
+                            [IntrNoMem, ImmArg<ArgIndex<1>>]>;
+  
+  class SVE2_Pred_1VectorArgIndexed_Intrinsic
+    : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+                            [LLVMMatchType<0>,
+                             LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_i32_ty],
+                            [IntrNoMem, ImmArg<ArgIndex<2>>]>;
+
+  class SVE2_Pred_1VectorArg_Intrinsic
+    : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+                            [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
+                            [IntrNoMem]>;
 
   // NOTE: There is no relationship between these intrinsics beyond an attempt
   // to reuse currently identical class definitions.
@@ -3610,23 +3631,10 @@ def int_aarch64_sve_extq : AdvSIMD_2VectorArgIndexed_Intrinsic;
 //
 // SVE2.1 - Move predicate to/from vector
 //
-def int_aarch64_sve_pmov_to_pred_lane :
-    DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
-                          [llvm_anyvector_ty, llvm_i32_ty],
-                          [IntrNoMem, ImmArg<ArgIndex<1>>]>;
-
-def int_aarch64_sve_pmov_to_pred_lane_zero :
-    DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
-                          [llvm_anyvector_ty],
-                          [IntrNoMem]>;
-
-def int_aarch64_sve_pmov_to_vector_lane_merging :
-    DefaultAttrsIntrinsic<[llvm_anyvector_ty],
-                          [LLVMMatchType<0>,
-                          LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_i32_ty],
-                          [IntrNoMem, ImmArg<ArgIndex<2>>]>;
+def int_aarch64_sve_pmov_to_pred_lane : SVE2_1VectorArgIndexed_Pred_Intrinsic; 
+    
+def int_aarch64_sve_pmov_to_pred_lane_zero : SVE2_1VectorArg_Pred_Intrinsic;
 
-def int_aarch64_sve_pmov_to_vector_lane_zeroing :
-    DefaultAttrsIntrinsic<[llvm_anyvector_ty],
-                          [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
-                          [IntrNoMem]>;
+def int_aarch64_sve_pmov_to_vector_lane_merging : SVE2_Pred_1VectorArgIndexed_Intrinsic;
+   
+def int_aarch64_sve_pmov_to_vector_lane_zeroing : SVE2_Pred_1VectorArg_Intrinsic;
+\ No newline at end of file
diff --git a/llvm/include/llvm/IR/VPIntrinsics.def b/llvm/include/llvm/IR/VPIntrinsics.def
index 3b32b60..4089acf 100644
--- a/llvm/include/llvm/IR/VPIntrinsics.def
+++ b/llvm/include/llvm/IR/VPIntrinsics.def
@@ -293,6 +293,30 @@ BEGIN_REGISTER_VP(vp_fshr, 3, 4, VP_FSHR, -1)
 VP_PROPERTY_FUNCTIONAL_INTRINSIC(fshr)
 VP_PROPERTY_FUNCTIONAL_SDOPC(FSHR)
 END_REGISTER_VP(vp_fshr, VP_FSHR)
+
+// llvm.vp.sadd.sat(x,y,mask,vlen)
+BEGIN_REGISTER_VP(vp_sadd_sat, 2, 3, VP_SADDSAT, -1)
+VP_PROPERTY_FUNCTIONAL_INTRINSIC(sadd_sat)
+VP_PROPERTY_FUNCTIONAL_SDOPC(SADDSAT)
+END_REGISTER_VP(vp_sadd_sat, VP_SADDSAT)
+
+// llvm.vp.uadd.sat(x,y,mask,vlen)
+BEGIN_REGISTER_VP(vp_uadd_sat, 2, 3, VP_UADDSAT, -1)
+VP_PROPERTY_FUNCTIONAL_INTRINSIC(uadd_sat)
+VP_PROPERTY_FUNCTIONAL_SDOPC(UADDSAT)
+END_REGISTER_VP(vp_uadd_sat, VP_UADDSAT)
+
+// llvm.vp.ssub.sat(x,y,mask,vlen)
+BEGIN_REGISTER_VP(vp_ssub_sat, 2, 3, VP_SSUBSAT, -1)
+VP_PROPERTY_FUNCTIONAL_INTRINSIC(ssub_sat)
+VP_PROPERTY_FUNCTIONAL_SDOPC(SSUBSAT)
+END_REGISTER_VP(vp_ssub_sat, VP_SSUBSAT)
+
+// llvm.vp.usub.sat(x,y,mask,vlen)
+BEGIN_REGISTER_VP(vp_usub_sat, 2, 3, VP_USUBSAT, -1)
+VP_PROPERTY_FUNCTIONAL_INTRINSIC(usub_sat)
+VP_PROPERTY_FUNCTIONAL_SDOPC(USUBSAT)
+END_REGISTER_VP(vp_usub_sat, VP_USUBSAT)
 ///// } Integer Arithmetic
 
 ///// Floating-Point Arithmetic {
diff --git a/llvm/include/llvm/LTO/Config.h b/llvm/include/llvm/LTO/Config.h
index 6fb55f1..482b6e5 100644
--- a/llvm/include/llvm/LTO/Config.h
+++ b/llvm/include/llvm/LTO/Config.h
@@ -60,9 +60,6 @@ struct Config {
   bool VerifyEach = false;
   bool DisableVerify = false;
 
-  /// Use the standard optimization pipeline.
-  bool UseDefaultPipeline = false;
-
   /// Flag to indicate that the optimizer should not assume builtins are present
   /// on the target.
   bool Freestanding = false;
diff --git a/llvm/include/llvm/Passes/CodeGenPassBuilder.h b/llvm/include/llvm/Passes/CodeGenPassBuilder.h
index 80bbfb7..dc60727 100644
--- a/llvm/include/llvm/Passes/CodeGenPassBuilder.h
+++ b/llvm/include/llvm/Passes/CodeGenPassBuilder.h
@@ -37,6 +37,7 @@
 #include "llvm/CodeGen/InterleavedLoadCombine.h"
 #include "llvm/CodeGen/JMCInstrumenter.h"
 #include "llvm/CodeGen/LowerEmuTLS.h"
+#include "llvm/CodeGen/MIRPrinter.h"
 #include "llvm/CodeGen/MachinePassManager.h"
 #include "llvm/CodeGen/PreISelIntrinsicLowering.h"
 #include "llvm/CodeGen/ReplaceWithVeclib.h"
@@ -88,12 +89,8 @@ namespace llvm {
 #define DUMMY_MACHINE_MODULE_PASS(NAME, PASS_NAME)                             \
   struct PASS_NAME : public MachinePassInfoMixin<PASS_NAME> {                  \
     template <typename... Ts> PASS_NAME(Ts &&...) {}                           \
-    Error run(Module &, MachineFunctionAnalysisManager &) {                    \
-      return Error::success();                                                 \
-    }                                                                          \
-    PreservedAnalyses run(MachineFunction &,                                   \
-                          MachineFunctionAnalysisManager &) {                  \
-      llvm_unreachable("this api is to make new PM api happy");                \
+    PreservedAnalyses run(Module &, ModuleAnalysisManager &) {                 \
+      return PreservedAnalyses::all();                                         \
     }                                                                          \
   };
 #define DUMMY_MACHINE_FUNCTION_PASS(NAME, PASS_NAME)                           \
@@ -132,8 +129,8 @@ public:
       Opt.OptimizeRegAlloc = getOptLevel() != CodeGenOptLevel::None;
   }
 
-  Error buildPipeline(ModulePassManager &MPM, MachineFunctionPassManager &MFPM,
-                      raw_pwrite_stream &Out, raw_pwrite_stream *DwoOut,
+  Error buildPipeline(ModulePassManager &MPM, raw_pwrite_stream &Out,
+                      raw_pwrite_stream *DwoOut,
                       CodeGenFileType FileType) const;
 
   PassInstrumentationCallbacks *getPassInstrumentationCallbacks() const {
@@ -149,7 +146,15 @@ protected:
   using is_function_pass_t = decltype(std::declval<PassT &>().run(
       std::declval<Function &>(), std::declval<FunctionAnalysisManager &>()));
 
+  template <typename PassT>
+  using is_machine_function_pass_t = decltype(std::declval<PassT &>().run(
+      std::declval<MachineFunction &>(),
+      std::declval<MachineFunctionAnalysisManager &>()));
+
   // Function object to maintain state while adding codegen IR passes.
+  // TODO: add a Function -> MachineFunction adaptor and merge
+  // AddIRPass/AddMachinePass so we can have a function pipeline that runs both
+  // function passes and machine function passes.
   class AddIRPass {
   public:
     AddIRPass(ModulePassManager &MPM, const DerivedT &PB) : MPM(MPM), PB(PB) {}
@@ -196,31 +201,47 @@ protected:
   // Function object to maintain state while adding codegen machine passes.
   class AddMachinePass {
   public:
-    AddMachinePass(MachineFunctionPassManager &PM, const DerivedT &PB)
-        : PM(PM), PB(PB) {}
+    AddMachinePass(ModulePassManager &MPM, const DerivedT &PB)
+        : MPM(MPM), PB(PB) {}
+    ~AddMachinePass() {
+      if (!MFPM.isEmpty())
+        MPM.addPass(createModuleToMachineFunctionPassAdaptor(std::move(MFPM)));
+    }
+
+    template <typename PassT>
+    void operator()(PassT &&Pass, bool Force = false,
+                    StringRef Name = PassT::name()) {
+      static_assert((is_detected<is_machine_function_pass_t, PassT>::value ||
+                     is_detected<is_module_pass_t, PassT>::value) &&
+                    "Only module pass and function pass are supported.");
 
-    template <typename PassT> void operator()(PassT &&Pass) {
-      if (!PB.runBeforeAdding(PassT::name()))
+      if (!Force && !PB.runBeforeAdding(Name))
         return;
 
-      PM.addPass(std::forward<PassT>(Pass));
+      // Add Function Pass
+      if constexpr (is_detected<is_machine_function_pass_t, PassT>::value) {
+        MFPM.addPass(std::forward<PassT>(Pass));
 
-      for (auto &C : PB.AfterCallbacks)
-        C(PassT::name());
-    }
+        for (auto &C : PB.AfterCallbacks)
+          C(Name);
+      } else {
+        // Add Module Pass
+        if (!MFPM.isEmpty()) {
+          MPM.addPass(
+              createModuleToMachineFunctionPassAdaptor(std::move(MFPM)));
+          MFPM = MachineFunctionPassManager();
+        }
 
-    template <typename PassT> void insertPass(StringRef PassName, PassT Pass) {
-      PB.AfterCallbacks.emplace_back(
-          [this, PassName, Pass = std::move(Pass)](StringRef Name) {
-            if (PassName == Name)
-              this->PM.addPass(std::move(Pass));
-          });
-    }
+        MPM.addPass(std::forward<PassT>(Pass));
 
-    MachineFunctionPassManager releasePM() { return std::move(PM); }
+        for (auto &C : PB.AfterCallbacks)
+          C(Name);
+      }
+    }
 
   private:
-    MachineFunctionPassManager &PM;
+    ModulePassManager &MPM;
+    MachineFunctionPassManager MFPM;
     const DerivedT &PB;
   };
 
@@ -467,30 +488,43 @@ private:
 
 template <typename Derived>
 Error CodeGenPassBuilder<Derived>::buildPipeline(
-    ModulePassManager &MPM, MachineFunctionPassManager &MFPM,
-    raw_pwrite_stream &Out, raw_pwrite_stream *DwoOut,
+    ModulePassManager &MPM, raw_pwrite_stream &Out, raw_pwrite_stream *DwoOut,
     CodeGenFileType FileType) const {
   auto StartStopInfo = TargetPassConfig::getStartStopInfo(*PIC);
   if (!StartStopInfo)
     return StartStopInfo.takeError();
   setStartStopPasses(*StartStopInfo);
-  AddIRPass addIRPass(MPM, derived());
-  // `ProfileSummaryInfo` is always valid.
-  addIRPass(RequireAnalysisPass<ProfileSummaryAnalysis, Module>());
-  addIRPass(RequireAnalysisPass<CollectorMetadataAnalysis, Module>());
-  addISelPasses(addIRPass);
 
-  AddMachinePass addPass(MFPM, derived());
+  bool PrintAsm = TargetPassConfig::willCompleteCodeGenPipeline();
+  bool PrintMIR = !PrintAsm && FileType != CodeGenFileType::Null;
+
+  {
+    AddIRPass addIRPass(MPM, derived());
+    addIRPass(RequireAnalysisPass<ProfileSummaryAnalysis, Module>());
+    addIRPass(RequireAnalysisPass<CollectorMetadataAnalysis, Module>());
+    addISelPasses(addIRPass);
+  }
+
+  AddMachinePass addPass(MPM, derived());
+
+  if (PrintMIR)
+    addPass(PrintMIRPreparePass(Out), /*Force=*/true);
+
   if (auto Err = addCoreISelPasses(addPass))
     return std::move(Err);
 
   if (auto Err = derived().addMachinePasses(addPass))
     return std::move(Err);
 
-  derived().addAsmPrinter(
-      addPass, [this, &Out, DwoOut, FileType](MCContext &Ctx) {
-        return this->TM.createMCStreamer(Out, DwoOut, FileType, Ctx);
-      });
+  if (PrintAsm) {
+    derived().addAsmPrinter(
+        addPass, [this, &Out, DwoOut, FileType](MCContext &Ctx) {
+          return this->TM.createMCStreamer(Out, DwoOut, FileType, Ctx);
+        });
+  }
+
+  if (PrintMIR)
+    addPass(PrintMIRPass(Out), /*Force=*/true);
 
   addPass(FreeMachineFunctionPass());
   return verifyStartStop(*StartStopInfo);
diff --git a/llvm/include/llvm/Passes/PassBuilder.h b/llvm/include/llvm/Passes/PassBuilder.h
index 10c5b7c..6822cfd 100644
--- a/llvm/include/llvm/Passes/PassBuilder.h
+++ b/llvm/include/llvm/Passes/PassBuilder.h
@@ -133,7 +133,8 @@ public:
   void crossRegisterProxies(LoopAnalysisManager &LAM,
                             FunctionAnalysisManager &FAM,
                             CGSCCAnalysisManager &CGAM,
-                            ModuleAnalysisManager &MAM);
+                            ModuleAnalysisManager &MAM,
+                            MachineFunctionAnalysisManager *MFAM = nullptr);
 
   /// Registers all available module analysis passes.
   ///
@@ -569,9 +570,9 @@ public:
     ModulePipelineParsingCallbacks.push_back(C);
   }
   void registerPipelineParsingCallback(
-      const std::function<bool(StringRef Name, MachineFunctionPassManager &)>
-          &C) {
-    MachinePipelineParsingCallbacks.push_back(C);
+      const std::function<bool(StringRef Name, MachineFunctionPassManager &,
+                               ArrayRef<PipelineElement>)> &C) {
+    MachineFunctionPipelineParsingCallbacks.push_back(C);
   }
   /// @}}
 
@@ -733,8 +734,10 @@ private:
   // Machine pass callbackcs
   SmallVector<std::function<void(MachineFunctionAnalysisManager &)>, 2>
       MachineFunctionAnalysisRegistrationCallbacks;
-  SmallVector<std::function<bool(StringRef, MachineFunctionPassManager &)>, 2>
-      MachinePipelineParsingCallbacks;
+  SmallVector<std::function<bool(StringRef, MachineFunctionPassManager &,
+                                 ArrayRef<PipelineElement>)>,
+              2>
+      MachineFunctionPipelineParsingCallbacks;
 };
 
 /// This utility template takes care of adding require<> and invalidate<>
diff --git a/llvm/include/llvm/Target/TargetMachine.h b/llvm/include/llvm/Target/TargetMachine.h
index 7462f61..d7ce088 100644
--- a/llvm/include/llvm/Target/TargetMachine.h
+++ b/llvm/include/llvm/Target/TargetMachine.h
@@ -34,8 +34,6 @@ using ModulePassManager = PassManager<Module>;
 
 class Function;
 class GlobalValue;
-class MachineFunctionPassManager;
-class MachineFunctionAnalysisManager;
 class MachineModuleInfoWrapperPass;
 class Mangler;
 class MCAsmInfo;
@@ -455,11 +453,9 @@ public:
                       bool DisableVerify = true,
                       MachineModuleInfoWrapperPass *MMIWP = nullptr) override;
 
-  virtual Error buildCodeGenPipeline(ModulePassManager &,
-                                     MachineFunctionPassManager &,
-                                     MachineFunctionAnalysisManager &,
-                                     raw_pwrite_stream &, raw_pwrite_stream *,
-                                     CodeGenFileType, CGPassBuilderOption,
+  virtual Error buildCodeGenPipeline(ModulePassManager &, raw_pwrite_stream &,
+                                     raw_pwrite_stream *, CodeGenFileType,
+                                     CGPassBuilderOption,
                                      PassInstrumentationCallbacks *) {
     return make_error<StringError>("buildCodeGenPipeline is not overridden",
                                    inconvertibleErrorCode());
diff --git a/llvm/include/llvm/Target/TargetSchedule.td b/llvm/include/llvm/Target/TargetSchedule.td
index e2781a5..48c9387 100644
--- a/llvm/include/llvm/Target/TargetSchedule.td
+++ b/llvm/include/llvm/Target/TargetSchedule.td
@@ -399,6 +399,8 @@ def NoSchedPred : MCSchedPredicate<TruePred>;
 class SchedVar<SchedPredicateBase pred, list<SchedReadWrite> selected> {
   SchedPredicateBase Predicate = pred;
   list<SchedReadWrite> Selected = selected;
+  // SchedModel silences warnings but is ignored.
+  SchedMachineModel SchedModel = ?;
 }
 
 // SchedModel silences warnings but is ignored.
diff --git a/llvm/include/llvm/TargetParser/PPCTargetParser.def b/llvm/include/llvm/TargetParser/PPCTargetParser.def
index f2c44b4..88c7304 100644
--- a/llvm/include/llvm/TargetParser/PPCTargetParser.def
+++ b/llvm/include/llvm/TargetParser/PPCTargetParser.def
@@ -126,4 +126,61 @@ PPC_LNX_CPU("power10",47)
 #undef PPC_LNX_DEFINE_OFFSETS
 #undef PPC_LNX_FEATURE
 #undef PPC_LNX_CPU
+
+// Definition of the following values are found in the AIX header
+// file: </usr/include/sys/systemcfg.h>.
+#ifndef AIX_POWERPC_USE_SYS_CONF
+  #define AIX_POWERPC_USE_SYS_CONF
+  #define AIX_SYSCON_IMPL_IDX 1
+  #define AIX_PPC7_VALUE 0x00008000
+  #define AIX_PPC8_VALUE 0x00010000
+  #define AIX_PPC9_VALUE 0x00020000
+  #define AIX_PPC10_VALUE 0x00040000
+
+  // Supported SUPPORT_METHOD values.
+  #define AIX_BUILTIN_PPC_TRUE 1
+  #define AIX_BUILTIN_PPC_FALSE 0
+  #define USE_SYS_CONF 2
+
+  // Supported COMPARE_OP values.
+  #define COMP_EQ  0
+
+#endif
+
+// The value of SUPPORT_METHOD can be AIX_BUILTIN_PPC_TRUE,
+// AIX_BUILTIN_PPC_FALSE, or USE_SYS_CONF.
+// When the value of SUPPORT_METHOD is USE_SYS_CONF, the return value
+// depends on the result of comparing the data member of
+// _system_configuration specified by INDEX with a certain value.
+
+#ifndef PPC_AIX_CPU
+  #define PPC_AIX_CPU(NAME, SUPPORT_METHOD, INDEX, COMPARE_OP, VALUE)
+#endif
+
+// __builtin_cpu_is() is supported only on Power7 and up.
+PPC_AIX_CPU("power4",AIX_BUILTIN_PPC_FALSE,0,0,0)
+PPC_AIX_CPU("ppc970",AIX_BUILTIN_PPC_FALSE,0,0,0)
+PPC_AIX_CPU("power5",AIX_BUILTIN_PPC_FALSE,0,0,0)
+PPC_AIX_CPU("power5+",AIX_BUILTIN_PPC_FALSE,0,0,0)
+PPC_AIX_CPU("power6",AIX_BUILTIN_PPC_FALSE,0,0,0)
+PPC_AIX_CPU("ppc-cell-be",AIX_BUILTIN_PPC_FALSE,0,0,0)
+PPC_AIX_CPU("power6x",AIX_BUILTIN_PPC_FALSE,0,0,0)
+PPC_AIX_CPU("ppca2",AIX_BUILTIN_PPC_FALSE,0,0,0)
+PPC_AIX_CPU("ppc405",AIX_BUILTIN_PPC_FALSE,0,0,0)
+PPC_AIX_CPU("ppc440",AIX_BUILTIN_PPC_FALSE,0,0,0)
+PPC_AIX_CPU("ppc464",AIX_BUILTIN_PPC_FALSE,0,0,0)
+PPC_AIX_CPU("ppc476",AIX_BUILTIN_PPC_FALSE,0,0,0)
+PPC_AIX_CPU("power7",USE_SYS_CONF,AIX_SYSCON_IMPL_IDX,COMP_EQ,AIX_PPC7_VALUE)
+PPC_AIX_CPU("power8",USE_SYS_CONF,AIX_SYSCON_IMPL_IDX,COMP_EQ,AIX_PPC8_VALUE)
+PPC_AIX_CPU("power9",USE_SYS_CONF,AIX_SYSCON_IMPL_IDX,COMP_EQ,AIX_PPC9_VALUE)
+PPC_AIX_CPU("power10",USE_SYS_CONF,AIX_SYSCON_IMPL_IDX,COMP_EQ,AIX_PPC10_VALUE)
+#undef PPC_AIX_CPU
+
+// PPC_SYSTEMCONFIG_TYPE defines the IR data structure of kernel variable
+// `_system_configuration`, that is found in the AIX OS header file: </usr/include/sys/systemcfg.h>.
+#ifndef PPC_SYSTEMCONFIG_TYPE
+#define PPC_SYSTEMCONFIG_TYPE \
+Int32Ty, Int32Ty, Int32Ty
+#endif
+
 #endif // !PPC_TGT_PARSER_UNDEF_MACROS
diff --git a/llvm/include/llvm/TargetParser/X86TargetParser.def b/llvm/include/llvm/TargetParser/X86TargetParser.def
index 4c630c1..a9ed56f 100644
--- a/llvm/include/llvm/TargetParser/X86TargetParser.def
+++ b/llvm/include/llvm/TargetParser/X86TargetParser.def
@@ -265,6 +265,7 @@ X86_MICROARCH_LEVEL(X86_64_BASELINE,"x86-64",               95)
 X86_MICROARCH_LEVEL(X86_64_V2,      "x86-64-v2",            96)
 X86_MICROARCH_LEVEL(X86_64_V3,      "x86-64-v3",            97)
 X86_MICROARCH_LEVEL(X86_64_V4,      "x86-64-v4",            98)
+X86_MICROARCH_LEVEL(APXF,           "apxf",                111)
 #undef X86_FEATURE_COMPAT
 #undef X86_FEATURE
 #undef X86_MICROARCH_LEVEL
diff --git a/llvm/include/llvm/Transforms/IPO/HotColdSplitting.h b/llvm/include/llvm/Transforms/IPO/HotColdSplitting.h
index c87c645..13dda6d 100644
--- a/llvm/include/llvm/Transforms/IPO/HotColdSplitting.h
+++ b/llvm/include/llvm/Transforms/IPO/HotColdSplitting.h
@@ -24,6 +24,7 @@ class TargetTransformInfo;
 class OptimizationRemarkEmitter;
 class AssumptionCache;
 class DominatorTree;
+class CodeExtractor;
 class CodeExtractorAnalysisCache;
 
 /// A sequence of basic blocks.
@@ -43,19 +44,17 @@ public:
 
 private:
   bool isFunctionCold(const Function &F) const;
-  bool isBasicBlockCold(BasicBlock* BB,
-                        BranchProbability ColdProbThresh,
-                        SmallPtrSetImpl<BasicBlock *> &ColdBlocks,
+  bool isBasicBlockCold(BasicBlock *BB, BranchProbability ColdProbThresh,
                         SmallPtrSetImpl<BasicBlock *> &AnnotatedColdBlocks,
                         BlockFrequencyInfo *BFI) const;
   bool shouldOutlineFrom(const Function &F) const;
   bool outlineColdRegions(Function &F, bool HasProfileSummary);
-  Function *extractColdRegion(const BlockSequence &Region,
+  bool isSplittingBeneficial(CodeExtractor &CE, const BlockSequence &Region,
+                             TargetTransformInfo &TTI);
+  Function *extractColdRegion(BasicBlock &EntryPoint, CodeExtractor &CE,
                               const CodeExtractorAnalysisCache &CEAC,
-                              DominatorTree &DT, BlockFrequencyInfo *BFI,
-                              TargetTransformInfo &TTI,
-                              OptimizationRemarkEmitter &ORE,
-                              AssumptionCache *AC, unsigned Count);
+                              BlockFrequencyInfo *BFI, TargetTransformInfo &TTI,
+                              OptimizationRemarkEmitter &ORE);
   ProfileSummaryInfo *PSI;
   function_ref<BlockFrequencyInfo *(Function &)> GetBFI;
   function_ref<TargetTransformInfo &(Function &)> GetTTI;
diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp
index 6bf0d2f..5916d2a 100644
--- a/llvm/lib/Analysis/Loads.cpp
+++ b/llvm/lib/Analysis/Loads.cpp
@@ -364,7 +364,7 @@ bool llvm::isSafeToLoadUnconditionally(Value *V, Align Alignment, APInt &Size,
 
   if (Size.getBitWidth() > 64)
     return false;
-  const uint64_t LoadSize = Size.getZExtValue();
+  const TypeSize LoadSize = TypeSize::getFixed(Size.getZExtValue());
 
   // Otherwise, be a little bit aggressive by scanning the local block where we
   // want to check to see if the pointer is already being loaded or stored
@@ -414,11 +414,11 @@ bool llvm::isSafeToLoadUnconditionally(Value *V, Align Alignment, APInt &Size,
 
     // Handle trivial cases.
     if (AccessedPtr == V &&
-        LoadSize <= DL.getTypeStoreSize(AccessedTy))
+        TypeSize::isKnownLE(LoadSize, DL.getTypeStoreSize(AccessedTy)))
       return true;
 
     if (AreEquivalentAddressValues(AccessedPtr->stripPointerCasts(), V) &&
-        LoadSize <= DL.getTypeStoreSize(AccessedTy))
+        TypeSize::isKnownLE(LoadSize, DL.getTypeStoreSize(AccessedTy)))
       return true;
   }
   return false;
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 04f3172..653b3d4 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -7194,6 +7194,21 @@ bool llvm::propagatesPoison(const Use &PoisonOp) {
         // corresponding lanes are poison.
         return true;
       case Intrinsic::ctpop:
+      case Intrinsic::ctlz:
+      case Intrinsic::cttz:
+      case Intrinsic::abs:
+      case Intrinsic::smax:
+      case Intrinsic::smin:
+      case Intrinsic::umax:
+      case Intrinsic::umin:
+      case Intrinsic::bitreverse:
+      case Intrinsic::bswap:
+      case Intrinsic::sadd_sat:
+      case Intrinsic::ssub_sat:
+      case Intrinsic::sshl_sat:
+      case Intrinsic::uadd_sat:
+      case Intrinsic::usub_sat:
+      case Intrinsic::ushl_sat:
         return true;
       }
     }
diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
index 515a1d0..832907a 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -3060,48 +3060,49 @@ Error BitcodeReader::parseConstants() {
       V = Constant::getNullValue(CurTy);
       break;
     case bitc::CST_CODE_INTEGER:   // INTEGER: [intval]
-      if (!CurTy->isIntegerTy() || Record.empty())
+      if (!CurTy->isIntOrIntVectorTy() || Record.empty())
         return error("Invalid integer const record");
       V = ConstantInt::get(CurTy, decodeSignRotatedValue(Record[0]));
       break;
     case bitc::CST_CODE_WIDE_INTEGER: {// WIDE_INTEGER: [n x intval]
-      if (!CurTy->isIntegerTy() || Record.empty())
+      if (!CurTy->isIntOrIntVectorTy() || Record.empty())
         return error("Invalid wide integer const record");
 
-      APInt VInt =
-          readWideAPInt(Record, cast<IntegerType>(CurTy)->getBitWidth());
-      V = ConstantInt::get(Context, VInt);
-
+      auto *ScalarTy = cast<IntegerType>(CurTy->getScalarType());
+      APInt VInt = readWideAPInt(Record, ScalarTy->getBitWidth());
+      V = ConstantInt::get(CurTy, VInt);
       break;
     }
     case bitc::CST_CODE_FLOAT: {    // FLOAT: [fpval]
       if (Record.empty())
         return error("Invalid float const record");
-      if (CurTy->isHalfTy())
-        V = ConstantFP::get(Context, APFloat(APFloat::IEEEhalf(),
-                                             APInt(16, (uint16_t)Record[0])));
-      else if (CurTy->isBFloatTy())
-        V = ConstantFP::get(Context, APFloat(APFloat::BFloat(),
-                                             APInt(16, (uint32_t)Record[0])));
-      else if (CurTy->isFloatTy())
-        V = ConstantFP::get(Context, APFloat(APFloat::IEEEsingle(),
-                                             APInt(32, (uint32_t)Record[0])));
-      else if (CurTy->isDoubleTy())
-        V = ConstantFP::get(Context, APFloat(APFloat::IEEEdouble(),
-                                             APInt(64, Record[0])));
-      else if (CurTy->isX86_FP80Ty()) {
+
+      auto *ScalarTy = CurTy->getScalarType();
+      if (ScalarTy->isHalfTy())
+        V = ConstantFP::get(CurTy, APFloat(APFloat::IEEEhalf(),
+                                           APInt(16, (uint16_t)Record[0])));
+      else if (ScalarTy->isBFloatTy())
+        V = ConstantFP::get(
+            CurTy, APFloat(APFloat::BFloat(), APInt(16, (uint32_t)Record[0])));
+      else if (ScalarTy->isFloatTy())
+        V = ConstantFP::get(CurTy, APFloat(APFloat::IEEEsingle(),
+                                           APInt(32, (uint32_t)Record[0])));
+      else if (ScalarTy->isDoubleTy())
+        V = ConstantFP::get(
+            CurTy, APFloat(APFloat::IEEEdouble(), APInt(64, Record[0])));
+      else if (ScalarTy->isX86_FP80Ty()) {
         // Bits are not stored the same way as a normal i80 APInt, compensate.
         uint64_t Rearrange[2];
         Rearrange[0] = (Record[1] & 0xffffLL) | (Record[0] << 16);
         Rearrange[1] = Record[0] >> 48;
-        V = ConstantFP::get(Context, APFloat(APFloat::x87DoubleExtended(),
-                                             APInt(80, Rearrange)));
-      } else if (CurTy->isFP128Ty())
-        V = ConstantFP::get(Context, APFloat(APFloat::IEEEquad(),
-                                             APInt(128, Record)));
-      else if (CurTy->isPPC_FP128Ty())
-        V = ConstantFP::get(Context, APFloat(APFloat::PPCDoubleDouble(),
-                                             APInt(128, Record)));
+        V = ConstantFP::get(
+            CurTy, APFloat(APFloat::x87DoubleExtended(), APInt(80, Rearrange)));
+      } else if (ScalarTy->isFP128Ty())
+        V = ConstantFP::get(CurTy,
+                            APFloat(APFloat::IEEEquad(), APInt(128, Record)));
+      else if (ScalarTy->isPPC_FP128Ty())
+        V = ConstantFP::get(
+            CurTy, APFloat(APFloat::PPCDoubleDouble(), APInt(128, Record)));
       else
         V = UndefValue::get(CurTy);
       break;
diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
index 13be0b0..656f2a6 100644
--- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -2624,7 +2624,7 @@ void ModuleBitcodeWriter::writeConstants(unsigned FirstVal, unsigned LastVal,
       }
     } else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C)) {
       Code = bitc::CST_CODE_FLOAT;
-      Type *Ty = CFP->getType();
+      Type *Ty = CFP->getType()->getScalarType();
       if (Ty->isHalfTy() || Ty->isBFloatTy() || Ty->isFloatTy() ||
           Ty->isDoubleTy()) {
         Record.push_back(CFP->getValueAPF().bitcastToAPInt().getZExtValue());
diff --git a/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp b/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp
index 7b66a85..3b84624 100644
--- a/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp
+++ b/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp
@@ -829,11 +829,7 @@ class MemLocFragmentFill {
   void process(BasicBlock &BB, VarFragMap &LiveSet) {
     BBInsertBeforeMap[&BB].clear();
     for (auto &I : BB) {
-      for (DbgRecord &DR : I.getDbgValueRange()) {
-        // FIXME: DPValue::filter usage needs attention in this file; we need
-        // to make sure dbg.labels are handled correctly in RemoveDIs mode.
-        // Cast below to ensure this gets fixed when DPLabels are introduced.
-        DPValue &DPV = cast<DPValue>(DR);
+      for (DPValue &DPV : DPValue::filter(I.getDbgValueRange())) {
         if (const auto *Locs = FnVarLocs->getWedge(&DPV)) {
           for (const VarLocInfo &Loc : *Locs) {
             addDef(Loc, &DPV, *I.getParent(), LiveSet);
@@ -1919,6 +1915,9 @@ void AssignmentTrackingLowering::process(BasicBlock &BB, BlockInfo *LiveSet) {
     // attached DPValues, or a non-debug instruction with attached unprocessed
     // DPValues.
     if (II != EI && II->hasDbgValues()) {
+      // Skip over non-variable debug records (i.e., labels). They're going to
+      // be read from IR (possibly re-ordering them within the debug record
+      // range) rather than from the analysis results.
       for (DPValue &DPV : DPValue::filter(II->getDbgValueRange())) {
         resetInsertionPoint(DPV);
         processDPValue(DPV, LiveSet);
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 4036f18..feefe87 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -2686,8 +2686,9 @@ bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB,
             attributesPermitTailCall(F, CI, RetI, *TLI)) {
           // Either we return void or the return value must be the first
           // argument of a known intrinsic or library function.
-          if (!V || (isIntrinsicOrLFToBeTailCalled(TLInfo, CI) &&
-                     V == CI->getArgOperand(0))) {
+          if (!V || isa<UndefValue>(V) ||
+              (isIntrinsicOrLFToBeTailCalled(TLInfo, CI) &&
+               V == CI->getArgOperand(0))) {
             TailCallBBs.push_back(Pred);
           }
         }
diff --git a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
index 3bd1542..77dc265 100644
--- a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
@@ -187,7 +187,7 @@ bool CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, const CallBase &CB,
   if (!lowerCall(MIRBuilder, Info))
     return false;
 
-  if (ReturnHintAlignReg && !Info.IsTailCall) {
+  if (ReturnHintAlignReg && !Info.LoweredTailCall) {
     MIRBuilder.buildAssertAlign(ResRegs[0], ReturnHintAlignReg,
                                 ReturnHintAlign);
   }
diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index 7c95cef..38bb808 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -3275,7 +3275,17 @@ void IRTranslator::translateDbgDeclareRecord(Value *Address, bool HasArgList,
 
 void IRTranslator::translateDbgInfo(const Instruction &Inst,
                                       MachineIRBuilder &MIRBuilder) {
-  for (DPValue &DPV : DPValue::filter(Inst.getDbgValueRange())) {
+  for (DbgRecord &DR : Inst.getDbgValueRange()) {
+    if (DPLabel *DPL = dyn_cast<DPLabel>(&DR)) {
+      MIRBuilder.setDebugLoc(DPL->getDebugLoc());
+      assert(DPL->getLabel() && "Missing label");
+      assert(DPL->getLabel()->isValidLocationForIntrinsic(
+                 MIRBuilder.getDebugLoc()) &&
+             "Expected inlined-at fields to agree");
+      MIRBuilder.buildDbgLabel(DPL->getLabel());
+      continue;
+    }
+    DPValue &DPV = cast<DPValue>(DR);
     const DILocalVariable *Variable = DPV.getVariable();
     const DIExpression *Expression = DPV.getExpression();
     Value *V = DPV.getVariableLocationOp(0);
diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
index 26fd12f..23ad68b 100644
--- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
@@ -660,8 +660,11 @@ std::optional<APInt> llvm::ConstantFoldBinOp(unsigned Opcode,
   default:
     break;
   case TargetOpcode::G_ADD:
-  case TargetOpcode::G_PTR_ADD:
     return C1 + C2;
+  case TargetOpcode::G_PTR_ADD:
+    // Types can be of different width here.
+    // Result needs to be the same width as C1, so trunc or sext C2.
+    return C1 + C2.sextOrTrunc(C1.getBitWidth());
   case TargetOpcode::G_AND:
     return C1 & C2;
   case TargetOpcode::G_ASHR:
diff --git a/llvm/lib/CodeGen/MachineFunctionSplitter.cpp b/llvm/lib/CodeGen/MachineFunctionSplitter.cpp
index 38c1c56..0ddd945 100644
--- a/llvm/lib/CodeGen/MachineFunctionSplitter.cpp
+++ b/llvm/lib/CodeGen/MachineFunctionSplitter.cpp
@@ -109,12 +109,6 @@ static bool isColdBlock(const MachineBasicBlock &MBB,
                         const MachineBlockFrequencyInfo *MBFI,
                         ProfileSummaryInfo *PSI) {
   std::optional<uint64_t> Count = MBFI->getBlockProfileCount(&MBB);
-
-  // Temporary hack to cope with AArch64's jump table encoding
-  const TargetInstrInfo &TII = *MBB.getParent()->getSubtarget().getInstrInfo();
-  if (!TII.isMBBSafeToSplitToCold(MBB))
-    return false;
-
   // For instrumentation profiles and sample profiles, we use different ways
   // to judge whether a block is cold and should be split.
   if (PSI->hasInstrumentationProfile() || PSI->hasCSInstrumentationProfile()) {
@@ -178,7 +172,8 @@ bool MachineFunctionSplitter::runOnMachineFunction(MachineFunction &MF) {
 
     if (MBB.isEHPad())
       LandingPads.push_back(&MBB);
-    else if (UseProfileData && isColdBlock(MBB, MBFI, PSI) && !SplitAllEHCode)
+    else if (UseProfileData && isColdBlock(MBB, MBFI, PSI) &&
+             TII.isMBBSafeToSplitToCold(MBB) && !SplitAllEHCode)
       MBB.setSectionID(MBBSectionID::ColdSectionID);
   }
 
@@ -190,7 +185,7 @@ bool MachineFunctionSplitter::runOnMachineFunction(MachineFunction &MF) {
     // Here we have UseProfileData == true.
     bool HasHotLandingPads = false;
     for (const MachineBasicBlock *LP : LandingPads) {
-      if (!isColdBlock(*LP, MBFI, PSI))
+      if (!isColdBlock(*LP, MBFI, PSI) || !TII.isMBBSafeToSplitToCold(*LP))
         HasHotLandingPads = true;
     }
     if (!HasHotLandingPads) {
diff --git a/llvm/lib/CodeGen/MachinePassManager.cpp b/llvm/lib/CodeGen/MachinePassManager.cpp
index d42bbe2..9a750b5 100644
--- a/llvm/lib/CodeGen/MachinePassManager.cpp
+++ b/llvm/lib/CodeGen/MachinePassManager.cpp
@@ -11,7 +11,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/CodeGen/MachinePassManager.h"
-#include "llvm/CodeGen/FreeMachineFunction.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/IR/PassManagerImpl.h"
@@ -19,99 +18,121 @@
 using namespace llvm;
 
 namespace llvm {
-template class AllAnalysesOn<MachineFunction>;
+
+AnalysisKey FunctionAnalysisManagerMachineFunctionProxy::Key;
+
 template class AnalysisManager<MachineFunction>;
 template class PassManager<MachineFunction>;
+template class InnerAnalysisManagerProxy<MachineFunctionAnalysisManager,
+                                         Module>;
+template class OuterAnalysisManagerProxy<ModuleAnalysisManager,
+                                         MachineFunction>;
+
+bool FunctionAnalysisManagerMachineFunctionProxy::Result::invalidate(
+    MachineFunction &IR, const PreservedAnalyses &PA,
+    MachineFunctionAnalysisManager::Invalidator &Inv) {
+  // MachineFunction passes should not invalidate Function analyses.
+  // TODO: verify that PA doesn't invalidate Function analyses.
+  return false;
+}
 
-Error MachineFunctionPassManager::run(Module &M,
-                                      MachineFunctionAnalysisManager &MFAM) {
-  // MachineModuleAnalysis is a module analysis pass that is never invalidated
-  // because we don't run any module pass in codegen pipeline. This is very
-  // important because the codegen state is stored in MMI which is the analysis
-  // result of MachineModuleAnalysis. MMI should not be recomputed.
-  auto &MMI = MFAM.getResult<MachineModuleAnalysis>(M).getMMI();
-
-  (void)RequireCodeGenSCCOrder;
-  assert(!RequireCodeGenSCCOrder && "not implemented");
-
-  // M is unused here
-  PassInstrumentation PI = MFAM.getResult<PassInstrumentationAnalysis>(M);
-
-  // Add a PIC to verify machine functions.
-  if (VerifyMachineFunction) {
-    // No need to pop this callback later since MIR pipeline is flat which means
-    // current pipeline is the top-level pipeline. Callbacks are not used after
-    // current pipeline.
-    PI.pushBeforeNonSkippedPassCallback([](StringRef PassID, Any IR) {
-      assert(llvm::any_cast<const MachineFunction *>(&IR));
-      const MachineFunction *MF = llvm::any_cast<const MachineFunction *>(IR);
-      assert(MF && "Machine function should be valid for printing");
-      std::string Banner = std::string("After ") + std::string(PassID);
-      verifyMachineFunction(Banner, *MF);
-    });
+template <>
+bool MachineFunctionAnalysisManagerModuleProxy::Result::invalidate(
+    Module &M, const PreservedAnalyses &PA,
+    ModuleAnalysisManager::Invalidator &Inv) {
+  // If literally everything is preserved, we're done.
+  if (PA.areAllPreserved())
+    return false; // This is still a valid proxy.
+
+  // If this proxy isn't marked as preserved, then even if the result remains
+  // valid, the key itself may no longer be valid, so we clear everything.
+  //
+  // Note that in order to preserve this proxy, a module pass must ensure that
+  // the MFAM has been completely updated to handle the deletion of functions.
+  // Specifically, any MFAM-cached results for those functions need to have been
+  // forcibly cleared. When preserved, this proxy will only invalidate results
+  // cached on functions *still in the module* at the end of the module pass.
+  auto PAC = PA.getChecker<MachineFunctionAnalysisManagerModuleProxy>();
+  if (!PAC.preserved() && !PAC.preservedSet<AllAnalysesOn<Module>>()) {
+    InnerAM->clear();
+    return true;
   }
 
-  for (auto &F : InitializationFuncs) {
-    if (auto Err = F(M, MFAM))
-      return Err;
+  // FIXME: be more precise, see
+  // FunctionAnalysisManagerModuleProxy::Result::invalidate.
+  if (!PA.allAnalysesInSetPreserved<AllAnalysesOn<MachineFunction>>()) {
+    InnerAM->clear();
+    return true;
   }
 
-  unsigned Idx = 0;
-  size_t Size = Passes.size();
-  do {
-    // Run machine module passes
-    for (; MachineModulePasses.count(Idx) && Idx != Size; ++Idx) {
-      if (!PI.runBeforePass<Module>(*Passes[Idx], M))
-        continue;
-      if (auto Err = MachineModulePasses.at(Idx)(M, MFAM))
-        return Err;
-      PI.runAfterPass(*Passes[Idx], M, PreservedAnalyses::all());
-    }
-
-    // Finish running all passes.
-    if (Idx == Size)
-      break;
-
-    // Run machine function passes
-
-    // Get index range of machine function passes.
-    unsigned Begin = Idx;
-    for (; !MachineModulePasses.count(Idx) && Idx != Size; ++Idx)
-      ;
-
-    for (Function &F : M) {
-      // Do not codegen any 'available_externally' functions at all, they have
-      // definitions outside the translation unit.
-      if (F.hasAvailableExternallyLinkage())
-        continue;
-
-      MachineFunction &MF = MMI.getOrCreateMachineFunction(F);
-
-      for (unsigned I = Begin, E = Idx; I != E; ++I) {
-        auto *P = Passes[I].get();
+  // Return false to indicate that this result is still a valid proxy.
+  return false;
+}
 
-        if (!PI.runBeforePass<MachineFunction>(*P, MF))
-          continue;
+PreservedAnalyses
+ModuleToMachineFunctionPassAdaptor::run(Module &M, ModuleAnalysisManager &AM) {
+  auto &MMI = AM.getResult<MachineModuleAnalysis>(M).getMMI();
+  MachineFunctionAnalysisManager &MFAM =
+      AM.getResult<MachineFunctionAnalysisManagerModuleProxy>(M).getManager();
+  PassInstrumentation PI = AM.getResult<PassInstrumentationAnalysis>(M);
+  PreservedAnalyses PA = PreservedAnalyses::all();
+  for (Function &F : M) {
+    // Do not codegen any 'available_externally' functions at all, they have
+    // definitions outside the translation unit.
+    if (F.hasAvailableExternallyLinkage())
+      continue;
+
+    MachineFunction &MF = MMI.getOrCreateMachineFunction(F);
+
+    if (!PI.runBeforePass<MachineFunction>(*Pass, MF))
+      continue;
+    PreservedAnalyses PassPA = Pass->run(MF, MFAM);
+    if (MMI.getMachineFunction(F)) {
+      MFAM.invalidate(MF, PassPA);
+      PI.runAfterPass(*Pass, MF, PassPA);
+    } else {
+      MFAM.clear(MF, F.getName());
+      PI.runAfterPassInvalidated<MachineFunction>(*Pass, PassPA);
+    }
+    PA.intersect(std::move(PassPA));
+  }
 
-        // TODO: EmitSizeRemarks
-        PreservedAnalyses PassPA = P->run(MF, MFAM);
+  return PA;
+}
 
-        // MF is dangling after FreeMachineFunctionPass
-        if (P->name() != FreeMachineFunctionPass::name()) {
-          MFAM.invalidate(MF, PassPA);
+void ModuleToMachineFunctionPassAdaptor::printPipeline(
+    raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
+  OS << "machine-function(";
+  Pass->printPipeline(OS, MapClassName2PassName);
+  OS << ')';
+}
 
-          PI.runAfterPass(*P, MF, PassPA);
-        }
-      }
+template <>
+PreservedAnalyses
+PassManager<MachineFunction>::run(MachineFunction &MF,
+                                  AnalysisManager<MachineFunction> &MFAM) {
+  PassInstrumentation PI = MFAM.getResult<PassInstrumentationAnalysis>(MF);
+  Function &F = MF.getFunction();
+  MachineModuleInfo &MMI =
+      MFAM.getResult<ModuleAnalysisManagerMachineFunctionProxy>(MF)
+          .getCachedResult<MachineModuleAnalysis>(*F.getParent())
+          ->getMMI();
+  PreservedAnalyses PA = PreservedAnalyses::all();
+  for (auto &Pass : Passes) {
+    if (!PI.runBeforePass<MachineFunction>(*Pass, MF))
+      continue;
+
+    PreservedAnalyses PassPA = Pass->run(MF, MFAM);
+    if (MMI.getMachineFunction(F)) {
+      MFAM.invalidate(MF, PassPA);
+      PI.runAfterPass(*Pass, MF, PassPA);
+    } else {
+      MFAM.clear(MF, F.getName());
+      PI.runAfterPassInvalidated<MachineFunction>(*Pass, PassPA);
     }
-  } while (true);
-
-  for (auto &F : FinalizationFuncs) {
-    if (auto Err = F(M, MFAM))
-      return Err;
+    PA.intersect(std::move(PassPA));
   }
-
-  return Error::success();
+  return PA;
 }
 
 } // namespace llvm
diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp
index 697e0da..1bda19b 100644
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -768,7 +768,6 @@ static void getUnderlyingObjects(const MachineInstr *MI,
       Objs.clear();
       return;
     }
-    Objs.push_back(V);
   }
 }
 
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 89ef648..6a28bc8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -76,6 +76,8 @@
 #include <utility>
 #include <variant>
 
+#include "MatchContext.h"
+
 using namespace llvm;
 
 #define DEBUG_TYPE "dagcombine"
@@ -888,141 +890,6 @@ public:
   void NodeInserted(SDNode *N) override { DC.ConsiderForPruning(N); }
 };
 
-class EmptyMatchContext {
-  SelectionDAG &DAG;
-  const TargetLowering &TLI;
-
-public:
-  EmptyMatchContext(SelectionDAG &DAG, const TargetLowering &TLI, SDNode *Root)
-      : DAG(DAG), TLI(TLI) {}
-
-  bool match(SDValue OpN, unsigned Opcode) const {
-    return Opcode == OpN->getOpcode();
-  }
-
-  // Same as SelectionDAG::getNode().
-  template <typename... ArgT> SDValue getNode(ArgT &&...Args) {
-    return DAG.getNode(std::forward<ArgT>(Args)...);
-  }
-
-  bool isOperationLegalOrCustom(unsigned Op, EVT VT,
-                                bool LegalOnly = false) const {
-    return TLI.isOperationLegalOrCustom(Op, VT, LegalOnly);
-  }
-};
-
-class VPMatchContext {
-  SelectionDAG &DAG;
-  const TargetLowering &TLI;
-  SDValue RootMaskOp;
-  SDValue RootVectorLenOp;
-
-public:
-  VPMatchContext(SelectionDAG &DAG, const TargetLowering &TLI, SDNode *Root)
-      : DAG(DAG), TLI(TLI), RootMaskOp(), RootVectorLenOp() {
-    assert(Root->isVPOpcode());
-    if (auto RootMaskPos = ISD::getVPMaskIdx(Root->getOpcode()))
-      RootMaskOp = Root->getOperand(*RootMaskPos);
-    else if (Root->getOpcode() == ISD::VP_SELECT)
-      RootMaskOp = DAG.getAllOnesConstant(SDLoc(Root),
-                                          Root->getOperand(0).getValueType());
-
-    if (auto RootVLenPos =
-            ISD::getVPExplicitVectorLengthIdx(Root->getOpcode()))
-      RootVectorLenOp = Root->getOperand(*RootVLenPos);
-  }
-
-  /// whether \p OpVal is a node that is functionally compatible with the
-  /// NodeType \p Opc
-  bool match(SDValue OpVal, unsigned Opc) const {
-    if (!OpVal->isVPOpcode())
-      return OpVal->getOpcode() == Opc;
-
-    auto BaseOpc = ISD::getBaseOpcodeForVP(OpVal->getOpcode(),
-                                           !OpVal->getFlags().hasNoFPExcept());
-    if (BaseOpc != Opc)
-      return false;
-
-    // Make sure the mask of OpVal is true mask or is same as Root's.
-    unsigned VPOpcode = OpVal->getOpcode();
-    if (auto MaskPos = ISD::getVPMaskIdx(VPOpcode)) {
-      SDValue MaskOp = OpVal.getOperand(*MaskPos);
-      if (RootMaskOp != MaskOp &&
-          !ISD::isConstantSplatVectorAllOnes(MaskOp.getNode()))
-        return false;
-    }
-
-    // Make sure the EVL of OpVal is same as Root's.
-    if (auto VLenPos = ISD::getVPExplicitVectorLengthIdx(VPOpcode))
-      if (RootVectorLenOp != OpVal.getOperand(*VLenPos))
-        return false;
-    return true;
-  }
-
-  // Specialize based on number of operands.
-  // TODO emit VP intrinsics where MaskOp/VectorLenOp != null
-  // SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT) { return
-  // DAG.getNode(Opcode, DL, VT); }
-  SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue Operand) {
-    unsigned VPOpcode = ISD::getVPForBaseOpcode(Opcode);
-    assert(ISD::getVPMaskIdx(VPOpcode) == 1 &&
-           ISD::getVPExplicitVectorLengthIdx(VPOpcode) == 2);
-    return DAG.getNode(VPOpcode, DL, VT,
-                       {Operand, RootMaskOp, RootVectorLenOp});
-  }
-
-  SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1,
-                  SDValue N2) {
-    unsigned VPOpcode = ISD::getVPForBaseOpcode(Opcode);
-    assert(ISD::getVPMaskIdx(VPOpcode) == 2 &&
-           ISD::getVPExplicitVectorLengthIdx(VPOpcode) == 3);
-    return DAG.getNode(VPOpcode, DL, VT,
-                       {N1, N2, RootMaskOp, RootVectorLenOp});
-  }
-
-  SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1,
-                  SDValue N2, SDValue N3) {
-    unsigned VPOpcode = ISD::getVPForBaseOpcode(Opcode);
-    assert(ISD::getVPMaskIdx(VPOpcode) == 3 &&
-           ISD::getVPExplicitVectorLengthIdx(VPOpcode) == 4);
-    return DAG.getNode(VPOpcode, DL, VT,
-                       {N1, N2, N3, RootMaskOp, RootVectorLenOp});
-  }
-
-  SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue Operand,
-                  SDNodeFlags Flags) {
-    unsigned VPOpcode = ISD::getVPForBaseOpcode(Opcode);
-    assert(ISD::getVPMaskIdx(VPOpcode) == 1 &&
-           ISD::getVPExplicitVectorLengthIdx(VPOpcode) == 2);
-    return DAG.getNode(VPOpcode, DL, VT, {Operand, RootMaskOp, RootVectorLenOp},
-                       Flags);
-  }
-
-  SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1,
-                  SDValue N2, SDNodeFlags Flags) {
-    unsigned VPOpcode = ISD::getVPForBaseOpcode(Opcode);
-    assert(ISD::getVPMaskIdx(VPOpcode) == 2 &&
-           ISD::getVPExplicitVectorLengthIdx(VPOpcode) == 3);
-    return DAG.getNode(VPOpcode, DL, VT, {N1, N2, RootMaskOp, RootVectorLenOp},
-                       Flags);
-  }
-
-  SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1,
-                  SDValue N2, SDValue N3, SDNodeFlags Flags) {
-    unsigned VPOpcode = ISD::getVPForBaseOpcode(Opcode);
-    assert(ISD::getVPMaskIdx(VPOpcode) == 3 &&
-           ISD::getVPExplicitVectorLengthIdx(VPOpcode) == 4);
-    return DAG.getNode(VPOpcode, DL, VT,
-                       {N1, N2, N3, RootMaskOp, RootVectorLenOp}, Flags);
-  }
-
-  bool isOperationLegalOrCustom(unsigned Op, EVT VT,
-                                bool LegalOnly = false) const {
-    unsigned VPOp = ISD::getVPForBaseOpcode(Op);
-    return TLI.isOperationLegalOrCustom(VPOp, VT, LegalOnly);
-  }
-};
-
 } // end anonymous namespace
 
 //===----------------------------------------------------------------------===//
@@ -13997,6 +13864,13 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
   if (SDValue Res = tryToFoldExtendSelectLoad(N, TLI, DAG, Level))
     return Res;
 
+  // CSE zext nneg with sext if the zext is not free.
+  if (N->getFlags().hasNonNeg() && !TLI.isZExtFree(N0.getValueType(), VT)) {
+    SDNode *CSENode = DAG.getNodeIfExists(ISD::SIGN_EXTEND, N->getVTList(), N0);
+    if (CSENode)
+      return SDValue(CSENode, 0);
+  }
+
   return SDValue();
 }
 
diff --git a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
index 5651498..246762d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
@@ -1188,11 +1188,24 @@ void FastISel::handleDbgInfo(const Instruction *II) {
   MIMD = MIMetadata();
 
   // Reverse order of debug records, because fast-isel walks through backwards.
-  for (DbgRecord &DPR : llvm::reverse(II->getDbgValueRange())) {
+  for (DbgRecord &DR : llvm::reverse(II->getDbgValueRange())) {
     flushLocalValueMap();
     recomputeInsertPt();
 
-    DPValue &DPV = cast<DPValue>(DPR);
+    if (DPLabel *DPL = dyn_cast<DPLabel>(&DR)) {
+      assert(DPL->getLabel() && "Missing label");
+      if (!FuncInfo.MF->getMMI().hasDebugInfo()) {
+        LLVM_DEBUG(dbgs() << "Dropping debug info for " << *DPL << "\n");
+        continue;
+      }
+
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DPL->getDebugLoc(),
+              TII.get(TargetOpcode::DBG_LABEL))
+          .addMetadata(DPL->getLabel());
+      continue;
+    }
+
+    DPValue &DPV = cast<DPValue>(DR);
 
     Value *V = nullptr;
     if (!DPV.hasArgList())
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index a4ba261..df17d65 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -217,7 +217,15 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
   case ISD::SSUBSAT:
   case ISD::USUBSAT:
   case ISD::SSHLSAT:
-  case ISD::USHLSAT:     Res = PromoteIntRes_ADDSUBSHLSAT(N); break;
+  case ISD::USHLSAT:
+    Res = PromoteIntRes_ADDSUBSHLSAT<EmptyMatchContext>(N);
+    break;
+  case ISD::VP_SADDSAT:
+  case ISD::VP_UADDSAT:
+  case ISD::VP_SSUBSAT:
+  case ISD::VP_USUBSAT:
+    Res = PromoteIntRes_ADDSUBSHLSAT<VPMatchContext>(N);
+    break;
 
   case ISD::SMULFIX:
   case ISD::SMULFIXSAT:
@@ -934,6 +942,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_Overflow(SDNode *N) {
   return DAG.getBoolExtOrTrunc(Res.getValue(1), dl, NVT, VT);
 }
 
+template <class MatchContextClass>
 SDValue DAGTypeLegalizer::PromoteIntRes_ADDSUBSHLSAT(SDNode *N) {
   // If the promoted type is legal, we can convert this to:
   //   1. ANY_EXTEND iN to iM
@@ -945,11 +954,13 @@ SDValue DAGTypeLegalizer::PromoteIntRes_ADDSUBSHLSAT(SDNode *N) {
   SDLoc dl(N);
   SDValue Op1 = N->getOperand(0);
   SDValue Op2 = N->getOperand(1);
+  MatchContextClass matcher(DAG, TLI, N);
   unsigned OldBits = Op1.getScalarValueSizeInBits();
 
-  unsigned Opcode = N->getOpcode();
+  unsigned Opcode = matcher.getRootBaseOpcode();
   bool IsShift = Opcode == ISD::USHLSAT || Opcode == ISD::SSHLSAT;
 
+  // FIXME: We need vp-aware PromotedInteger functions.
   SDValue Op1Promoted, Op2Promoted;
   if (IsShift) {
     Op1Promoted = GetPromotedInteger(Op1);
@@ -968,18 +979,18 @@ SDValue DAGTypeLegalizer::PromoteIntRes_ADDSUBSHLSAT(SDNode *N) {
     APInt MaxVal = APInt::getAllOnes(OldBits).zext(NewBits);
     SDValue SatMax = DAG.getConstant(MaxVal, dl, PromotedType);
     SDValue Add =
-        DAG.getNode(ISD::ADD, dl, PromotedType, Op1Promoted, Op2Promoted);
-    return DAG.getNode(ISD::UMIN, dl, PromotedType, Add, SatMax);
+        matcher.getNode(ISD::ADD, dl, PromotedType, Op1Promoted, Op2Promoted);
+    return matcher.getNode(ISD::UMIN, dl, PromotedType, Add, SatMax);
   }
 
   // USUBSAT can always be promoted as long as we have zero-extended the args.
   if (Opcode == ISD::USUBSAT)
-    return DAG.getNode(ISD::USUBSAT, dl, PromotedType, Op1Promoted,
-                       Op2Promoted);
+    return matcher.getNode(ISD::USUBSAT, dl, PromotedType, Op1Promoted,
+                           Op2Promoted);
 
   // Shift cannot use a min/max expansion, we can't detect overflow if all of
   // the bits have been shifted out.
-  if (IsShift || TLI.isOperationLegal(Opcode, PromotedType)) {
+  if (IsShift || matcher.isOperationLegal(Opcode, PromotedType)) {
     unsigned ShiftOp;
     switch (Opcode) {
     case ISD::SADDSAT:
@@ -1002,11 +1013,11 @@ SDValue DAGTypeLegalizer::PromoteIntRes_ADDSUBSHLSAT(SDNode *N) {
         DAG.getNode(ISD::SHL, dl, PromotedType, Op1Promoted, ShiftAmount);
     if (!IsShift)
       Op2Promoted =
-          DAG.getNode(ISD::SHL, dl, PromotedType, Op2Promoted, ShiftAmount);
+          matcher.getNode(ISD::SHL, dl, PromotedType, Op2Promoted, ShiftAmount);
 
     SDValue Result =
-        DAG.getNode(Opcode, dl, PromotedType, Op1Promoted, Op2Promoted);
-    return DAG.getNode(ShiftOp, dl, PromotedType, Result, ShiftAmount);
+        matcher.getNode(Opcode, dl, PromotedType, Op1Promoted, Op2Promoted);
+    return matcher.getNode(ShiftOp, dl, PromotedType, Result, ShiftAmount);
   }
 
   unsigned AddOp = Opcode == ISD::SADDSAT ? ISD::ADD : ISD::SUB;
@@ -1015,9 +1026,9 @@ SDValue DAGTypeLegalizer::PromoteIntRes_ADDSUBSHLSAT(SDNode *N) {
   SDValue SatMin = DAG.getConstant(MinVal, dl, PromotedType);
   SDValue SatMax = DAG.getConstant(MaxVal, dl, PromotedType);
   SDValue Result =
-      DAG.getNode(AddOp, dl, PromotedType, Op1Promoted, Op2Promoted);
-  Result = DAG.getNode(ISD::SMIN, dl, PromotedType, Result, SatMax);
-  Result = DAG.getNode(ISD::SMAX, dl, PromotedType, Result, SatMin);
+      matcher.getNode(AddOp, dl, PromotedType, Op1Promoted, Op2Promoted);
+  Result = matcher.getNode(ISD::SMIN, dl, PromotedType, Result, SatMax);
+  Result = matcher.getNode(ISD::SMAX, dl, PromotedType, Result, SatMin);
   return Result;
 }
 
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 9114987..3c84f67 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -15,6 +15,7 @@
 #ifndef LLVM_LIB_CODEGEN_SELECTIONDAG_LEGALIZETYPES_H
 #define LLVM_LIB_CODEGEN_SELECTIONDAG_LEGALIZETYPES_H
 
+#include "MatchContext.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/TargetLowering.h"
@@ -355,6 +356,7 @@ private:
   SDValue PromoteIntRes_VAARG(SDNode *N);
   SDValue PromoteIntRes_VSCALE(SDNode *N);
   SDValue PromoteIntRes_XMULO(SDNode *N, unsigned ResNo);
+  template <class MatchContextClass>
   SDValue PromoteIntRes_ADDSUBSHLSAT(SDNode *N);
   SDValue PromoteIntRes_MULFIX(SDNode *N);
   SDValue PromoteIntRes_DIVFIX(SDNode *N);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 2a7aaf8..6074498 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -404,8 +404,6 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
   case ISD::FCEIL:
   case ISD::FTRUNC:
   case ISD::FRINT:
-  case ISD::LRINT:
-  case ISD::LLRINT:
   case ISD::FNEARBYINT:
   case ISD::FROUND:
   case ISD::FROUNDEVEN:
@@ -455,6 +453,8 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
                                               Node->getValueType(0), Scale);
     break;
   }
+  case ISD::LRINT:
+  case ISD::LLRINT:
   case ISD::SINT_TO_FP:
   case ISD::UINT_TO_FP:
   case ISD::VECREDUCE_ADD:
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 7fc2526..90cda2a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -1163,10 +1163,10 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::SMAX: case ISD::VP_SMAX:
   case ISD::UMIN: case ISD::VP_UMIN:
   case ISD::UMAX: case ISD::VP_UMAX:
-  case ISD::SADDSAT:
-  case ISD::UADDSAT:
-  case ISD::SSUBSAT:
-  case ISD::USUBSAT:
+  case ISD::SADDSAT: case ISD::VP_SADDSAT:
+  case ISD::UADDSAT: case ISD::VP_UADDSAT:
+  case ISD::SSUBSAT: case ISD::VP_SSUBSAT:
+  case ISD::USUBSAT: case ISD::VP_USUBSAT:
   case ISD::SSHLSAT:
   case ISD::USHLSAT:
   case ISD::ROTL:
@@ -4140,10 +4140,10 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::SMAX: case ISD::VP_SMAX:
   case ISD::UMIN: case ISD::VP_UMIN:
   case ISD::UMAX: case ISD::VP_UMAX:
-  case ISD::UADDSAT:
-  case ISD::SADDSAT:
-  case ISD::USUBSAT:
-  case ISD::SSUBSAT:
+  case ISD::UADDSAT: case ISD::VP_UADDSAT:
+  case ISD::SADDSAT: case ISD::VP_SADDSAT:
+  case ISD::USUBSAT: case ISD::VP_USUBSAT:
+  case ISD::SSUBSAT: case ISD::VP_SSUBSAT:
   case ISD::SSHLSAT:
   case ISD::USHLSAT:
   case ISD::ROTL:
diff --git a/llvm/lib/CodeGen/SelectionDAG/MatchContext.h b/llvm/lib/CodeGen/SelectionDAG/MatchContext.h
new file mode 100644
index 0000000..f965cb9
--- /dev/null
+++ b/llvm/lib/CodeGen/SelectionDAG/MatchContext.h
@@ -0,0 +1,175 @@
+//===---------------- llvm/CodeGen/MatchContext.h  --------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the EmptyMatchContext class and VPMatchContext class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_SELECTIONDAG_MATCHCONTEXT_H
+#define LLVM_LIB_CODEGEN_SELECTIONDAG_MATCHCONTEXT_H
+
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/TargetLowering.h"
+
+using namespace llvm;
+
+namespace {
+class EmptyMatchContext {
+  SelectionDAG &DAG;
+  const TargetLowering &TLI;
+  SDNode *Root;
+
+public:
+  EmptyMatchContext(SelectionDAG &DAG, const TargetLowering &TLI, SDNode *Root)
+      : DAG(DAG), TLI(TLI), Root(Root) {}
+
+  unsigned getRootBaseOpcode() { return Root->getOpcode(); }
+  bool match(SDValue OpN, unsigned Opcode) const {
+    return Opcode == OpN->getOpcode();
+  }
+
+  // Same as SelectionDAG::getNode().
+  template <typename... ArgT> SDValue getNode(ArgT &&...Args) {
+    return DAG.getNode(std::forward<ArgT>(Args)...);
+  }
+
+  bool isOperationLegal(unsigned Op, EVT VT) const {
+    return TLI.isOperationLegal(Op, VT);
+  }
+
+  bool isOperationLegalOrCustom(unsigned Op, EVT VT,
+                                bool LegalOnly = false) const {
+    return TLI.isOperationLegalOrCustom(Op, VT, LegalOnly);
+  }
+};
+
+class VPMatchContext {
+  SelectionDAG &DAG;
+  const TargetLowering &TLI;
+  SDValue RootMaskOp;
+  SDValue RootVectorLenOp;
+  SDNode *Root;
+
+public:
+  VPMatchContext(SelectionDAG &DAG, const TargetLowering &TLI, SDNode *_Root)
+      : DAG(DAG), TLI(TLI), RootMaskOp(), RootVectorLenOp() {
+    Root = _Root;
+    assert(Root->isVPOpcode());
+    if (auto RootMaskPos = ISD::getVPMaskIdx(Root->getOpcode()))
+      RootMaskOp = Root->getOperand(*RootMaskPos);
+    else if (Root->getOpcode() == ISD::VP_SELECT)
+      RootMaskOp = DAG.getAllOnesConstant(SDLoc(Root),
+                                          Root->getOperand(0).getValueType());
+
+    if (auto RootVLenPos = ISD::getVPExplicitVectorLengthIdx(Root->getOpcode()))
+      RootVectorLenOp = Root->getOperand(*RootVLenPos);
+  }
+
+  unsigned getRootBaseOpcode() {
+    std::optional<unsigned> Opcode = ISD::getBaseOpcodeForVP(
+        Root->getOpcode(), !Root->getFlags().hasNoFPExcept());
+    assert(Opcode.has_value());
+    return *Opcode;
+  }
+
+  /// whether \p OpVal is a node that is functionally compatible with the
+  /// NodeType \p Opc
+  bool match(SDValue OpVal, unsigned Opc) const {
+    if (!OpVal->isVPOpcode())
+      return OpVal->getOpcode() == Opc;
+
+    auto BaseOpc = ISD::getBaseOpcodeForVP(OpVal->getOpcode(),
+                                           !OpVal->getFlags().hasNoFPExcept());
+    if (BaseOpc != Opc)
+      return false;
+
+    // Make sure the mask of OpVal is true mask or is same as Root's.
+    unsigned VPOpcode = OpVal->getOpcode();
+    if (auto MaskPos = ISD::getVPMaskIdx(VPOpcode)) {
+      SDValue MaskOp = OpVal.getOperand(*MaskPos);
+      if (RootMaskOp != MaskOp &&
+          !ISD::isConstantSplatVectorAllOnes(MaskOp.getNode()))
+        return false;
+    }
+
+    // Make sure the EVL of OpVal is same as Root's.
+    if (auto VLenPos = ISD::getVPExplicitVectorLengthIdx(VPOpcode))
+      if (RootVectorLenOp != OpVal.getOperand(*VLenPos))
+        return false;
+    return true;
+  }
+
+  // Specialize based on number of operands.
+  // TODO emit VP intrinsics where MaskOp/VectorLenOp != null
+  // SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT) { return
+  // DAG.getNode(Opcode, DL, VT); }
+  SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue Operand) {
+    unsigned VPOpcode = ISD::getVPForBaseOpcode(Opcode);
+    assert(ISD::getVPMaskIdx(VPOpcode) == 1 &&
+           ISD::getVPExplicitVectorLengthIdx(VPOpcode) == 2);
+    return DAG.getNode(VPOpcode, DL, VT,
+                       {Operand, RootMaskOp, RootVectorLenOp});
+  }
+
+  SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1,
+                  SDValue N2) {
+    unsigned VPOpcode = ISD::getVPForBaseOpcode(Opcode);
+    assert(ISD::getVPMaskIdx(VPOpcode) == 2 &&
+           ISD::getVPExplicitVectorLengthIdx(VPOpcode) == 3);
+    return DAG.getNode(VPOpcode, DL, VT, {N1, N2, RootMaskOp, RootVectorLenOp});
+  }
+
+  SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1,
+                  SDValue N2, SDValue N3) {
+    unsigned VPOpcode = ISD::getVPForBaseOpcode(Opcode);
+    assert(ISD::getVPMaskIdx(VPOpcode) == 3 &&
+           ISD::getVPExplicitVectorLengthIdx(VPOpcode) == 4);
+    return DAG.getNode(VPOpcode, DL, VT,
+                       {N1, N2, N3, RootMaskOp, RootVectorLenOp});
+  }
+
+  SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue Operand,
+                  SDNodeFlags Flags) {
+    unsigned VPOpcode = ISD::getVPForBaseOpcode(Opcode);
+    assert(ISD::getVPMaskIdx(VPOpcode) == 1 &&
+           ISD::getVPExplicitVectorLengthIdx(VPOpcode) == 2);
+    return DAG.getNode(VPOpcode, DL, VT, {Operand, RootMaskOp, RootVectorLenOp},
+                       Flags);
+  }
+
+  SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1,
+                  SDValue N2, SDNodeFlags Flags) {
+    unsigned VPOpcode = ISD::getVPForBaseOpcode(Opcode);
+    assert(ISD::getVPMaskIdx(VPOpcode) == 2 &&
+           ISD::getVPExplicitVectorLengthIdx(VPOpcode) == 3);
+    return DAG.getNode(VPOpcode, DL, VT, {N1, N2, RootMaskOp, RootVectorLenOp},
+                       Flags);
+  }
+
+  SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1,
+                  SDValue N2, SDValue N3, SDNodeFlags Flags) {
+    unsigned VPOpcode = ISD::getVPForBaseOpcode(Opcode);
+    assert(ISD::getVPMaskIdx(VPOpcode) == 3 &&
+           ISD::getVPExplicitVectorLengthIdx(VPOpcode) == 4);
+    return DAG.getNode(VPOpcode, DL, VT,
+                       {N1, N2, N3, RootMaskOp, RootVectorLenOp}, Flags);
+  }
+
+  bool isOperationLegal(unsigned Op, EVT VT) const {
+    unsigned VPOp = ISD::getVPForBaseOpcode(Op);
+    return TLI.isOperationLegal(VPOp, VT);
+  }
+
+  bool isOperationLegalOrCustom(unsigned Op, EVT VT,
+                                bool LegalOnly = false) const {
+    unsigned VPOp = ISD::getVPForBaseOpcode(Op);
+    return TLI.isOperationLegalOrCustom(VPOp, VT, LegalOnly);
+  }
+};
+} // end anonymous namespace
+#endif
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index add92cf..0ceda27 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -9044,29 +9044,6 @@ SDValue SelectionDAG::getIndexedStoreVP(SDValue OrigStore, const SDLoc &dl,
 SDValue SelectionDAG::getStridedLoadVP(
     ISD::MemIndexedMode AM, ISD::LoadExtType ExtType, EVT VT, const SDLoc &DL,
     SDValue Chain, SDValue Ptr, SDValue Offset, SDValue Stride, SDValue Mask,
-    SDValue EVL, MachinePointerInfo PtrInfo, EVT MemVT, Align Alignment,
-    MachineMemOperand::Flags MMOFlags, const AAMDNodes &AAInfo,
-    const MDNode *Ranges, bool IsExpanding) {
-  assert(Chain.getValueType() == MVT::Other && "Invalid chain type");
-
-  MMOFlags |= MachineMemOperand::MOLoad;
-  assert((MMOFlags & MachineMemOperand::MOStore) == 0);
-  // If we don't have a PtrInfo, infer the trivial frame index case to simplify
-  // clients.
-  if (PtrInfo.V.isNull())
-    PtrInfo = InferPointerInfo(PtrInfo, *this, Ptr, Offset);
-
-  uint64_t Size = MemoryLocation::UnknownSize;
-  MachineFunction &MF = getMachineFunction();
-  MachineMemOperand *MMO = MF.getMachineMemOperand(PtrInfo, MMOFlags, Size,
-                                                   Alignment, AAInfo, Ranges);
-  return getStridedLoadVP(AM, ExtType, VT, DL, Chain, Ptr, Offset, Stride, Mask,
-                          EVL, MemVT, MMO, IsExpanding);
-}
-
-SDValue SelectionDAG::getStridedLoadVP(
-    ISD::MemIndexedMode AM, ISD::LoadExtType ExtType, EVT VT, const SDLoc &DL,
-    SDValue Chain, SDValue Ptr, SDValue Offset, SDValue Stride, SDValue Mask,
     SDValue EVL, EVT MemVT, MachineMemOperand *MMO, bool IsExpanding) {
   bool Indexed = AM != ISD::UNINDEXED;
   assert((Indexed || Offset.isUndef()) && "Unindexed load with an offset!");
@@ -9098,17 +9075,6 @@ SDValue SelectionDAG::getStridedLoadVP(
   return V;
 }
 
-SDValue SelectionDAG::getStridedLoadVP(
-    EVT VT, const SDLoc &DL, SDValue Chain, SDValue Ptr, SDValue Stride,
-    SDValue Mask, SDValue EVL, MachinePointerInfo PtrInfo, MaybeAlign Alignment,
-    MachineMemOperand::Flags MMOFlags, const AAMDNodes &AAInfo,
-    const MDNode *Ranges, bool IsExpanding) {
-  SDValue Undef = getUNDEF(Ptr.getValueType());
-  return getStridedLoadVP(ISD::UNINDEXED, ISD::NON_EXTLOAD, VT, DL, Chain, Ptr,
-                          Undef, Stride, Mask, EVL, PtrInfo, VT, Alignment,
-                          MMOFlags, AAInfo, Ranges, IsExpanding);
-}
-
 SDValue SelectionDAG::getStridedLoadVP(EVT VT, const SDLoc &DL, SDValue Chain,
                                        SDValue Ptr, SDValue Stride,
                                        SDValue Mask, SDValue EVL,
@@ -9121,18 +9087,6 @@ SDValue SelectionDAG::getStridedLoadVP(EVT VT, const SDLoc &DL, SDValue Chain,
 
 SDValue SelectionDAG::getExtStridedLoadVP(
     ISD::LoadExtType ExtType, const SDLoc &DL, EVT VT, SDValue Chain,
-    SDValue Ptr, SDValue Stride, SDValue Mask, SDValue EVL,
-    MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment,
-    MachineMemOperand::Flags MMOFlags, const AAMDNodes &AAInfo,
-    bool IsExpanding) {
-  SDValue Undef = getUNDEF(Ptr.getValueType());
-  return getStridedLoadVP(ISD::UNINDEXED, ExtType, VT, DL, Chain, Ptr, Undef,
-                          Stride, Mask, EVL, PtrInfo, MemVT, Alignment,
-                          MMOFlags, AAInfo, nullptr, IsExpanding);
-}
-
-SDValue SelectionDAG::getExtStridedLoadVP(
-    ISD::LoadExtType ExtType, const SDLoc &DL, EVT VT, SDValue Chain,
     SDValue Ptr, SDValue Stride, SDValue Mask, SDValue EVL, EVT MemVT,
     MachineMemOperand *MMO, bool IsExpanding) {
   SDValue Undef = getUNDEF(Ptr.getValueType());
@@ -9150,11 +9104,14 @@ SDValue SelectionDAG::getIndexedStridedLoadVP(SDValue OrigLoad, const SDLoc &DL,
   auto MMOFlags =
       SLD->getMemOperand()->getFlags() &
       ~(MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable);
-  return getStridedLoadVP(
-      AM, SLD->getExtensionType(), OrigLoad.getValueType(), DL, SLD->getChain(),
-      Base, Offset, SLD->getStride(), SLD->getMask(), SLD->getVectorLength(),
-      SLD->getPointerInfo(), SLD->getMemoryVT(), SLD->getAlign(), MMOFlags,
-      SLD->getAAInfo(), nullptr, SLD->isExpandingLoad());
+  MachineFunction &MF = getMachineFunction();
+  MachineMemOperand *MMO = MF.getMachineMemOperand(
+      SLD->getPointerInfo(), MMOFlags, SLD->getMemOperand()->getSize(),
+      SLD->getOriginalAlign(), SLD->getAAInfo());
+  return getStridedLoadVP(AM, SLD->getExtensionType(), OrigLoad.getValueType(),
+                          DL, SLD->getChain(), Base, Offset, SLD->getStride(),
+                          SLD->getMask(), SLD->getVectorLength(),
+                          SLD->getMemoryVT(), MMO, SLD->isExpandingLoad());
 }
 
 SDValue SelectionDAG::getStridedStoreVP(SDValue Chain, const SDLoc &DL,
@@ -9193,26 +9150,6 @@ SDValue SelectionDAG::getStridedStoreVP(SDValue Chain, const SDLoc &DL,
   return V;
 }
 
-SDValue SelectionDAG::getTruncStridedStoreVP(
-    SDValue Chain, const SDLoc &DL, SDValue Val, SDValue Ptr, SDValue Stride,
-    SDValue Mask, SDValue EVL, MachinePointerInfo PtrInfo, EVT SVT,
-    Align Alignment, MachineMemOperand::Flags MMOFlags, const AAMDNodes &AAInfo,
-    bool IsCompressing) {
-  assert(Chain.getValueType() == MVT::Other && "Invalid chain type");
-
-  MMOFlags |= MachineMemOperand::MOStore;
-  assert((MMOFlags & MachineMemOperand::MOLoad) == 0);
-
-  if (PtrInfo.V.isNull())
-    PtrInfo = InferPointerInfo(PtrInfo, *this, Ptr);
-
-  MachineFunction &MF = getMachineFunction();
-  MachineMemOperand *MMO = MF.getMachineMemOperand(
-      PtrInfo, MMOFlags, MemoryLocation::UnknownSize, Alignment, AAInfo);
-  return getTruncStridedStoreVP(Chain, DL, Val, Ptr, Stride, Mask, EVL, SVT,
-                                MMO, IsCompressing);
-}
-
 SDValue SelectionDAG::getTruncStridedStoreVP(SDValue Chain, const SDLoc &DL,
                                              SDValue Val, SDValue Ptr,
                                              SDValue Stride, SDValue Mask,
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index e893a5b..ee600d3 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -1241,17 +1241,30 @@ void SelectionDAGBuilder::visitDbgInfo(const Instruction &I) {
                              It->Expr, Vals.size() > 1, It->DL, SDNodeOrder);
       }
     }
-    // We must early-exit here to prevent any DPValues from being emitted below,
-    // as we have just emitted the debug values resulting from assignment
-    // tracking analysis, making any existing DPValues redundant (and probably
-    // less correct).
-    return;
   }
 
+  // We must skip DPValues if they've already been processed above as we
+  // have just emitted the debug values resulting from assignment tracking
+  // analysis, making any existing DPValues redundant (and probably less
+  // correct). We still need to process DPLabels. This does sink DPLabels
+  // to the bottom of the group of debug records. That sholdn't be important
+  // as it does so deterministcally and ordering between DPLabels and DPValues
+  // is immaterial (other than for MIR/IR printing).
+  bool SkipDPValues = DAG.getFunctionVarLocs();
   // Is there is any debug-info attached to this instruction, in the form of
-  // DPValue non-instruction debug-info records.
-  for (DbgRecord &DPR : I.getDbgValueRange()) {
-    DPValue &DPV = cast<DPValue>(DPR);
+  // DbgRecord non-instruction debug-info records.
+  for (DbgRecord &DR : I.getDbgValueRange()) {
+    if (DPLabel *DPL = dyn_cast<DPLabel>(&DR)) {
+      assert(DPL->getLabel() && "Missing label");
+      SDDbgLabel *SDV =
+          DAG.getDbgLabel(DPL->getLabel(), DPL->getDebugLoc(), SDNodeOrder);
+      DAG.AddDbgLabel(SDV);
+      continue;
+    }
+
+    if (SkipDPValues)
+      continue;
+    DPValue &DPV = cast<DPValue>(DR);
     DILocalVariable *Variable = DPV.getVariable();
     DIExpression *Expression = DPV.getExpression();
     dropDanglingDebugInfo(Variable, Expression);
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index a4c5167..07fb891 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -10948,12 +10948,10 @@ SDValue TargetLowering::expandFP_ROUND(SDNode *Node, SelectionDAG &DAG) const {
     Op = expandRoundInexactToOdd(F32, Op, dl, DAG);
     Op = DAG.getNode(ISD::BITCAST, dl, I32, Op);
 
-    // Extract the sign bit and exponent.
-    SDValue SignBitAndExponentField = DAG.getNode(
-        ISD::AND, dl, I32, Op, DAG.getConstant(0xff800000, dl, I32));
-    // Set the quiet bit.
-    SDValue NaN = DAG.getNode(ISD::OR, dl, I32, SignBitAndExponentField,
-                              DAG.getConstant(0x400000, dl, I32));
+    // Conversions should set NaN's quiet bit. This also prevents NaNs from
+    // turning into infinities.
+    SDValue NaN =
+        DAG.getNode(ISD::OR, dl, I32, Op, DAG.getConstant(0x400000, dl, I32));
 
     // Factor in the contribution of the low 16 bits.
     SDValue One = DAG.getConstant(1, dl, I32);
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp b/llvm/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp
index 78f819d..9c65d85 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp
@@ -510,7 +510,7 @@ DWARFDebugNames::Abbrev DWARFDebugNames::AbbrevMapInfo::getTombstoneKey() {
 
 Expected<DWARFDebugNames::AttributeEncoding>
 DWARFDebugNames::NameIndex::extractAttributeEncoding(uint64_t *Offset) {
-  if (*Offset >= EntriesBase) {
+  if (*Offset >= Offsets.EntriesBase) {
     return createStringError(errc::illegal_byte_sequence,
                              "Incorrectly terminated abbreviation table.");
   }
@@ -536,7 +536,7 @@ DWARFDebugNames::NameIndex::extractAttributeEncodings(uint64_t *Offset) {
 
 Expected<DWARFDebugNames::Abbrev>
 DWARFDebugNames::NameIndex::extractAbbrev(uint64_t *Offset) {
-  if (*Offset >= EntriesBase) {
+  if (*Offset >= Offsets.EntriesBase) {
     return createStringError(errc::illegal_byte_sequence,
                              "Incorrectly terminated abbreviation table.");
   }
@@ -552,32 +552,50 @@ DWARFDebugNames::NameIndex::extractAbbrev(uint64_t *Offset) {
   return Abbrev(Code, dwarf::Tag(Tag), AbbrevOffset, std::move(*AttrEncOr));
 }
 
+void llvm::findDebugNamesOffsets(
+    DWARFDebugNames::DWARFDebugNamesOffsets &Offsets, uint64_t HdrSize,
+    dwarf::DwarfFormat Format, const DWARFDebugNames::Header &Hdr) {
+  uint32_t DwarfSize = (Format == llvm::dwarf::DwarfFormat::DWARF64) ? 8 : 4;
+  uint64_t Offset = HdrSize;
+  Offsets.CUsBase = Offset;
+  Offset += Hdr.CompUnitCount * DwarfSize;
+  Offset += Hdr.LocalTypeUnitCount * DwarfSize;
+  Offset += Hdr.ForeignTypeUnitCount * 8;
+
+  Offsets.BucketsBase = Offset;
+  Offset += Hdr.BucketCount * 4;
+
+  Offsets.HashesBase = Offset;
+  if (Hdr.BucketCount > 0)
+    Offset += Hdr.NameCount * 4;
+
+  Offsets.StringOffsetsBase = Offset;
+  Offset += Hdr.NameCount * DwarfSize;
+
+  Offsets.EntryOffsetsBase = Offset;
+  Offset += Hdr.NameCount * DwarfSize;
+
+  Offset += Hdr.AbbrevTableSize;
+  Offsets.EntriesBase = Offset;
+}
+
 Error DWARFDebugNames::NameIndex::extract() {
   const DWARFDataExtractor &AS = Section.AccelSection;
-  uint64_t Offset = Base;
-  if (Error E = Hdr.extract(AS, &Offset))
+  uint64_t hdrSize = Base;
+  if (Error E = Hdr.extract(AS, &hdrSize))
     return E;
 
   const unsigned SectionOffsetSize = dwarf::getDwarfOffsetByteSize(Hdr.Format);
-  CUsBase = Offset;
-  Offset += Hdr.CompUnitCount * SectionOffsetSize;
-  Offset += Hdr.LocalTypeUnitCount * SectionOffsetSize;
-  Offset += Hdr.ForeignTypeUnitCount * 8;
-  BucketsBase = Offset;
-  Offset += Hdr.BucketCount * 4;
-  HashesBase = Offset;
-  if (Hdr.BucketCount > 0)
-    Offset += Hdr.NameCount * 4;
-  StringOffsetsBase = Offset;
-  Offset += Hdr.NameCount * SectionOffsetSize;
-  EntryOffsetsBase = Offset;
-  Offset += Hdr.NameCount * SectionOffsetSize;
+  findDebugNamesOffsets(Offsets, hdrSize, Hdr.Format, Hdr);
+
+  uint64_t Offset =
+      Offsets.EntryOffsetsBase + (Hdr.NameCount * SectionOffsetSize);
 
   if (!AS.isValidOffsetForDataOfSize(Offset, Hdr.AbbrevTableSize))
     return createStringError(errc::illegal_byte_sequence,
                              "Section too small: cannot read abbreviations.");
 
-  EntriesBase = Offset + Hdr.AbbrevTableSize;
+  Offsets.EntriesBase = Offset + Hdr.AbbrevTableSize;
 
   for (;;) {
     auto AbbrevOr = extractAbbrev(&Offset);
@@ -679,7 +697,7 @@ void DWARFDebugNames::Entry::dumpParentIdx(
     return;
   }
 
-  auto AbsoluteOffset = NameIdx->EntriesBase + FormValue.getRawUValue();
+  auto AbsoluteOffset = NameIdx->Offsets.EntriesBase + FormValue.getRawUValue();
   W.getOStream() << "Entry @ 0x" + Twine::utohexstr(AbsoluteOffset);
 }
 
@@ -708,14 +726,15 @@ std::error_code DWARFDebugNames::SentinelError::convertToErrorCode() const {
 uint64_t DWARFDebugNames::NameIndex::getCUOffset(uint32_t CU) const {
   assert(CU < Hdr.CompUnitCount);
   const unsigned SectionOffsetSize = dwarf::getDwarfOffsetByteSize(Hdr.Format);
-  uint64_t Offset = CUsBase + SectionOffsetSize * CU;
+  uint64_t Offset = Offsets.CUsBase + SectionOffsetSize * CU;
   return Section.AccelSection.getRelocatedValue(SectionOffsetSize, &Offset);
 }
 
 uint64_t DWARFDebugNames::NameIndex::getLocalTUOffset(uint32_t TU) const {
   assert(TU < Hdr.LocalTypeUnitCount);
   const unsigned SectionOffsetSize = dwarf::getDwarfOffsetByteSize(Hdr.Format);
-  uint64_t Offset = CUsBase + SectionOffsetSize * (Hdr.CompUnitCount + TU);
+  uint64_t Offset =
+      Offsets.CUsBase + SectionOffsetSize * (Hdr.CompUnitCount + TU);
   return Section.AccelSection.getRelocatedValue(SectionOffsetSize, &Offset);
 }
 
@@ -723,7 +742,7 @@ uint64_t DWARFDebugNames::NameIndex::getForeignTUSignature(uint32_t TU) const {
   assert(TU < Hdr.ForeignTypeUnitCount);
   const unsigned SectionOffsetSize = dwarf::getDwarfOffsetByteSize(Hdr.Format);
   uint64_t Offset =
-      CUsBase +
+      Offsets.CUsBase +
       SectionOffsetSize * (Hdr.CompUnitCount + Hdr.LocalTypeUnitCount) + 8 * TU;
   return Section.AccelSection.getU64(&Offset);
 }
@@ -759,28 +778,28 @@ DWARFDebugNames::NameIndex::getNameTableEntry(uint32_t Index) const {
   assert(0 < Index && Index <= Hdr.NameCount);
   const unsigned SectionOffsetSize = dwarf::getDwarfOffsetByteSize(Hdr.Format);
   uint64_t StringOffsetOffset =
-      StringOffsetsBase + SectionOffsetSize * (Index - 1);
+      Offsets.StringOffsetsBase + SectionOffsetSize * (Index - 1);
   uint64_t EntryOffsetOffset =
-      EntryOffsetsBase + SectionOffsetSize * (Index - 1);
+      Offsets.EntryOffsetsBase + SectionOffsetSize * (Index - 1);
   const DWARFDataExtractor &AS = Section.AccelSection;
 
   uint64_t StringOffset =
       AS.getRelocatedValue(SectionOffsetSize, &StringOffsetOffset);
   uint64_t EntryOffset = AS.getUnsigned(&EntryOffsetOffset, SectionOffsetSize);
-  EntryOffset += EntriesBase;
+  EntryOffset += Offsets.EntriesBase;
   return {Section.StringSection, Index, StringOffset, EntryOffset};
 }
 
 uint32_t
 DWARFDebugNames::NameIndex::getBucketArrayEntry(uint32_t Bucket) const {
   assert(Bucket < Hdr.BucketCount);
-  uint64_t BucketOffset = BucketsBase + 4 * Bucket;
+  uint64_t BucketOffset = Offsets.BucketsBase + 4 * Bucket;
   return Section.AccelSection.getU32(&BucketOffset);
 }
 
 uint32_t DWARFDebugNames::NameIndex::getHashArrayEntry(uint32_t Index) const {
   assert(0 < Index && Index <= Hdr.NameCount);
-  uint64_t HashOffset = HashesBase + 4 * (Index - 1);
+  uint64_t HashOffset = Offsets.HashesBase + 4 * (Index - 1);
   return Section.AccelSection.getU32(&HashOffset);
 }
 
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp
index 28f0564..572628f 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp
@@ -389,9 +389,25 @@ Error DWARFDebugLine::Prologue::parse(
 
   if (getVersion() >= 5) {
     FormParams.AddrSize = DebugLineData.getU8(Cursor);
-    assert((!Cursor || DebugLineData.getAddressSize() == 0 ||
-            DebugLineData.getAddressSize() == getAddressSize()) &&
-           "Line table header and data extractor disagree");
+    const uint8_t DataAddrSize = DebugLineData.getAddressSize();
+    const uint8_t PrologueAddrSize = getAddressSize();
+    if (Cursor) {
+      if (DataAddrSize == 0) {
+        if (PrologueAddrSize != 4 && PrologueAddrSize != 8) {
+          RecoverableErrorHandler(createStringError(
+              errc::not_supported,
+              "parsing line table prologue at offset 0x%8.8" PRIx64
+              ": invalid address size %" PRIu8,
+              PrologueOffset, PrologueAddrSize));
+        }
+      } else if (DataAddrSize != PrologueAddrSize) {
+        RecoverableErrorHandler(createStringError(
+            errc::not_supported,
+            "parsing line table prologue at offset 0x%8.8" PRIx64 ": address "
+            "size %" PRIu8 " doesn't match architecture address size %" PRIu8,
+            PrologueOffset, PrologueAddrSize, DataAddrSize));
+      }
+    }
     SegSelectorSize = DebugLineData.getU8(Cursor);
   }
 
diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp
index 251485a..fba404c 100644
--- a/llvm/lib/IR/AsmWriter.cpp
+++ b/llvm/lib/IR/AsmWriter.cpp
@@ -292,8 +292,8 @@ static const Module *getModuleFromDPI(const DPMarker *Marker) {
   return M ? M->getParent() : nullptr;
 }
 
-static const Module *getModuleFromDPI(const DPValue *DPV) {
-  return DPV->getMarker() ? getModuleFromDPI(DPV->getMarker()) : nullptr;
+static const Module *getModuleFromDPI(const DbgRecord *DR) {
+  return DR->getMarker() ? getModuleFromDPI(DR->getMarker()) : nullptr;
 }
 
 static void PrintCallingConv(unsigned cc, raw_ostream &Out) {
@@ -1141,12 +1141,14 @@ void SlotTracker::processFunctionMetadata(const Function &F) {
 void SlotTracker::processDbgRecordMetadata(const DbgRecord &DR) {
   if (const DPValue *DPV = dyn_cast<const DPValue>(&DR)) {
     CreateMetadataSlot(DPV->getVariable());
-    CreateMetadataSlot(DPV->getDebugLoc());
     if (DPV->isDbgAssign())
       CreateMetadataSlot(DPV->getAssignID());
+  } else if (const DPLabel *DPL = dyn_cast<const DPLabel>(&DR)) {
+    CreateMetadataSlot(DPL->getLabel());
   } else {
     llvm_unreachable("unsupported DbgRecord kind");
   }
+  CreateMetadataSlot(DR.getDebugLoc());
 }
 
 void SlotTracker::processInstructionMetadata(const Instruction &I) {
@@ -1505,16 +1507,39 @@ static void WriteAPFloatInternal(raw_ostream &Out, const APFloat &APF) {
 static void WriteConstantInternal(raw_ostream &Out, const Constant *CV,
                                   AsmWriterContext &WriterCtx) {
   if (const ConstantInt *CI = dyn_cast<ConstantInt>(CV)) {
-    if (CI->getType()->isIntegerTy(1)) {
-      Out << (CI->getZExtValue() ? "true" : "false");
-      return;
+    Type *Ty = CI->getType();
+
+    if (Ty->isVectorTy()) {
+      Out << "splat (";
+      WriterCtx.TypePrinter->print(Ty->getScalarType(), Out);
+      Out << " ";
     }
-    Out << CI->getValue();
+
+    if (Ty->getScalarType()->isIntegerTy(1))
+      Out << (CI->getZExtValue() ? "true" : "false");
+    else
+      Out << CI->getValue();
+
+    if (Ty->isVectorTy())
+      Out << ")";
+
     return;
   }
 
   if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CV)) {
+    Type *Ty = CFP->getType();
+
+    if (Ty->isVectorTy()) {
+      Out << "splat (";
+      WriterCtx.TypePrinter->print(Ty->getScalarType(), Out);
+      Out << " ";
+    }
+
     WriteAPFloatInternal(Out, CFP->getValueAPF());
+
+    if (Ty->isVectorTy())
+      Out << ")";
+
     return;
   }
 
@@ -2676,6 +2701,7 @@ public:
   void printInstruction(const Instruction &I);
   void printDPMarker(const DPMarker &DPI);
   void printDPValue(const DPValue &DPI);
+  void printDPLabel(const DPLabel &DPL);
   void printDbgRecord(const DbgRecord &DPI);
 
   void printUseListOrder(const Value *V, const std::vector<unsigned> &Shuffle);
@@ -4579,8 +4605,10 @@ void AssemblyWriter::printDPMarker(const DPMarker &Marker) {
 void AssemblyWriter::printDbgRecord(const DbgRecord &DR) {
   if (auto *DPV = dyn_cast<DPValue>(&DR))
     printDPValue(*DPV);
+  else if (auto *DPL = dyn_cast<DPLabel>(&DR))
+    printDPLabel(*DPL);
   else
-    llvm_unreachable("unsupported dbg record");
+    llvm_unreachable("Unexpected DbgRecord kind");
 }
 
 void AssemblyWriter::printDPValue(const DPValue &Value) {
@@ -4622,6 +4650,16 @@ void AssemblyWriter::printDPValue(const DPValue &Value) {
   Out << " }";
 }
 
+void AssemblyWriter::printDPLabel(const DPLabel &Label) {
+  // There's no formal representation of a DPLabel -- print purely as
+  // a debugging aid.
+  Out << "  DPLabel { ";
+  auto WriterCtx = getContext();
+  WriteAsOperandInternal(Out, Label.getLabel(), WriterCtx, true);
+  Out << " marker @" << Label.getMarker();
+  Out << " }";
+}
+
 void AssemblyWriter::printMetadataAttachments(
     const SmallVectorImpl<std::pair<unsigned, MDNode *>> &MDs,
     StringRef Separator) {
@@ -4885,6 +4923,12 @@ void DPMarker::print(raw_ostream &ROS, ModuleSlotTracker &MST,
   W.printDPMarker(*this);
 }
 
+void DPLabel::print(raw_ostream &ROS, bool IsForDebug) const {
+
+  ModuleSlotTracker MST(getModuleFromDPI(this), true);
+  print(ROS, MST, IsForDebug);
+}
+
 void DPValue::print(raw_ostream &ROS, ModuleSlotTracker &MST,
                     bool IsForDebug) const {
   // There's no formal representation of a DPValue -- print purely as a
@@ -4904,6 +4948,24 @@ void DPValue::print(raw_ostream &ROS, ModuleSlotTracker &MST,
   W.printDPValue(*this);
 }
 
+void DPLabel::print(raw_ostream &ROS, ModuleSlotTracker &MST,
+                    bool IsForDebug) const {
+  // There's no formal representation of a DbgLabelRecord -- print purely as
+  // a debugging aid.
+  formatted_raw_ostream OS(ROS);
+  SlotTracker EmptySlotTable(static_cast<const Module *>(nullptr));
+  SlotTracker &SlotTable =
+      MST.getMachine() ? *MST.getMachine() : EmptySlotTable;
+  auto incorporateFunction = [&](const Function *F) {
+    if (F)
+      MST.incorporateFunction(*F);
+  };
+  incorporateFunction(Marker->getParent() ? Marker->getParent()->getParent()
+                                          : nullptr);
+  AssemblyWriter W(OS, SlotTable, getModuleFromDPI(this), nullptr, IsForDebug);
+  W.printDPLabel(*this);
+}
+
 void Value::print(raw_ostream &ROS, bool IsForDebug) const {
   bool ShouldInitializeAllMetadata = false;
   if (auto *I = dyn_cast<Instruction>(this))
diff --git a/llvm/lib/IR/Attributes.cpp b/llvm/lib/IR/Attributes.cpp
index fd51602..1907677 100644
--- a/llvm/lib/IR/Attributes.cpp
+++ b/llvm/lib/IR/Attributes.cpp
@@ -2045,6 +2045,11 @@ static bool isEqual(const Function &Caller, const Function &Callee) {
          Callee.getFnAttribute(AttrClass::getKind());
 }
 
+static bool isEqual(const Function &Caller, const Function &Callee,
+                    const StringRef &AttrName) {
+  return Caller.getFnAttribute(AttrName) == Callee.getFnAttribute(AttrName);
+}
+
 /// Compute the logical AND of the attributes of the caller and the
 /// callee.
 ///
diff --git a/llvm/lib/IR/BasicBlock.cpp b/llvm/lib/IR/BasicBlock.cpp
index 06807544..6ea876f 100644
--- a/llvm/lib/IR/BasicBlock.cpp
+++ b/llvm/lib/IR/BasicBlock.cpp
@@ -81,6 +81,12 @@ void BasicBlock::convertToNewDbgValues() {
       continue;
     }
 
+    if (DbgLabelInst *DLI = dyn_cast<DbgLabelInst>(&I)) {
+      DPVals.push_back(new DPLabel(DLI->getLabel(), DLI->getDebugLoc()));
+      DLI->eraseFromParent();
+      continue;
+    }
+
     if (DPVals.empty())
       continue;
 
@@ -107,16 +113,12 @@ void BasicBlock::convertFromNewDbgValues() {
       continue;
 
     DPMarker &Marker = *Inst.DbgMarker;
-    for (DbgRecord &DR : Marker.getDbgValueRange()) {
-      if (auto *DPV = dyn_cast<DPValue>(&DR))
-        InstList.insert(Inst.getIterator(),
-                        DPV->createDebugIntrinsic(getModule(), nullptr));
-      else
-        llvm_unreachable("unsupported DbgRecord kind");
-    }
+    for (DbgRecord &DR : Marker.getDbgValueRange())
+      InstList.insert(Inst.getIterator(),
+                      DR.createDebugIntrinsic(getModule(), nullptr));
 
     Marker.eraseFromParent();
-  };
+  }
 
   // Assume no trailing DPValues: we could technically create them at the end
   // of the block, after a terminator, but this would be non-cannonical and
diff --git a/llvm/lib/IR/Constants.cpp b/llvm/lib/IR/Constants.cpp
index a38b912..e6b92aa 100644
--- a/llvm/lib/IR/Constants.cpp
+++ b/llvm/lib/IR/Constants.cpp
@@ -35,6 +35,20 @@
 using namespace llvm;
 using namespace PatternMatch;
 
+// As set of temporary options to help migrate how splats are represented.
+static cl::opt<bool> UseConstantIntForFixedLengthSplat(
+    "use-constant-int-for-fixed-length-splat", cl::init(false), cl::Hidden,
+    cl::desc("Use ConstantInt's native fixed-length vector splat support."));
+static cl::opt<bool> UseConstantFPForFixedLengthSplat(
+    "use-constant-fp-for-fixed-length-splat", cl::init(false), cl::Hidden,
+    cl::desc("Use ConstantFP's native fixed-length vector splat support."));
+static cl::opt<bool> UseConstantIntForScalableSplat(
+    "use-constant-int-for-scalable-splat", cl::init(false), cl::Hidden,
+    cl::desc("Use ConstantInt's native scalable vector splat support."));
+static cl::opt<bool> UseConstantFPForScalableSplat(
+    "use-constant-fp-for-scalable-splat", cl::init(false), cl::Hidden,
+    cl::desc("Use ConstantFP's native scalable vector splat support."));
+
 //===----------------------------------------------------------------------===//
 //                              Constant Class
 //===----------------------------------------------------------------------===//
@@ -825,9 +839,11 @@ bool Constant::isManifestConstant() const {
 //                                ConstantInt
 //===----------------------------------------------------------------------===//
 
-ConstantInt::ConstantInt(IntegerType *Ty, const APInt &V)
+ConstantInt::ConstantInt(Type *Ty, const APInt &V)
     : ConstantData(Ty, ConstantIntVal), Val(V) {
-  assert(V.getBitWidth() == Ty->getBitWidth() && "Invalid constant for type");
+  assert(V.getBitWidth() ==
+             cast<IntegerType>(Ty->getScalarType())->getBitWidth() &&
+         "Invalid constant for type");
 }
 
 ConstantInt *ConstantInt::getTrue(LLVMContext &Context) {
@@ -885,6 +901,26 @@ ConstantInt *ConstantInt::get(LLVMContext &Context, const APInt &V) {
   return Slot.get();
 }
 
+// Get a ConstantInt vector with each lane set to the same APInt.
+ConstantInt *ConstantInt::get(LLVMContext &Context, ElementCount EC,
+                              const APInt &V) {
+  // Get an existing value or the insertion position.
+  std::unique_ptr<ConstantInt> &Slot =
+      Context.pImpl->IntSplatConstants[std::make_pair(EC, V)];
+  if (!Slot) {
+    IntegerType *ITy = IntegerType::get(Context, V.getBitWidth());
+    VectorType *VTy = VectorType::get(ITy, EC);
+    Slot.reset(new ConstantInt(VTy, V));
+  }
+
+#ifndef NDEBUG
+  IntegerType *ITy = IntegerType::get(Context, V.getBitWidth());
+  VectorType *VTy = VectorType::get(ITy, EC);
+  assert(Slot->getType() == VTy);
+#endif
+  return Slot.get();
+}
+
 Constant *ConstantInt::get(Type *Ty, uint64_t V, bool isSigned) {
   Constant *C = get(cast<IntegerType>(Ty->getScalarType()), V, isSigned);
 
@@ -1024,6 +1060,26 @@ ConstantFP* ConstantFP::get(LLVMContext &Context, const APFloat& V) {
   return Slot.get();
 }
 
+// Get a ConstantFP vector with each lane set to the same APFloat.
+ConstantFP *ConstantFP::get(LLVMContext &Context, ElementCount EC,
+                            const APFloat &V) {
+  // Get an existing value or the insertion position.
+  std::unique_ptr<ConstantFP> &Slot =
+      Context.pImpl->FPSplatConstants[std::make_pair(EC, V)];
+  if (!Slot) {
+    Type *EltTy = Type::getFloatingPointTy(Context, V.getSemantics());
+    VectorType *VTy = VectorType::get(EltTy, EC);
+    Slot.reset(new ConstantFP(VTy, V));
+  }
+
+#ifndef NDEBUG
+  Type *EltTy = Type::getFloatingPointTy(Context, V.getSemantics());
+  VectorType *VTy = VectorType::get(EltTy, EC);
+  assert(Slot->getType() == VTy);
+#endif
+  return Slot.get();
+}
+
 Constant *ConstantFP::getInfinity(Type *Ty, bool Negative) {
   const fltSemantics &Semantics = Ty->getScalarType()->getFltSemantics();
   Constant *C = get(Ty->getContext(), APFloat::getInf(Semantics, Negative));
@@ -1036,7 +1092,7 @@ Constant *ConstantFP::getInfinity(Type *Ty, bool Negative) {
 
 ConstantFP::ConstantFP(Type *Ty, const APFloat &V)
     : ConstantData(Ty, ConstantFPVal), Val(V) {
-  assert(&V.getSemantics() == &Ty->getFltSemantics() &&
+  assert(&V.getSemantics() == &Ty->getScalarType()->getFltSemantics() &&
          "FP type Mismatch");
 }
 
@@ -1356,11 +1412,13 @@ Constant *ConstantVector::getImpl(ArrayRef<Constant*> V) {
   bool isZero = C->isNullValue();
   bool isUndef = isa<UndefValue>(C);
   bool isPoison = isa<PoisonValue>(C);
+  bool isSplatFP = UseConstantFPForFixedLengthSplat && isa<ConstantFP>(C);
+  bool isSplatInt = UseConstantIntForFixedLengthSplat && isa<ConstantInt>(C);
 
-  if (isZero || isUndef) {
+  if (isZero || isUndef || isSplatFP || isSplatInt) {
     for (unsigned i = 1, e = V.size(); i != e; ++i)
       if (V[i] != C) {
-        isZero = isUndef = isPoison = false;
+        isZero = isUndef = isPoison = isSplatFP = isSplatInt = false;
         break;
       }
   }
@@ -1371,6 +1429,12 @@ Constant *ConstantVector::getImpl(ArrayRef<Constant*> V) {
     return PoisonValue::get(T);
   if (isUndef)
     return UndefValue::get(T);
+  if (isSplatFP)
+    return ConstantFP::get(C->getContext(), T->getElementCount(),
+                           cast<ConstantFP>(C)->getValue());
+  if (isSplatInt)
+    return ConstantInt::get(C->getContext(), T->getElementCount(),
+                            cast<ConstantInt>(C)->getValue());
 
   // Check to see if all of the elements are ConstantFP or ConstantInt and if
   // the element type is compatible with ConstantDataVector.  If so, use it.
@@ -1384,6 +1448,16 @@ Constant *ConstantVector::getImpl(ArrayRef<Constant*> V) {
 
 Constant *ConstantVector::getSplat(ElementCount EC, Constant *V) {
   if (!EC.isScalable()) {
+    // Maintain special handling of zero.
+    if (!V->isNullValue()) {
+      if (UseConstantIntForFixedLengthSplat && isa<ConstantInt>(V))
+        return ConstantInt::get(V->getContext(), EC,
+                                cast<ConstantInt>(V)->getValue());
+      if (UseConstantFPForFixedLengthSplat && isa<ConstantFP>(V))
+        return ConstantFP::get(V->getContext(), EC,
+                               cast<ConstantFP>(V)->getValue());
+    }
+
     // If this splat is compatible with ConstantDataVector, use it instead of
     // ConstantVector.
     if ((isa<ConstantFP>(V) || isa<ConstantInt>(V)) &&
@@ -1394,6 +1468,16 @@ Constant *ConstantVector::getSplat(ElementCount EC, Constant *V) {
     return get(Elts);
   }
 
+  // Maintain special handling of zero.
+  if (!V->isNullValue()) {
+    if (UseConstantIntForScalableSplat && isa<ConstantInt>(V))
+      return ConstantInt::get(V->getContext(), EC,
+                              cast<ConstantInt>(V)->getValue());
+    if (UseConstantFPForScalableSplat && isa<ConstantFP>(V))
+      return ConstantFP::get(V->getContext(), EC,
+                             cast<ConstantFP>(V)->getValue());
+  }
+
   Type *VTy = VectorType::get(V->getType(), EC);
 
   if (V->isNullValue())
diff --git a/llvm/lib/IR/DebugProgramInstruction.cpp b/llvm/lib/IR/DebugProgramInstruction.cpp
index eb18be5..389bac4 100644
--- a/llvm/lib/IR/DebugProgramInstruction.cpp
+++ b/llvm/lib/IR/DebugProgramInstruction.cpp
@@ -64,6 +64,9 @@ void DbgRecord::deleteRecord() {
   case ValueKind:
     delete cast<DPValue>(this);
     return;
+  case LabelKind:
+    delete cast<DPLabel>(this);
+    return;
   }
   llvm_unreachable("unsupported DbgRecord kind");
 }
@@ -73,6 +76,9 @@ void DbgRecord::print(raw_ostream &O, bool IsForDebug) const {
   case ValueKind:
     cast<DPValue>(this)->print(O, IsForDebug);
     return;
+  case LabelKind:
+    cast<DPLabel>(this)->print(O, IsForDebug);
+    return;
   };
   llvm_unreachable("unsupported DbgRecord kind");
 }
@@ -83,6 +89,9 @@ void DbgRecord::print(raw_ostream &O, ModuleSlotTracker &MST,
   case ValueKind:
     cast<DPValue>(this)->print(O, MST, IsForDebug);
     return;
+  case LabelKind:
+    cast<DPLabel>(this)->print(O, MST, IsForDebug);
+    return;
   };
   llvm_unreachable("unsupported DbgRecord kind");
 }
@@ -93,16 +102,23 @@ bool DbgRecord::isIdenticalToWhenDefined(const DbgRecord &R) const {
   switch (RecordKind) {
   case ValueKind:
     return cast<DPValue>(this)->isIdenticalToWhenDefined(*cast<DPValue>(&R));
+  case LabelKind:
+    return cast<DPLabel>(this)->getLabel() == cast<DPLabel>(R).getLabel();
   };
   llvm_unreachable("unsupported DbgRecord kind");
 }
 
 bool DbgRecord::isEquivalentTo(const DbgRecord &R) const {
-  if (RecordKind != R.RecordKind)
-    return false;
+  return getDebugLoc() == R.getDebugLoc() && isIdenticalToWhenDefined(R);
+}
+
+DbgInfoIntrinsic *
+DbgRecord::createDebugIntrinsic(Module *M, Instruction *InsertBefore) const {
   switch (RecordKind) {
   case ValueKind:
-    return cast<DPValue>(this)->isEquivalentTo(*cast<DPValue>(&R));
+    return cast<DPValue>(this)->createDebugIntrinsic(M, InsertBefore);
+  case LabelKind:
+    return cast<DPLabel>(this)->createDebugIntrinsic(M, InsertBefore);
   };
   llvm_unreachable("unsupported DbgRecord kind");
 }
@@ -307,12 +323,16 @@ DbgRecord *DbgRecord::clone() const {
   switch (RecordKind) {
   case ValueKind:
     return cast<DPValue>(this)->clone();
+  case LabelKind:
+    return cast<DPLabel>(this)->clone();
   };
   llvm_unreachable("unsupported DbgRecord kind");
 }
 
 DPValue *DPValue::clone() const { return new DPValue(*this); }
 
+DPLabel *DPLabel::clone() const { return new DPLabel(Label, getDebugLoc()); }
+
 DbgVariableIntrinsic *
 DPValue::createDebugIntrinsic(Module *M, Instruction *InsertBefore) const {
   [[maybe_unused]] DICompileUnit *Unit =
@@ -368,6 +388,20 @@ DPValue::createDebugIntrinsic(Module *M, Instruction *InsertBefore) const {
   return DVI;
 }
 
+DbgLabelInst *DPLabel::createDebugIntrinsic(Module *M,
+                                            Instruction *InsertBefore) const {
+  auto *LabelFn = Intrinsic::getDeclaration(M, Intrinsic::dbg_label);
+  Value *Args[] = {
+      MetadataAsValue::get(getDebugLoc()->getContext(), getLabel())};
+  DbgLabelInst *DbgLabel = cast<DbgLabelInst>(
+      CallInst::Create(LabelFn->getFunctionType(), LabelFn, Args));
+  DbgLabel->setTailCall();
+  DbgLabel->setDebugLoc(getDebugLoc());
+  if (InsertBefore)
+    DbgLabel->insertBefore(InsertBefore);
+  return DbgLabel;
+}
+
 Value *DPValue::getAddress() const {
   auto *MD = getRawAddress();
   if (auto *V = dyn_cast<ValueAsMetadata>(MD))
diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp
index ce0df53..fc5c9b2 100644
--- a/llvm/lib/IR/Instructions.cpp
+++ b/llvm/lib/IR/Instructions.cpp
@@ -3525,6 +3525,7 @@ CastInst *CastInst::CreateFPCast(Value *C, Type *Ty,
          "Invalid cast");
   unsigned SrcBits = C->getType()->getScalarSizeInBits();
   unsigned DstBits = Ty->getScalarSizeInBits();
+  assert((C->getType() == Ty || SrcBits != DstBits) && "Invalid cast");
   Instruction::CastOps opcode =
     (SrcBits == DstBits ? Instruction::BitCast :
      (SrcBits > DstBits ? Instruction::FPTrunc : Instruction::FPExt));
diff --git a/llvm/lib/IR/LLVMContextImpl.cpp b/llvm/lib/IR/LLVMContextImpl.cpp
index 15c90a4..a0bf9ca 100644
--- a/llvm/lib/IR/LLVMContextImpl.cpp
+++ b/llvm/lib/IR/LLVMContextImpl.cpp
@@ -119,7 +119,9 @@ LLVMContextImpl::~LLVMContextImpl() {
   IntZeroConstants.clear();
   IntOneConstants.clear();
   IntConstants.clear();
+  IntSplatConstants.clear();
   FPConstants.clear();
+  FPSplatConstants.clear();
   CDSConstants.clear();
 
   // Destroy attribute node lists.
diff --git a/llvm/lib/IR/LLVMContextImpl.h b/llvm/lib/IR/LLVMContextImpl.h
index 6a20291..2ee1080 100644
--- a/llvm/lib/IR/LLVMContextImpl.h
+++ b/llvm/lib/IR/LLVMContextImpl.h
@@ -1488,8 +1488,12 @@ public:
   DenseMap<unsigned, std::unique_ptr<ConstantInt>> IntZeroConstants;
   DenseMap<unsigned, std::unique_ptr<ConstantInt>> IntOneConstants;
   DenseMap<APInt, std::unique_ptr<ConstantInt>> IntConstants;
+  DenseMap<std::pair<ElementCount, APInt>, std::unique_ptr<ConstantInt>>
+      IntSplatConstants;
 
   DenseMap<APFloat, std::unique_ptr<ConstantFP>> FPConstants;
+  DenseMap<std::pair<ElementCount, APFloat>, std::unique_ptr<ConstantFP>>
+      FPSplatConstants;
 
   FoldingSet<AttributeImpl> AttrsSet;
   FoldingSet<AttributeListImpl> AttrsLists;
diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp
index 7b3a759..6cfe677 100644
--- a/llvm/lib/LTO/LTOBackend.cpp
+++ b/llvm/lib/LTO/LTOBackend.cpp
@@ -330,8 +330,6 @@ static void runNewPMPasses(const Config &Conf, Module &Mod, TargetMachine *TM,
       report_fatal_error(Twine("unable to parse pass pipeline description '") +
                          Conf.OptPipeline + "': " + toString(std::move(Err)));
     }
-  } else if (Conf.UseDefaultPipeline) {
-    MPM.addPass(PB.buildPerModuleDefaultPipeline(OL));
   } else if (IsThinLTO) {
     MPM.addPass(PB.buildThinLTODefaultPipeline(OL, ImportSummary));
   } else {
diff --git a/llvm/lib/Object/SymbolSize.cpp b/llvm/lib/Object/SymbolSize.cpp
index cb20fef..635cd83 100644
--- a/llvm/lib/Object/SymbolSize.cpp
+++ b/llvm/lib/Object/SymbolSize.cpp
@@ -65,6 +65,13 @@ llvm::object::computeSymbolSizes(const ObjectFile &O) {
     return Ret;
   }
 
+  if (const auto *E = dyn_cast<WasmObjectFile>(&O)) {
+    for (SymbolRef Sym : E->symbols()) {
+      Ret.push_back({Sym, E->getSymbolSize(Sym)});
+    }
+    return Ret;
+  }
+
   // Collect sorted symbol addresses. Include dummy addresses for the end
   // of each section.
   std::vector<SymEntry> Addresses;
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index f26d95a..fed7a14 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -91,6 +91,7 @@
 #include "llvm/CodeGen/JMCInstrumenter.h"
 #include "llvm/CodeGen/LowerEmuTLS.h"
 #include "llvm/CodeGen/MIRPrinter.h"
+#include "llvm/CodeGen/MachinePassManager.h"
 #include "llvm/CodeGen/SafeStack.h"
 #include "llvm/CodeGen/SelectOptimize.h"
 #include "llvm/CodeGen/ShadowStackGCLowering.h"
@@ -1260,6 +1261,28 @@ static bool isFunctionPassName(StringRef Name, CallbacksT &Callbacks) {
 }
 
 template <typename CallbacksT>
+static bool isMachineFunctionPassName(StringRef Name, CallbacksT &Callbacks) {
+  // Explicitly handle pass manager names.
+  if (Name == "machine-function")
+    return true;
+
+  // Explicitly handle custom-parsed pass names.
+  if (parseRepeatPassName(Name))
+    return true;
+
+#define MACHINE_FUNCTION_PASS(NAME, CREATE_PASS)                               \
+  if (Name == NAME)                                                            \
+    return true;
+#define MACHINE_FUNCTION_ANALYSIS(NAME, CREATE_PASS)                           \
+  if (Name == "require<" NAME ">" || Name == "invalidate<" NAME ">")           \
+    return true;
+
+#include "llvm/Passes/MachinePassRegistry.def"
+
+  return callbacksAcceptPassName<MachineFunctionPassManager>(Name, Callbacks);
+}
+
+template <typename CallbacksT>
 static bool isLoopNestPassName(StringRef Name, CallbacksT &Callbacks,
                                bool &UseMemorySSA) {
   UseMemorySSA = false;
@@ -1394,6 +1417,13 @@ Error PassBuilder::parseModulePass(ModulePassManager &MPM,
       MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(std::move(CGPM)));
       return Error::success();
     }
+    if (Name == "machine-function") {
+      MachineFunctionPassManager MFPM;
+      if (auto Err = parseMachinePassPipeline(MFPM, InnerPipeline))
+        return Err;
+      MPM.addPass(createModuleToMachineFunctionPassAdaptor(std::move(MFPM)));
+      return Error::success();
+    }
     if (auto Params = parseFunctionPipelineName(Name)) {
       if (Params->second)
         return make_error<StringError>(
@@ -1874,8 +1904,8 @@ Error PassBuilder::parseMachinePass(MachineFunctionPassManager &MFPM,
   }
 #include "llvm/Passes/MachinePassRegistry.def"
 
-  for (auto &C : MachinePipelineParsingCallbacks)
-    if (C(Name, MFPM))
+  for (auto &C : MachineFunctionPipelineParsingCallbacks)
+    if (C(Name, MFPM, E.InnerPipeline))
       return Error::success();
   return make_error<StringError>(
       formatv("unknown machine pass '{0}'", Name).str(),
@@ -1942,7 +1972,8 @@ Error PassBuilder::parseCGSCCPassPipeline(CGSCCPassManager &CGPM,
 void PassBuilder::crossRegisterProxies(LoopAnalysisManager &LAM,
                                        FunctionAnalysisManager &FAM,
                                        CGSCCAnalysisManager &CGAM,
-                                       ModuleAnalysisManager &MAM) {
+                                       ModuleAnalysisManager &MAM,
+                                       MachineFunctionAnalysisManager *MFAM) {
   MAM.registerPass([&] { return FunctionAnalysisManagerModuleProxy(FAM); });
   MAM.registerPass([&] { return CGSCCAnalysisManagerModuleProxy(CGAM); });
   CGAM.registerPass([&] { return ModuleAnalysisManagerCGSCCProxy(MAM); });
@@ -1950,6 +1981,14 @@ void PassBuilder::crossRegisterProxies(LoopAnalysisManager &LAM,
   FAM.registerPass([&] { return ModuleAnalysisManagerFunctionProxy(MAM); });
   FAM.registerPass([&] { return LoopAnalysisManagerFunctionProxy(LAM); });
   LAM.registerPass([&] { return FunctionAnalysisManagerLoopProxy(FAM); });
+  if (MFAM) {
+    MAM.registerPass(
+        [&] { return MachineFunctionAnalysisManagerModuleProxy(*MFAM); });
+    MFAM->registerPass(
+        [&] { return ModuleAnalysisManagerMachineFunctionProxy(MAM); });
+    MFAM->registerPass(
+        [&] { return FunctionAnalysisManagerMachineFunctionProxy(FAM); });
+  }
 }
 
 Error PassBuilder::parseModulePassPipeline(ModulePassManager &MPM,
@@ -1991,6 +2030,9 @@ Error PassBuilder::parsePassPipeline(ModulePassManager &MPM,
                               UseMemorySSA)) {
       Pipeline = {{"function", {{UseMemorySSA ? "loop-mssa" : "loop",
                                  std::move(*Pipeline)}}}};
+    } else if (isMachineFunctionPassName(
+                   FirstName, MachineFunctionPipelineParsingCallbacks)) {
+      Pipeline = {{"machine-function", std::move(*Pipeline)}};
     } else {
       for (auto &C : TopLevelPipelineParsingCallbacks)
         if (C(MPM, *Pipeline))
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 142bd50..17b55b6 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -247,7 +247,7 @@ static cl::opt<bool>
 
 static cl::opt<bool> EnableJumpTableToSwitch(
     "enable-jump-table-to-switch",
-    cl::desc("Enable JumpTableToSwitch pass (default = off)"));
+    cl::desc("Enable JumpTableToSwitch pass (default = on)"), cl::init(true));
 
 // This option is used in simplifying testing SampleFDO optimizations for
 // profile loading.
diff --git a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
index c62582a..a99856d 100644
--- a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
@@ -712,7 +712,7 @@ bool AArch64Arm64ECCallLowering::processFunction(
   // name (emitting the definition) can grab it from the metadata.
   //
   // FIXME: Handle functions with weak linkage?
-  if (F.hasExternalLinkage() || F.hasWeakLinkage() || F.hasLinkOnceLinkage()) {
+  if (!F.hasLocalLinkage() || F.hasAddressTaken()) {
     if (std::optional<std::string> MangledName =
             getArm64ECMangledFunctionName(F.getName().str())) {
       F.setMetadata("arm64ec_unmangled_name",
diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
index 5b5ffd7..4fa719a 100644
--- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -1121,7 +1121,8 @@ void AArch64AsmPrinter::emitFunctionEntryLabel() {
     TS->emitDirectiveVariantPCS(CurrentFnSym);
   }
 
-  if (TM.getTargetTriple().isWindowsArm64EC()) {
+  if (TM.getTargetTriple().isWindowsArm64EC() &&
+      !MF->getFunction().hasLocalLinkage()) {
     // For ARM64EC targets, a function definition's name is mangled differently
     // from the normal symbol. We emit the alias from the unmangled symbol to
     // mangled symbol name here.
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 3485edb..5cc612e 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -239,11 +239,6 @@ static cl::opt<bool> EnableRedZone("aarch64-redzone",
                                    cl::desc("enable use of redzone on AArch64"),
                                    cl::init(false), cl::Hidden);
 
-static cl::opt<bool>
-    ReverseCSRRestoreSeq("reverse-csr-restore-seq",
-                         cl::desc("reverse the CSR restore sequence"),
-                         cl::init(false), cl::Hidden);
-
 static cl::opt<bool> StackTaggingMergeSetTag(
     "stack-tagging-merge-settag",
     cl::desc("merge settag instruction in function epilog"), cl::init(true),
@@ -307,8 +302,6 @@ bool AArch64FrameLowering::homogeneousPrologEpilog(
     return false;
   if (!EnableHomogeneousPrologEpilog)
     return false;
-  if (ReverseCSRRestoreSeq)
-    return false;
   if (EnableRedZone)
     return false;
 
@@ -3117,7 +3110,27 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
 
   computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs, hasFP(MF));
 
-  auto EmitMI = [&](const RegPairInfo &RPI) -> MachineBasicBlock::iterator {
+  if (homogeneousPrologEpilog(MF, &MBB)) {
+    auto MIB = BuildMI(MBB, MBBI, DL, TII.get(AArch64::HOM_Epilog))
+                   .setMIFlag(MachineInstr::FrameDestroy);
+    for (auto &RPI : RegPairs) {
+      MIB.addReg(RPI.Reg1, RegState::Define);
+      MIB.addReg(RPI.Reg2, RegState::Define);
+    }
+    return true;
+  }
+
+  // For performance reasons restore SVE register in increasing order
+  auto IsPPR = [](const RegPairInfo &c) { return c.Type == RegPairInfo::PPR; };
+  auto PPRBegin = std::find_if(RegPairs.begin(), RegPairs.end(), IsPPR);
+  auto PPREnd = std::find_if_not(PPRBegin, RegPairs.end(), IsPPR);
+  std::reverse(PPRBegin, PPREnd);
+  auto IsZPR = [](const RegPairInfo &c) { return c.Type == RegPairInfo::ZPR; };
+  auto ZPRBegin = std::find_if(RegPairs.begin(), RegPairs.end(), IsZPR);
+  auto ZPREnd = std::find_if_not(ZPRBegin, RegPairs.end(), IsZPR);
+  std::reverse(ZPRBegin, ZPREnd);
+
+  for (const RegPairInfo &RPI : RegPairs) {
     unsigned Reg1 = RPI.Reg1;
     unsigned Reg2 = RPI.Reg2;
 
@@ -3191,42 +3204,6 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
         MachineMemOperand::MOLoad, Size, Alignment));
     if (NeedsWinCFI)
       InsertSEH(MIB, TII, MachineInstr::FrameDestroy);
-
-    return MIB->getIterator();
-  };
-
-  // SVE objects are always restored in reverse order.
-  for (const RegPairInfo &RPI : reverse(RegPairs))
-    if (RPI.isScalable())
-      EmitMI(RPI);
-
-  if (homogeneousPrologEpilog(MF, &MBB)) {
-    auto MIB = BuildMI(MBB, MBBI, DL, TII.get(AArch64::HOM_Epilog))
-                   .setMIFlag(MachineInstr::FrameDestroy);
-    for (auto &RPI : RegPairs) {
-      MIB.addReg(RPI.Reg1, RegState::Define);
-      MIB.addReg(RPI.Reg2, RegState::Define);
-    }
-    return true;
-  }
-
-  if (ReverseCSRRestoreSeq) {
-    MachineBasicBlock::iterator First = MBB.end();
-    for (const RegPairInfo &RPI : reverse(RegPairs)) {
-      if (RPI.isScalable())
-        continue;
-      MachineBasicBlock::iterator It = EmitMI(RPI);
-      if (First == MBB.end())
-        First = It;
-    }
-    if (First != MBB.end())
-      MBB.splice(MBBI, &MBB, First);
-  } else {
-    for (const RegPairInfo &RPI : RegPairs) {
-      if (RPI.isScalable())
-        continue;
-      (void)EmitMI(RPI);
-    }
   }
 
   return true;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 184ebc1..3b92e95 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -541,10 +541,12 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
   setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);
   setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i128, Custom);
-  setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
+  if (Subtarget->hasFPARMv8())
+    setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
   setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
   setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
-  setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
+  if (Subtarget->hasFPARMv8())
+    setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
   setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
   setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom);
 
@@ -947,9 +949,11 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
   setTruncStoreAction(MVT::f128, MVT::f32, Expand);
   setTruncStoreAction(MVT::f128, MVT::f16, Expand);
 
-  setOperationAction(ISD::BITCAST, MVT::i16, Custom);
-  setOperationAction(ISD::BITCAST, MVT::f16, Custom);
-  setOperationAction(ISD::BITCAST, MVT::bf16, Custom);
+  if (Subtarget->hasFPARMv8()) {
+    setOperationAction(ISD::BITCAST, MVT::i16, Custom);
+    setOperationAction(ISD::BITCAST, MVT::f16, Custom);
+    setOperationAction(ISD::BITCAST, MVT::bf16, Custom);
+  }
 
   // Indexed loads and stores are supported.
   for (unsigned im = (unsigned)ISD::PRE_INC;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 436b21f..bec1348 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -1308,6 +1308,8 @@ private:
   bool preferScalarizeSplat(SDNode *N) const override;
 
   unsigned getMinimumJumpTableEntries() const override;
+
+  bool softPromoteHalfType() const override { return true; }
 };
 
 namespace AArch64 {
diff --git a/llvm/lib/Target/AArch64/AArch64SchedTSV110.td b/llvm/lib/Target/AArch64/AArch64SchedTSV110.td
index 0ae9a69..1c577a2 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedTSV110.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedTSV110.td
@@ -419,10 +419,10 @@ def : InstRW<[TSV110Wr_12cyc_1MDU],  (instregex "^(S|U)DIVWr$")>;
 def : InstRW<[TSV110Wr_20cyc_1MDU],  (instregex "^(S|U)DIVXr$")>;
 
 def TSV110ReadMAW : SchedReadAdvance<2, [TSV110Wr_3cyc_1MDU]>;
-def : InstRW<[TSV110Wr_3cyc_1MDU, TSV110ReadMAW], (instrs MADDWrrr, MSUBWrrr)>;
+def : InstRW<[TSV110Wr_3cyc_1MDU, ReadIM, ReadIM, TSV110ReadMAW], (instrs MADDWrrr, MSUBWrrr)>;
 def TSV110ReadMAQ : SchedReadAdvance<3, [TSV110Wr_4cyc_1MDU]>;
-def : InstRW<[TSV110Wr_4cyc_1MDU, TSV110ReadMAQ], (instrs MADDXrrr, MSUBXrrr)>;
-def : InstRW<[TSV110Wr_3cyc_1MDU, TSV110ReadMAW], (instregex "(S|U)(MADDL|MSUBL)rrr")>;
+def : InstRW<[TSV110Wr_4cyc_1MDU, ReadIM, ReadIM, TSV110ReadMAQ], (instrs MADDXrrr, MSUBXrrr)>;
+def : InstRW<[TSV110Wr_3cyc_1MDU, ReadIM, ReadIM, TSV110ReadMAW], (instregex "(S|U)(MADDL|MSUBL)rrr")>;
 def : InstRW<[TSV110Wr_4cyc_1MDU], (instregex "^(S|U)MULHrr$")>;
 
 
diff --git a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
index 961dded..ef7c517 100644
--- a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
+++ b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
@@ -21,7 +21,6 @@
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
 #include "llvm/Analysis/StackSafetyAnalysis.h"
-#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/LiveRegUnits.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFunction.h"
@@ -520,7 +519,6 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) {
   for (auto &I : SInfo.AllocasToInstrument) {
     memtag::AllocaInfo &Info = I.second;
     assert(Info.AI && SIB.isInterestingAlloca(*Info.AI));
-    TrackingVH<Instruction> OldAI = Info.AI;
     memtag::alignAndPadAlloca(Info, kTagGranuleSize);
     AllocaInst *AI = Info.AI;
     int Tag = NextTag;
@@ -534,7 +532,8 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) {
                               ConstantInt::get(IRB.getInt64Ty(), Tag)});
     if (Info.AI->hasName())
       TagPCall->setName(Info.AI->getName() + ".tag");
-    Info.AI->replaceAllUsesWith(TagPCall);
+    // Does not replace metadata, so we don't have to handle DPValues.
+    Info.AI->replaceNonMetadataUsesWith(TagPCall);
     TagPCall->setOperand(0, Info.AI);
 
     // Calls to functions that may return twice (e.g. setjmp) confuse the
@@ -574,12 +573,6 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) {
       for (auto *II : Info.LifetimeEnd)
         II->eraseFromParent();
     }
-
-    // Fixup debug intrinsics to point to the new alloca.
-    for (auto *DVI : Info.DbgVariableIntrinsics)
-      DVI->replaceVariableLocationOp(OldAI, Info.AI);
-    for (auto *DPV : Info.DbgVariableRecords)
-      DPV->replaceVariableLocationOp(OldAI, Info.AI);
   }
 
   // If we have instrumented at least one alloca, all unrecognized lifetime
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 6655931..010e569 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -2972,6 +2972,13 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
 
     return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
                                          Op2Info);
+  case ISD::FREM:
+    // Pass nullptr as fmod/fmodf calls are emitted by the backend even when
+    // those functions are not declared in the module.
+    if (!Ty->isVectorTy())
+      return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind);
+    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
+                                         Op2Info);
   }
 }
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index b9411e2..9218760 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -33,6 +33,12 @@ def rcp_sqrt_to_rsq : GICombineRule<
          [{ return matchRcpSqrtToRsq(*${rcp}, ${matchinfo}); }]),
   (apply [{ Helper.applyBuildFn(*${rcp}, ${matchinfo}); }])>;
 
+def fdiv_by_sqrt_to_rsq_f16 : GICombineRule<
+  (defs root:$root),
+  (match (G_FSQRT f16:$sqrt, $x, (MIFlags FmContract)),
+         (G_FDIV f16:$dst, $y, $sqrt, (MIFlags FmContract)):$root,
+         [{ return matchFDivSqrtToRsqF16(*${root}); }]),
+  (apply [{ applyFDivSqrtToRsqF16(*${root}, ${x}.getReg()); }])>;
 
 def cvt_f32_ubyteN_matchdata : GIDefMatchData<"CvtF32UByteMatchInfo">;
 
@@ -156,7 +162,7 @@ def AMDGPUPostLegalizerCombiner: GICombiner<
   "AMDGPUPostLegalizerCombinerImpl",
   [all_combines, gfx6gfx7_combines, gfx8_combines,
    uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize, foldable_fneg,
-   rcp_sqrt_to_rsq, sign_extension_in_reg, smulu64]> {
+   rcp_sqrt_to_rsq, fdiv_by_sqrt_to_rsq_f16, sign_extension_in_reg, smulu64]> {
   let CombineAllMethodName = "tryCombineAllImpl";
 }
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 0d3b158..13d7510 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -4824,9 +4824,8 @@ bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
   return true;
 }
 
-static const unsigned SPDenormModeBitField =
-    AMDGPU::Hwreg::ID_MODE | (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
-    (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
+static constexpr unsigned SPDenormModeBitField =
+    AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 4, 2);
 
 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
index a1c34e9..82e17dd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
@@ -83,6 +83,9 @@ public:
   matchRcpSqrtToRsq(MachineInstr &MI,
                     std::function<void(MachineIRBuilder &)> &MatchInfo) const;
 
+  bool matchFDivSqrtToRsqF16(MachineInstr &MI) const;
+  void applyFDivSqrtToRsqF16(MachineInstr &MI, const Register &X) const;
+
   // FIXME: Should be able to have 2 separate matchdatas rather than custom
   // struct boilerplate.
   struct CvtF32UByteMatchInfo {
@@ -334,6 +337,26 @@ bool AMDGPUPostLegalizerCombinerImpl::matchRcpSqrtToRsq(
   return false;
 }
 
+bool AMDGPUPostLegalizerCombinerImpl::matchFDivSqrtToRsqF16(
+    MachineInstr &MI) const {
+  Register Sqrt = MI.getOperand(2).getReg();
+  return MRI.hasOneNonDBGUse(Sqrt);
+}
+
+void AMDGPUPostLegalizerCombinerImpl::applyFDivSqrtToRsqF16(
+    MachineInstr &MI, const Register &X) const {
+  Register Dst = MI.getOperand(0).getReg();
+  Register Y = MI.getOperand(1).getReg();
+  LLT DstTy = MRI.getType(Dst);
+  uint32_t Flags = MI.getFlags();
+  Register RSQ = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {DstTy})
+                     .addUse(X)
+                     .setMIFlags(Flags)
+                     .getReg(0);
+  B.buildFMul(Dst, RSQ, Y, Flags);
+  MI.eraseFromParent();
+}
+
 bool AMDGPUPostLegalizerCombinerImpl::matchCvtF32UByteN(
     MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo) const {
   Register SrcReg = MI.getOperand(1).getReg();
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 5b32b34..b7b471d 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -7272,11 +7272,11 @@ ParseStatus AMDGPUAsmParser::parseHwreg(OperandVector &Operands) {
 
   if (trySkipId("hwreg", AsmToken::LParen)) {
     OperandInfoTy HwReg(OPR_ID_UNKNOWN);
-    OperandInfoTy Offset(OFFSET_DEFAULT_);
-    OperandInfoTy Width(WIDTH_DEFAULT_);
+    OperandInfoTy Offset(HwregOffset::Default);
+    OperandInfoTy Width(HwregSize::Default);
     if (parseHwregBody(HwReg, Offset, Width) &&
         validateHwreg(HwReg, Offset, Width)) {
-      ImmVal = encodeHwreg(HwReg.Id, Offset.Id, Width.Id);
+      ImmVal = HwregEncoding::encode(HwReg.Id, Offset.Id, Width.Id);
     } else {
       return ParseStatus::Failure;
     }
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 894607d..e1cca17 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -119,6 +119,12 @@ static DecodeStatus decodeSplitBarrier(MCInst &Inst, unsigned Val,
   return addOperand(Inst, DAsm->decodeSplitBarrier(Val));
 }
 
+static DecodeStatus decodeDpp8FI(MCInst &Inst, unsigned Val, uint64_t Addr,
+                                 const MCDisassembler *Decoder) {
+  auto DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
+  return addOperand(Inst, DAsm->decodeDpp8FI(Val));
+}
+
 #define DECODE_OPERAND(StaticDecoderName, DecoderName)                         \
   static DecodeStatus StaticDecoderName(MCInst &Inst, unsigned Imm,            \
                                         uint64_t /*Addr*/,                     \
@@ -440,19 +446,6 @@ static inline DecoderUInt128 eat12Bytes(ArrayRef<uint8_t> &Bytes) {
   return DecoderUInt128(Lo, Hi);
 }
 
-// The disassembler is greedy, so we need to check FI operand value to
-// not parse a dpp if the correct literal is not set. For dpp16 the
-// autogenerated decoder checks the dpp literal
-static bool isValidDPP8(const MCInst &MI) {
-  using namespace llvm::AMDGPU::DPP;
-  int FiIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::fi);
-  assert(FiIdx != -1);
-  if ((unsigned)FiIdx >= MI.getNumOperands())
-    return false;
-  unsigned Fi = MI.getOperand(FiIdx).getImm();
-  return Fi == DPP8_FI_0 || Fi == DPP8_FI_1;
-}
-
 DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
                                                 ArrayRef<uint8_t> Bytes_,
                                                 uint64_t Address,
@@ -460,7 +453,10 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
   unsigned MaxInstBytesNum = std::min((size_t)TargetMaxInstBytes, Bytes_.size());
   Bytes = Bytes_.slice(0, MaxInstBytesNum);
 
-  DecodeStatus Res = MCDisassembler::Fail;
+  // In case the opcode is not recognized we'll assume a Size of 4 bytes (unless
+  // there are fewer bytes left). This will be overridden on success.
+  Size = std::min((size_t)4, Bytes_.size());
+
   do {
     // ToDo: better to switch encoding length using some bit predicate
     // but it is unknown yet, so try all we can
@@ -469,222 +465,147 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
     // encodings
     if (isGFX11Plus() && Bytes.size() >= 12 ) {
       DecoderUInt128 DecW = eat12Bytes(Bytes);
-      Res =
-          tryDecodeInst(DecoderTableDPP8GFX1196, DecoderTableDPP8GFX11_FAKE1696,
-                        MI, DecW, Address, CS);
-      if (Res && convertDPP8Inst(MI) == MCDisassembler::Success)
-        break;
-      MI = MCInst(); // clear
-      Res =
-          tryDecodeInst(DecoderTableDPP8GFX1296, DecoderTableDPP8GFX12_FAKE1696,
-                        MI, DecW, Address, CS);
-      if (Res && convertDPP8Inst(MI) == MCDisassembler::Success)
-        break;
-      MI = MCInst(); // clear
-
-      const auto convertVOPDPP = [&]() {
-        if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3P) {
-          convertVOP3PDPPInst(MI);
-        } else if (AMDGPU::isVOPC64DPP(MI.getOpcode())) {
-          convertVOPCDPPInst(MI); // Special VOP3 case
-        } else {
-          assert(MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3);
-          convertVOP3DPPInst(MI); // Regular VOP3 case
-        }
-      };
-      Res = tryDecodeInst(DecoderTableDPPGFX1196, DecoderTableDPPGFX11_FAKE1696,
-                          MI, DecW, Address, CS);
-      if (Res) {
-        convertVOPDPP();
-        break;
-      }
-      Res = tryDecodeInst(DecoderTableDPPGFX1296, DecoderTableDPPGFX12_FAKE1696,
-                          MI, DecW, Address, CS);
-      if (Res) {
-        convertVOPDPP();
-        break;
-      }
-      Res = tryDecodeInst(DecoderTableGFX1196, MI, DecW, Address, CS);
-      if (Res)
+
+      if (tryDecodeInst(DecoderTableGFX1196, DecoderTableGFX11_FAKE1696, MI,
+                        DecW, Address, CS))
         break;
 
-      Res = tryDecodeInst(DecoderTableGFX1296, MI, DecW, Address, CS);
-      if (Res)
+      if (tryDecodeInst(DecoderTableGFX1296, DecoderTableGFX12_FAKE1696, MI,
+                        DecW, Address, CS))
         break;
 
-      Res = tryDecodeInst(DecoderTableGFX12W6496, MI, DecW, Address, CS);
-      if (Res)
+      if (tryDecodeInst(DecoderTableGFX12W6496, MI, DecW, Address, CS))
         break;
     }
+
     // Reinitialize Bytes
     Bytes = Bytes_.slice(0, MaxInstBytesNum);
 
     if (Bytes.size() >= 8) {
       const uint64_t QW = eatBytes<uint64_t>(Bytes);
 
-      if (STI.hasFeature(AMDGPU::FeatureGFX10_BEncoding)) {
-        Res = tryDecodeInst(DecoderTableGFX10_B64, MI, QW, Address, CS);
-        if (Res) {
-          if (AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dpp8)
-              == -1)
-            break;
-          if (convertDPP8Inst(MI) == MCDisassembler::Success)
-            break;
-          MI = MCInst(); // clear
-        }
-      }
-
-      Res = tryDecodeInst(DecoderTableDPP864, MI, QW, Address, CS);
-      if (Res && convertDPP8Inst(MI) == MCDisassembler::Success)
-        break;
-      MI = MCInst(); // clear
-
-      Res = tryDecodeInst(DecoderTableDPP8GFX1164,
-                          DecoderTableDPP8GFX11_FAKE1664, MI, QW, Address, CS);
-      if (Res && convertDPP8Inst(MI) == MCDisassembler::Success)
-        break;
-      MI = MCInst(); // clear
-
-      Res = tryDecodeInst(DecoderTableDPP8GFX1264,
-                          DecoderTableDPP8GFX12_FAKE1664, MI, QW, Address, CS);
-      if (Res && convertDPP8Inst(MI) == MCDisassembler::Success)
+      if (STI.hasFeature(AMDGPU::FeatureGFX10_BEncoding) &&
+          tryDecodeInst(DecoderTableGFX10_B64, MI, QW, Address, CS))
         break;
-      MI = MCInst(); // clear
 
-      Res = tryDecodeInst(DecoderTableDPP64, MI, QW, Address, CS);
-      if (Res) break;
-
-      Res = tryDecodeInst(DecoderTableDPPGFX1164, DecoderTableDPPGFX11_FAKE1664,
-                          MI, QW, Address, CS);
-      if (Res) {
-        if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOPC)
-          convertVOPCDPPInst(MI);
-        break;
-      }
-
-      Res = tryDecodeInst(DecoderTableDPPGFX1264, DecoderTableDPPGFX12_FAKE1664,
-                          MI, QW, Address, CS);
-      if (Res) {
-        if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOPC)
-          convertVOPCDPPInst(MI);
+      if (STI.hasFeature(AMDGPU::FeatureUnpackedD16VMem) &&
+          tryDecodeInst(DecoderTableGFX80_UNPACKED64, MI, QW, Address, CS))
         break;
-      }
-
-      if (STI.hasFeature(AMDGPU::FeatureUnpackedD16VMem)) {
-        Res = tryDecodeInst(DecoderTableGFX80_UNPACKED64, MI, QW, Address, CS);
-        if (Res)
-          break;
-      }
 
       // Some GFX9 subtargets repurposed the v_mad_mix_f32, v_mad_mixlo_f16 and
       // v_mad_mixhi_f16 for FMA variants. Try to decode using this special
       // table first so we print the correct name.
-      if (STI.hasFeature(AMDGPU::FeatureFmaMixInsts)) {
-        Res = tryDecodeInst(DecoderTableGFX9_DL64, MI, QW, Address, CS);
-        if (Res)
-          break;
-      }
+      if (STI.hasFeature(AMDGPU::FeatureFmaMixInsts) &&
+          tryDecodeInst(DecoderTableGFX9_DL64, MI, QW, Address, CS))
+        break;
 
-      if (STI.hasFeature(AMDGPU::FeatureGFX940Insts)) {
-        Res = tryDecodeInst(DecoderTableGFX94064, MI, QW, Address, CS);
-        if (Res)
-          break;
-      }
+      if (STI.hasFeature(AMDGPU::FeatureGFX940Insts) &&
+          tryDecodeInst(DecoderTableGFX94064, MI, QW, Address, CS))
+        break;
 
-      if (STI.hasFeature(AMDGPU::FeatureGFX90AInsts)) {
-        Res = tryDecodeInst(DecoderTableGFX90A64, MI, QW, Address, CS);
-        if (Res)
-          break;
-      }
+      if (STI.hasFeature(AMDGPU::FeatureGFX90AInsts) &&
+          tryDecodeInst(DecoderTableGFX90A64, MI, QW, Address, CS))
+        break;
 
-      Res = tryDecodeInst(DecoderTableGFX864, MI, QW, Address, CS);
-      if (Res)
+      if (tryDecodeInst(DecoderTableGFX864, MI, QW, Address, CS))
         break;
 
-      Res = tryDecodeInst(DecoderTableGFX964, MI, QW, Address, CS);
-      if (Res)
+      if (tryDecodeInst(DecoderTableGFX964, MI, QW, Address, CS))
         break;
 
-      Res = tryDecodeInst(DecoderTableGFX1064, MI, QW, Address, CS);
-      if (Res)
+      if (tryDecodeInst(DecoderTableGFX1064, MI, QW, Address, CS))
         break;
 
-      Res = tryDecodeInst(DecoderTableGFX1264, DecoderTableGFX12_FAKE1664, MI,
-                          QW, Address, CS);
-      if (Res)
+      if (tryDecodeInst(DecoderTableGFX1264, DecoderTableGFX12_FAKE1664, MI, QW,
+                        Address, CS))
         break;
 
-      Res = tryDecodeInst(DecoderTableGFX1164, DecoderTableGFX11_FAKE1664, MI,
-                          QW, Address, CS);
-      if (Res)
+      if (tryDecodeInst(DecoderTableGFX1164, DecoderTableGFX11_FAKE1664, MI, QW,
+                        Address, CS))
         break;
 
-      Res = tryDecodeInst(DecoderTableGFX11W6464, MI, QW, Address, CS);
-      if (Res)
+      if (tryDecodeInst(DecoderTableGFX11W6464, MI, QW, Address, CS))
         break;
 
-      Res = tryDecodeInst(DecoderTableGFX12W6464, MI, QW, Address, CS);
-      if (Res)
+      if (tryDecodeInst(DecoderTableGFX12W6464, MI, QW, Address, CS))
         break;
     }
 
-    // Reinitialize Bytes as DPP64 could have eaten too much
+    // Reinitialize Bytes
     Bytes = Bytes_.slice(0, MaxInstBytesNum);
 
     // Try decode 32-bit instruction
-    if (Bytes.size() < 4) break;
-    const uint32_t DW = eatBytes<uint32_t>(Bytes);
-    Res = tryDecodeInst(DecoderTableGFX832, MI, DW, Address, CS);
-    if (Res) break;
+    if (Bytes.size() >= 4) {
+      const uint32_t DW = eatBytes<uint32_t>(Bytes);
 
-    Res = tryDecodeInst(DecoderTableAMDGPU32, MI, DW, Address, CS);
-    if (Res) break;
+      if (tryDecodeInst(DecoderTableGFX832, MI, DW, Address, CS))
+        break;
 
-    Res = tryDecodeInst(DecoderTableGFX932, MI, DW, Address, CS);
-    if (Res) break;
+      if (tryDecodeInst(DecoderTableAMDGPU32, MI, DW, Address, CS))
+        break;
 
-    if (STI.hasFeature(AMDGPU::FeatureGFX90AInsts)) {
-      Res = tryDecodeInst(DecoderTableGFX90A32, MI, DW, Address, CS);
-      if (Res)
+      if (tryDecodeInst(DecoderTableGFX932, MI, DW, Address, CS))
         break;
-    }
 
-    if (STI.hasFeature(AMDGPU::FeatureGFX10_BEncoding)) {
-      Res = tryDecodeInst(DecoderTableGFX10_B32, MI, DW, Address, CS);
-      if (Res) break;
-    }
+      if (STI.hasFeature(AMDGPU::FeatureGFX90AInsts) &&
+          tryDecodeInst(DecoderTableGFX90A32, MI, DW, Address, CS))
+        break;
+
+      if (STI.hasFeature(AMDGPU::FeatureGFX10_BEncoding) &&
+          tryDecodeInst(DecoderTableGFX10_B32, MI, DW, Address, CS))
+        break;
 
-    Res = tryDecodeInst(DecoderTableGFX1032, MI, DW, Address, CS);
-    if (Res) break;
+      if (tryDecodeInst(DecoderTableGFX1032, MI, DW, Address, CS))
+        break;
 
-    Res = tryDecodeInst(DecoderTableGFX1132, DecoderTableGFX11_FAKE1632, MI, DW,
-                        Address, CS);
-    if (Res) break;
+      if (tryDecodeInst(DecoderTableGFX1132, DecoderTableGFX11_FAKE1632, MI, DW,
+                        Address, CS))
+        break;
 
-    Res = tryDecodeInst(DecoderTableGFX1232, DecoderTableGFX12_FAKE1632, MI, DW,
-                        Address, CS);
+      if (tryDecodeInst(DecoderTableGFX1232, DecoderTableGFX12_FAKE1632, MI, DW,
+                        Address, CS))
+        break;
+    }
+
+    return MCDisassembler::Fail;
   } while (false);
 
-  if (Res && AMDGPU::isMAC(MI.getOpcode())) {
+  if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::DPP) {
+    if (isMacDPP(MI))
+      convertMacDPPInst(MI);
+
+    if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3P)
+      convertVOP3PDPPInst(MI);
+    else if ((MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOPC) ||
+             AMDGPU::isVOPC64DPP(MI.getOpcode()))
+      convertVOPCDPPInst(MI); // Special VOP3 case
+    else if (AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dpp8) !=
+             -1)
+      convertDPP8Inst(MI);
+    else if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3)
+      convertVOP3DPPInst(MI); // Regular VOP3 case
+  }
+
+  if (AMDGPU::isMAC(MI.getOpcode())) {
     // Insert dummy unused src2_modifiers.
     insertNamedMCOperand(MI, MCOperand::createImm(0),
                          AMDGPU::OpName::src2_modifiers);
   }
 
-  if (Res && (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp ||
-              MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp)) {
+  if (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp ||
+      MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp) {
     // Insert dummy unused src2_modifiers.
     insertNamedMCOperand(MI, MCOperand::createImm(0),
                          AMDGPU::OpName::src2_modifiers);
   }
 
-  if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::DS) &&
+  if ((MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::DS) &&
       !AMDGPU::hasGDS(STI)) {
     insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::gds);
   }
 
-  if (Res && (MCII->get(MI.getOpcode()).TSFlags &
-          (SIInstrFlags::MUBUF | SIInstrFlags::FLAT | SIInstrFlags::SMRD))) {
+  if (MCII->get(MI.getOpcode()).TSFlags &
+      (SIInstrFlags::MUBUF | SIInstrFlags::FLAT | SIInstrFlags::SMRD)) {
     int CPolPos = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                              AMDGPU::OpName::cpol);
     if (CPolPos != -1) {
@@ -700,9 +621,9 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
     }
   }
 
-  if (Res && (MCII->get(MI.getOpcode()).TSFlags &
-              (SIInstrFlags::MTBUF | SIInstrFlags::MUBUF)) &&
-             (STI.hasFeature(AMDGPU::FeatureGFX90AInsts))) {
+  if ((MCII->get(MI.getOpcode()).TSFlags &
+       (SIInstrFlags::MTBUF | SIInstrFlags::MUBUF)) &&
+      (STI.hasFeature(AMDGPU::FeatureGFX90AInsts))) {
     // GFX90A lost TFE, its place is occupied by ACC.
     int TFEOpIdx =
         AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::tfe);
@@ -713,8 +634,8 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
     }
   }
 
-  if (Res && (MCII->get(MI.getOpcode()).TSFlags &
-              (SIInstrFlags::MTBUF | SIInstrFlags::MUBUF))) {
+  if (MCII->get(MI.getOpcode()).TSFlags &
+      (SIInstrFlags::MTBUF | SIInstrFlags::MUBUF)) {
     int SWZOpIdx =
         AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
     if (SWZOpIdx != -1) {
@@ -724,7 +645,7 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
     }
   }
 
-  if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::MIMG)) {
+  if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::MIMG) {
     int VAddr0Idx =
         AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
     int RsrcIdx =
@@ -732,36 +653,32 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
     unsigned NSAArgs = RsrcIdx - VAddr0Idx - 1;
     if (VAddr0Idx >= 0 && NSAArgs > 0) {
       unsigned NSAWords = (NSAArgs + 3) / 4;
-      if (Bytes.size() < 4 * NSAWords) {
-        Res = MCDisassembler::Fail;
-      } else {
-        for (unsigned i = 0; i < NSAArgs; ++i) {
-          const unsigned VAddrIdx = VAddr0Idx + 1 + i;
-          auto VAddrRCID =
-              MCII->get(MI.getOpcode()).operands()[VAddrIdx].RegClass;
-          MI.insert(MI.begin() + VAddrIdx,
-                    createRegOperand(VAddrRCID, Bytes[i]));
-        }
-        Bytes = Bytes.slice(4 * NSAWords);
+      if (Bytes.size() < 4 * NSAWords)
+        return MCDisassembler::Fail;
+      for (unsigned i = 0; i < NSAArgs; ++i) {
+        const unsigned VAddrIdx = VAddr0Idx + 1 + i;
+        auto VAddrRCID =
+            MCII->get(MI.getOpcode()).operands()[VAddrIdx].RegClass;
+        MI.insert(MI.begin() + VAddrIdx, createRegOperand(VAddrRCID, Bytes[i]));
       }
+      Bytes = Bytes.slice(4 * NSAWords);
     }
 
-    if (Res)
-      Res = convertMIMGInst(MI);
+    convertMIMGInst(MI);
   }
 
-  if (Res && (MCII->get(MI.getOpcode()).TSFlags &
-              (SIInstrFlags::VIMAGE | SIInstrFlags::VSAMPLE)))
-    Res = convertMIMGInst(MI);
+  if (MCII->get(MI.getOpcode()).TSFlags &
+      (SIInstrFlags::VIMAGE | SIInstrFlags::VSAMPLE))
+    convertMIMGInst(MI);
 
-  if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::EXP))
-    Res = convertEXPInst(MI);
+  if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::EXP)
+    convertEXPInst(MI);
 
-  if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VINTERP))
-    Res = convertVINTERPInst(MI);
+  if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VINTERP)
+    convertVINTERPInst(MI);
 
-  if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::SDWA))
-    Res = convertSDWAInst(MI);
+  if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::SDWA)
+    convertSDWAInst(MI);
 
   int VDstIn_Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                               AMDGPU::OpName::vdst_in);
@@ -782,27 +699,23 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
   int ImmLitIdx =
       AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::imm);
   bool IsSOPK = MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::SOPK;
-  if (Res && ImmLitIdx != -1 && !IsSOPK)
-    Res = convertFMAanyK(MI, ImmLitIdx);
+  if (ImmLitIdx != -1 && !IsSOPK)
+    convertFMAanyK(MI, ImmLitIdx);
 
-  // if the opcode was not recognized we'll assume a Size of 4 bytes
-  // (unless there are fewer bytes left)
-  Size = Res ? (MaxInstBytesNum - Bytes.size())
-             : std::min((size_t)4, Bytes_.size());
-  return Res;
+  Size = MaxInstBytesNum - Bytes.size();
+  return MCDisassembler::Success;
 }
 
-DecodeStatus AMDGPUDisassembler::convertEXPInst(MCInst &MI) const {
+void AMDGPUDisassembler::convertEXPInst(MCInst &MI) const {
   if (STI.hasFeature(AMDGPU::FeatureGFX11Insts)) {
     // The MCInst still has these fields even though they are no longer encoded
     // in the GFX11 instruction.
     insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::vm);
     insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::compr);
   }
-  return MCDisassembler::Success;
 }
 
-DecodeStatus AMDGPUDisassembler::convertVINTERPInst(MCInst &MI) const {
+void AMDGPUDisassembler::convertVINTERPInst(MCInst &MI) const {
   if (MI.getOpcode() == AMDGPU::V_INTERP_P10_F16_F32_inreg_gfx11 ||
       MI.getOpcode() == AMDGPU::V_INTERP_P10_F16_F32_inreg_gfx12 ||
       MI.getOpcode() == AMDGPU::V_INTERP_P10_RTZ_F16_F32_inreg_gfx11 ||
@@ -815,10 +728,9 @@ DecodeStatus AMDGPUDisassembler::convertVINTERPInst(MCInst &MI) const {
     // instruction.
     insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::op_sel);
   }
-  return MCDisassembler::Success;
 }
 
-DecodeStatus AMDGPUDisassembler::convertSDWAInst(MCInst &MI) const {
+void AMDGPUDisassembler::convertSDWAInst(MCInst &MI) const {
   if (STI.hasFeature(AMDGPU::FeatureGFX9) ||
       STI.hasFeature(AMDGPU::FeatureGFX10)) {
     if (AMDGPU::hasNamedOperand(MI.getOpcode(), AMDGPU::OpName::sdst))
@@ -835,7 +747,6 @@ DecodeStatus AMDGPUDisassembler::convertSDWAInst(MCInst &MI) const {
       insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::omod);
     }
   }
-  return MCDisassembler::Success;
 }
 
 struct VOPModifiers {
@@ -939,56 +850,40 @@ void AMDGPUDisassembler::convertMacDPPInst(MCInst &MI) const {
                        AMDGPU::OpName::src2_modifiers);
 }
 
-// We must check FI == literal to reject not genuine dpp8 insts, and we must
-// first add optional MI operands to check FI
-DecodeStatus AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const {
+void AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const {
   unsigned Opc = MI.getOpcode();
 
-  if (MCII->get(Opc).TSFlags & SIInstrFlags::VOP3P) {
-    convertVOP3PDPPInst(MI);
-  } else if ((MCII->get(Opc).TSFlags & SIInstrFlags::VOPC) ||
-             AMDGPU::isVOPC64DPP(Opc)) {
-    convertVOPCDPPInst(MI);
-  } else {
-    if (isMacDPP(MI))
-      convertMacDPPInst(MI);
+  int VDstInIdx =
+      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst_in);
+  if (VDstInIdx != -1)
+    insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::vdst_in);
 
-    int VDstInIdx =
-        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst_in);
-    if (VDstInIdx != -1)
-      insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::vdst_in);
+  if (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp8_gfx12 ||
+      MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp8_gfx12)
+    insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::src2);
 
-    if (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp8_gfx12 ||
-        MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp8_gfx12)
-      insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::src2);
+  unsigned DescNumOps = MCII->get(Opc).getNumOperands();
+  if (MI.getNumOperands() < DescNumOps &&
+      AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel)) {
+    convertTrue16OpSel(MI);
+    auto Mods = collectVOPModifiers(MI);
+    insertNamedMCOperand(MI, MCOperand::createImm(Mods.OpSel),
+                         AMDGPU::OpName::op_sel);
+  } else {
+    // Insert dummy unused src modifiers.
+    if (MI.getNumOperands() < DescNumOps &&
+        AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::src0_modifiers))
+      insertNamedMCOperand(MI, MCOperand::createImm(0),
+                           AMDGPU::OpName::src0_modifiers);
 
-    unsigned DescNumOps = MCII->get(Opc).getNumOperands();
     if (MI.getNumOperands() < DescNumOps &&
-        AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel)) {
-      convertTrue16OpSel(MI);
-      auto Mods = collectVOPModifiers(MI);
-      insertNamedMCOperand(MI, MCOperand::createImm(Mods.OpSel),
-                           AMDGPU::OpName::op_sel);
-    } else {
-      // Insert dummy unused src modifiers.
-      if (MI.getNumOperands() < DescNumOps &&
-          AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::src0_modifiers))
-        insertNamedMCOperand(MI, MCOperand::createImm(0),
-                             AMDGPU::OpName::src0_modifiers);
-
-      if (MI.getNumOperands() < DescNumOps &&
-          AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::src1_modifiers))
-        insertNamedMCOperand(MI, MCOperand::createImm(0),
-                             AMDGPU::OpName::src1_modifiers);
-    }
+        AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::src1_modifiers))
+      insertNamedMCOperand(MI, MCOperand::createImm(0),
+                           AMDGPU::OpName::src1_modifiers);
   }
-  return isValidDPP8(MI) ? MCDisassembler::Success : MCDisassembler::SoftFail;
 }
 
-DecodeStatus AMDGPUDisassembler::convertVOP3DPPInst(MCInst &MI) const {
-  if (isMacDPP(MI))
-    convertMacDPPInst(MI);
-
+void AMDGPUDisassembler::convertVOP3DPPInst(MCInst &MI) const {
   convertTrue16OpSel(MI);
 
   int VDstInIdx =
@@ -1008,13 +903,12 @@ DecodeStatus AMDGPUDisassembler::convertVOP3DPPInst(MCInst &MI) const {
     insertNamedMCOperand(MI, MCOperand::createImm(Mods.OpSel),
                          AMDGPU::OpName::op_sel);
   }
-  return MCDisassembler::Success;
 }
 
 // Note that before gfx10, the MIMG encoding provided no information about
 // VADDR size. Consequently, decoded instructions always show address as if it
 // has 1 dword, which could be not really so.
-DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
+void AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
   auto TSFlags = MCII->get(MI.getOpcode()).TSFlags;
 
   int VDstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
@@ -1043,7 +937,7 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
   if (BaseOpcode->BVH) {
     // Add A16 operand for intersect_ray instructions
     addOperand(MI, MCOperand::createImm(BaseOpcode->A16));
-    return MCDisassembler::Success;
+    return;
   }
 
   bool IsAtomic = (VDstIdx != -1);
@@ -1078,7 +972,7 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
         if (!STI.hasFeature(AMDGPU::FeaturePartialNSAEncoding)) {
           // The NSA encoding does not contain enough operands for the
           // combination of base opcode / dimension. Should this be an error?
-          return MCDisassembler::Success;
+          return;
         }
         IsPartialNSA = true;
       }
@@ -1097,12 +991,12 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
     DstSize += 1;
 
   if (DstSize == Info->VDataDwords && AddrSize == Info->VAddrDwords)
-    return MCDisassembler::Success;
+    return;
 
   int NewOpcode =
       AMDGPU::getMIMGOpcode(Info->BaseOpcode, Info->MIMGEncoding, DstSize, AddrSize);
   if (NewOpcode == -1)
-    return MCDisassembler::Success;
+    return;
 
   // Widen the register to the correct number of enabled channels.
   unsigned NewVdata = AMDGPU::NoRegister;
@@ -1119,7 +1013,7 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
     if (NewVdata == AMDGPU::NoRegister) {
       // It's possible to encode this such that the low register + enabled
       // components exceeds the register count.
-      return MCDisassembler::Success;
+      return;
     }
   }
 
@@ -1137,7 +1031,7 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
     NewVAddrSA = MRI.getMatchingSuperReg(VAddrSA, AMDGPU::sub0,
                                         &MRI.getRegClass(AddrRCID));
     if (!NewVAddrSA)
-      return MCDisassembler::Success;
+      return;
   }
 
   MI.setOpcode(NewOpcode);
@@ -1158,14 +1052,12 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
     MI.erase(MI.begin() + VAddr0Idx + AddrSize,
              MI.begin() + VAddr0Idx + Info->VAddrDwords);
   }
-
-  return MCDisassembler::Success;
 }
 
 // Opsel and neg bits are used in src_modifiers and standalone operands. Autogen
 // decoder only adds to src_modifiers, so manually add the bits to the other
 // operands.
-DecodeStatus AMDGPUDisassembler::convertVOP3PDPPInst(MCInst &MI) const {
+void AMDGPUDisassembler::convertVOP3PDPPInst(MCInst &MI) const {
   unsigned Opc = MI.getOpcode();
   unsigned DescNumOps = MCII->get(Opc).getNumOperands();
   auto Mods = collectVOPModifiers(MI, true);
@@ -1190,12 +1082,10 @@ DecodeStatus AMDGPUDisassembler::convertVOP3PDPPInst(MCInst &MI) const {
       AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::neg_hi))
     insertNamedMCOperand(MI, MCOperand::createImm(Mods.NegHi),
                          AMDGPU::OpName::neg_hi);
-
-  return MCDisassembler::Success;
 }
 
 // Create dummy old operand and insert optional operands
-DecodeStatus AMDGPUDisassembler::convertVOPCDPPInst(MCInst &MI) const {
+void AMDGPUDisassembler::convertVOPCDPPInst(MCInst &MI) const {
   unsigned Opc = MI.getOpcode();
   unsigned DescNumOps = MCII->get(Opc).getNumOperands();
 
@@ -1212,11 +1102,9 @@ DecodeStatus AMDGPUDisassembler::convertVOPCDPPInst(MCInst &MI) const {
       AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::src1_modifiers))
     insertNamedMCOperand(MI, MCOperand::createImm(0),
                          AMDGPU::OpName::src1_modifiers);
-  return MCDisassembler::Success;
 }
 
-DecodeStatus AMDGPUDisassembler::convertFMAanyK(MCInst &MI,
-                                                int ImmLitIdx) const {
+void AMDGPUDisassembler::convertFMAanyK(MCInst &MI, int ImmLitIdx) const {
   assert(HasLiteral && "Should have decoded a literal");
   const MCInstrDesc &Desc = MCII->get(MI.getOpcode());
   unsigned DescNumOps = Desc.getNumOperands();
@@ -1232,7 +1120,6 @@ DecodeStatus AMDGPUDisassembler::convertFMAanyK(MCInst &MI,
         IsDeferredOp)
       Op.setImm(Literal);
   }
-  return MCDisassembler::Success;
 }
 
 const char* AMDGPUDisassembler::getRegClassName(unsigned RegClassID) const {
@@ -1831,6 +1718,12 @@ MCOperand AMDGPUDisassembler::decodeSplitBarrier(unsigned Val) const {
   return decodeSrcOp(OPW32, Val);
 }
 
+MCOperand AMDGPUDisassembler::decodeDpp8FI(unsigned Val) const {
+  if (Val != AMDGPU::DPP::DPP8_FI_0 && Val != AMDGPU::DPP::DPP8_FI_1)
+    return MCOperand();
+  return MCOperand::createImm(Val);
+}
+
 bool AMDGPUDisassembler::isVI() const {
   return STI.hasFeature(AMDGPU::FeatureVolcanicIslands);
 }
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index 3142b8a..2e1b6fb 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -194,15 +194,15 @@ public:
   DecodeStatus decodeCOMPUTE_PGM_RSRC3(uint32_t FourByteBuffer,
                                        raw_string_ostream &KdStream) const;
 
-  DecodeStatus convertEXPInst(MCInst &MI) const;
-  DecodeStatus convertVINTERPInst(MCInst &MI) const;
-  DecodeStatus convertFMAanyK(MCInst &MI, int ImmLitIdx) const;
-  DecodeStatus convertSDWAInst(MCInst &MI) const;
-  DecodeStatus convertDPP8Inst(MCInst &MI) const;
-  DecodeStatus convertMIMGInst(MCInst &MI) const;
-  DecodeStatus convertVOP3DPPInst(MCInst &MI) const;
-  DecodeStatus convertVOP3PDPPInst(MCInst &MI) const;
-  DecodeStatus convertVOPCDPPInst(MCInst &MI) const;
+  void convertEXPInst(MCInst &MI) const;
+  void convertVINTERPInst(MCInst &MI) const;
+  void convertFMAanyK(MCInst &MI, int ImmLitIdx) const;
+  void convertSDWAInst(MCInst &MI) const;
+  void convertDPP8Inst(MCInst &MI) const;
+  void convertMIMGInst(MCInst &MI) const;
+  void convertVOP3DPPInst(MCInst &MI) const;
+  void convertVOP3PDPPInst(MCInst &MI) const;
+  void convertVOPCDPPInst(MCInst &MI) const;
   void convertMacDPPInst(MCInst &MI) const;
   void convertTrue16OpSel(MCInst &MI) const;
 
@@ -261,6 +261,7 @@ public:
 
   MCOperand decodeBoolReg(unsigned Val) const;
   MCOperand decodeSplitBarrier(unsigned Val) const;
+  MCOperand decodeDpp8FI(unsigned Val) const;
 
   int getTTmpIdx(unsigned Val) const;
 
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index a727134..00fa93c 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -177,7 +177,7 @@ static bool isLdsDma(const MachineInstr &MI) {
 static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
   const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
                                                      AMDGPU::OpName::simm16);
-  return RegOp->getImm() & AMDGPU::Hwreg::ID_MASK_;
+  return std::get<0>(AMDGPU::Hwreg::HwregEncoding::decode(RegOp->getImm()));
 }
 
 ScheduleHazardRecognizer::HazardType
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index a45fea6..a32be1e 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -1778,13 +1778,9 @@ void AMDGPUInstPrinter::printSDelayALU(const MCInst *MI, unsigned OpNo,
 
 void AMDGPUInstPrinter::printHwreg(const MCInst *MI, unsigned OpNo,
                                    const MCSubtargetInfo &STI, raw_ostream &O) {
-  unsigned Id;
-  unsigned Offset;
-  unsigned Width;
-
   using namespace llvm::AMDGPU::Hwreg;
   unsigned Val = MI->getOperand(OpNo).getImm();
-  decodeHwreg(Val, Id, Offset, Width);
+  auto [Id, Offset, Width] = HwregEncoding::decode(Val);
   StringRef HwRegName = getHwreg(Id, STI);
 
   O << "hwreg(";
@@ -1793,9 +1789,8 @@ void AMDGPUInstPrinter::printHwreg(const MCInst *MI, unsigned OpNo,
   } else {
     O << Id;
   }
-  if (Width != WIDTH_DEFAULT_ || Offset != OFFSET_DEFAULT_) {
+  if (Width != HwregSize::Default || Offset != HwregOffset::Default)
     O << ", " << Offset << ", " << Width;
-  }
   O << ')';
 }
 
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index 98310c3..0b516bf 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -549,33 +549,12 @@ enum Id { // HwRegCode, (6) [5:0]
   ID_SQ_PERF_SNAPSHOT_DATA1 = 22,
   ID_SQ_PERF_SNAPSHOT_PC_LO = 23,
   ID_SQ_PERF_SNAPSHOT_PC_HI = 24,
-
-  ID_SHIFT_ = 0,
-  ID_WIDTH_ = 6,
-  ID_MASK_ = (((1 << ID_WIDTH_) - 1) << ID_SHIFT_)
 };
 
 enum Offset : unsigned { // Offset, (5) [10:6]
-  OFFSET_DEFAULT_ = 0,
-  OFFSET_SHIFT_ = 6,
-  OFFSET_WIDTH_ = 5,
-  OFFSET_MASK_ = (((1 << OFFSET_WIDTH_) - 1) << OFFSET_SHIFT_),
-
   OFFSET_MEM_VIOL = 8,
 };
 
-enum WidthMinusOne : unsigned { // WidthMinusOne, (5) [15:11]
-  WIDTH_M1_DEFAULT_ = 31,
-  WIDTH_M1_SHIFT_ = 11,
-  WIDTH_M1_WIDTH_ = 5,
-  WIDTH_M1_MASK_ = (((1 << WIDTH_M1_WIDTH_) - 1) << WIDTH_M1_SHIFT_),
-};
-
-// Some values from WidthMinusOne mapped into Width domain.
-enum Width : unsigned {
-  WIDTH_DEFAULT_ = WIDTH_M1_DEFAULT_ + 1,
-};
-
 enum ModeRegisterMasks : uint32_t {
   FP_ROUND_MASK = 0xf << 0,  // Bits 0..3
   FP_DENORM_MASK = 0xf << 4, // Bits 4..7
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index d02aee7..4f106bf 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -478,14 +478,13 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
         .addImm(0);
       Addc->getOperand(3).setIsDead(); // Mark SCC as dead.
 
-      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)).
-        addReg(FlatScrInitLo).
-        addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_LO |
-                       (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
-      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)).
-        addReg(FlatScrInitHi).
-        addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_HI |
-                       (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
+      using namespace AMDGPU::Hwreg;
+      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32))
+          .addReg(FlatScrInitLo)
+          .addImm(int16_t(HwregEncoding::encode(ID_FLAT_SCR_LO, 0, 32)));
+      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32))
+          .addReg(FlatScrInitHi)
+          .addImm(int16_t(HwregEncoding::encode(ID_FLAT_SCR_HI, 0, 32)));
       return;
     }
 
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 257dff6..d8f528d8 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3960,7 +3960,7 @@ SDValue SITargetLowering::lowerGET_ROUNDING(SDValue Op,
   assert(Op.getValueType() == MVT::i32);
 
   uint32_t BothRoundHwReg =
-      AMDGPU::Hwreg::encodeHwreg(AMDGPU::Hwreg::ID_MODE, 0, 4);
+      AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 4);
   SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
 
   SDValue IntrinID =
@@ -4195,8 +4195,8 @@ SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI,
 
   MachineBasicBlock::iterator I = LoopBB->end();
 
-  const unsigned EncodedReg = AMDGPU::Hwreg::encodeHwreg(
-    AMDGPU::Hwreg::ID_TRAPSTS, AMDGPU::Hwreg::OFFSET_MEM_VIOL, 1);
+  const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
+      AMDGPU::Hwreg::ID_TRAPSTS, AMDGPU::Hwreg::OFFSET_MEM_VIOL, 1);
 
   // Clear TRAP_STS.MEM_VIOL
   BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
@@ -4999,18 +4999,16 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
     // Otherwise there was overflow and the result is hi2:0. In both cases the
     // result should represent the actual time at some point during the sequence
     // of three getregs.
+    using namespace AMDGPU::Hwreg;
     Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
     BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1)
-        .addImm(AMDGPU::Hwreg::encodeHwreg(AMDGPU::Hwreg::ID_SHADER_CYCLES_HI,
-                                           0, 32));
+        .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
     Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
     BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1)
-        .addImm(
-            AMDGPU::Hwreg::encodeHwreg(AMDGPU::Hwreg::ID_SHADER_CYCLES, 0, 32));
+        .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
     Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
     BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2)
-        .addImm(AMDGPU::Hwreg::encodeHwreg(AMDGPU::Hwreg::ID_SHADER_CYCLES_HI,
-                                           0, 32));
+        .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
     BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
         .addReg(RegHi1)
         .addReg(RegHi2);
@@ -5207,8 +5205,8 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
     // FIXME: This could be predicates on the immediate, but tablegen doesn't
     // allow you to have a no side effect instruction in the output of a
     // sideeffecting pattern.
-    unsigned ID, Offset, Width;
-    AMDGPU::Hwreg::decodeHwreg(MI.getOperand(1).getImm(), ID, Offset, Width);
+    auto [ID, Offset, Width] =
+        AMDGPU::Hwreg::HwregEncoding::decode(MI.getOperand(1).getImm());
     if (ID != AMDGPU::Hwreg::ID_MODE)
       return BB;
 
@@ -10495,9 +10493,8 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
   SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32,
                                      DenominatorScaled, Flags);
 
-  const unsigned Denorm32Reg = AMDGPU::Hwreg::ID_MODE |
-                               (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
-                               (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
+  using namespace AMDGPU::Hwreg;
+  const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
   const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32);
 
   const MachineFunction &MF = DAG.getMachineFunction();
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 6ecb1c8..a6184c5 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -480,6 +480,10 @@ public:
   // WaitEventType to corresponding counter values in InstCounterType.
   virtual const unsigned *getWaitEventMask() const = 0;
 
+  // Returns a new waitcnt with all counters except VScnt set to 0. If
+  // IncludeVSCnt is true, VScnt is set to 0, otherwise it is set to ~0u.
+  virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0;
+
   virtual ~WaitcntGenerator() = default;
 };
 
@@ -516,6 +520,8 @@ public:
 
     return WaitEventMaskForInstPreGFX12;
   }
+
+  virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
 };
 
 class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
@@ -549,6 +555,8 @@ public:
 
     return WaitEventMaskForInstGFX12Plus;
   }
+
+  virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
 };
 
 class SIInsertWaitcnts : public MachineFunctionPass {
@@ -1304,6 +1312,16 @@ bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
   return Modified;
 }
 
+AMDGPU::Waitcnt
+WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(bool IncludeVSCnt) const {
+  return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt && ST->hasVscnt() ? 0 : ~0u);
+}
+
+AMDGPU::Waitcnt
+WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const {
+  return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0);
+}
+
 /// Combine consecutive S_WAIT_*CNT instructions that precede \p It and
 /// follow \p OldWaitcntInstr and apply any extra waits from \p Wait that
 /// were added by previous passes. Currently this pass conservatively
@@ -1613,8 +1631,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
       MI.getOpcode() == AMDGPU::SI_RETURN ||
       MI.getOpcode() == AMDGPU::S_SETPC_B64_return ||
       (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
-    Wait = Wait.combined(
-        AMDGPU::Waitcnt::allZeroExceptVsCnt(ST->hasExtendedWaitCounts()));
+    Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
   }
   // Identify S_ENDPGM instructions which may have to wait for outstanding VMEM
   // stores. In this case it can be useful to send a message to explicitly
@@ -1834,8 +1851,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
   // cause an exception. Otherwise, insert an explicit S_WAITCNT 0 here.
   if (MI.getOpcode() == AMDGPU::S_BARRIER &&
       !ST->hasAutoWaitcntBeforeBarrier() && !ST->supportsBackOffBarrier()) {
-    Wait = Wait.combined(
-        AMDGPU::Waitcnt::allZero(ST->hasExtendedWaitCounts(), ST->hasVscnt()));
+    Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/true));
   }
 
   // TODO: Remove this work-around, enable the assert for Bug 457939
@@ -1851,7 +1867,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
   ScoreBrackets.simplifyWaitcnt(Wait);
 
   if (ForceEmitZeroWaitcnts)
-    Wait = AMDGPU::Waitcnt::allZeroExceptVsCnt(ST->hasExtendedWaitCounts());
+    Wait = WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false);
 
   if (ForceEmitWaitcnt[LOAD_CNT])
     Wait.LoadCnt = 0;
@@ -2089,7 +2105,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
     if (callWaitsOnFunctionReturn(Inst)) {
       // Act as a wait on everything
       ScoreBrackets->applyWaitcnt(
-          AMDGPU::Waitcnt::allZeroExceptVsCnt(ST->hasExtendedWaitCounts()));
+          WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
       ScoreBrackets->setStateOnFunctionEntryOrReturn();
     } else {
       // May need to way wait for anything.
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 97c7237..34cdb09 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -987,8 +987,8 @@ def SDWAVopcDst : BoolRC {
 }
 
 class NamedIntOperand<ValueType Type, string Prefix, bit Optional = 1,
-                      string ConvertMethod = "nullptr">
-    : CustomOperand<Type, Optional, NAME> {
+                      string name = NAME, string ConvertMethod = "nullptr">
+    : CustomOperand<Type, Optional, name> {
   let ParserMethod =
     "[this](OperandVector &Operands) -> ParseStatus { "#
     "return parseIntWithPrefix(\""#Prefix#"\", Operands, "#
@@ -1090,9 +1090,12 @@ let DefaultValue = "0xf" in {
 def DppRowMask : NamedIntOperand<i32, "row_mask">;
 def DppBankMask : NamedIntOperand<i32, "bank_mask">;
 }
-def DppBoundCtrl : NamedIntOperand<i1, "bound_ctrl", 1,
+def DppBoundCtrl : NamedIntOperand<i1, "bound_ctrl", 1, "DppBoundCtrl",
     "[this] (int64_t &BC) -> bool { return convertDppBoundCtrl(BC); }">;
-def DppFI : NamedIntOperand<i32, "fi">;
+
+let DecoderMethod = "decodeDpp8FI" in
+def Dpp8FI : NamedIntOperand<i32, "fi", 1, "DppFI">;
+def Dpp16FI : NamedIntOperand<i32, "fi", 1, "DppFI">;
 
 def blgp : CustomOperand<i32, 1, "BLGP">;
 def CBSZ : NamedIntOperand<i32, "cbsz">;
@@ -1823,7 +1826,7 @@ class getInsDPP16 <RegisterOperand OldRC, RegisterOperand Src0RC, RegisterOperan
                    Operand Src0Mod, Operand Src1Mod, Operand Src2Mod, bit HasOld = 1> {
   dag ret = !con(getInsDPP<OldRC, Src0RC, Src1RC, Src2RC, NumSrcArgs,
                            HasModifiers, Src0Mod, Src1Mod, Src2Mod, HasOld>.ret,
-                 (ins DppFI:$fi));
+                 (ins Dpp16FI:$fi));
 }
 
 class getInsDPP8 <RegisterOperand OldRC, RegisterOperand Src0RC, RegisterOperand Src1RC,
@@ -1831,7 +1834,7 @@ class getInsDPP8 <RegisterOperand OldRC, RegisterOperand Src0RC, RegisterOperand
                   Operand Src0Mod, Operand Src1Mod, Operand Src2Mod, bit HasOld = 1> {
   dag ret = !con(getInsDPPBase<OldRC, Src0RC, Src1RC, Src2RC, NumSrcArgs,
                            HasModifiers, Src0Mod, Src1Mod, Src2Mod, HasOld>.ret,
-                 (ins dpp8:$dpp8, DppFI:$fi));
+                 (ins dpp8:$dpp8, Dpp8FI:$fi));
 }
 
 class getInsVOP3DPPBase<dag VOP3Base, RegisterOperand OldRC, int NumSrcArgs, bit HasOld> {
@@ -1851,12 +1854,12 @@ class getInsVOP3DPP<dag VOP3Base, RegisterOperand OldRC, int NumSrcArgs, bit Has
 
 class getInsVOP3DPP16<dag VOP3Base, RegisterOperand OldRC, int NumSrcArgs, bit HasOld = 1> {
   dag ret = !con(getInsVOP3DPP<VOP3Base,OldRC,NumSrcArgs,HasOld>.ret,
-                 (ins DppFI:$fi));
+                 (ins Dpp16FI:$fi));
 }
 
 class getInsVOP3DPP8<dag VOP3Base, RegisterOperand OldRC, int NumSrcArgs, bit HasOld = 1> {
   dag ret = !con(getInsVOP3DPPBase<VOP3Base,OldRC,NumSrcArgs,HasOld>.ret,
-                 (ins dpp8:$dpp8, DppFI:$fi));
+                 (ins dpp8:$dpp8, Dpp8FI:$fi));
 }
 
 // Ins for SDWA
diff --git a/llvm/lib/Target/AMDGPU/SIModeRegister.cpp b/llvm/lib/Target/AMDGPU/SIModeRegister.cpp
index e62ad02..c01b126 100644
--- a/llvm/lib/Target/AMDGPU/SIModeRegister.cpp
+++ b/llvm/lib/Target/AMDGPU/SIModeRegister.cpp
@@ -225,11 +225,10 @@ void SIModeRegister::insertSetreg(MachineBasicBlock &MBB, MachineInstr *MI,
     unsigned Offset = llvm::countr_zero<unsigned>(InstrMode.Mask);
     unsigned Width = llvm::countr_one<unsigned>(InstrMode.Mask >> Offset);
     unsigned Value = (InstrMode.Mode >> Offset) & ((1 << Width) - 1);
+    using namespace AMDGPU::Hwreg;
     BuildMI(MBB, MI, nullptr, TII->get(AMDGPU::S_SETREG_IMM32_B32))
         .addImm(Value)
-        .addImm(((Width - 1) << AMDGPU::Hwreg::WIDTH_M1_SHIFT_) |
-                (Offset << AMDGPU::Hwreg::OFFSET_SHIFT_) |
-                (AMDGPU::Hwreg::ID_MODE << AMDGPU::Hwreg::ID_SHIFT_));
+        .addImm(HwregEncoding::encode(ID_MODE, Offset, Width));
     ++NumSetregInserted;
     Changed = true;
     InstrMode.Mask &= ~(((1 << Width) - 1) << Offset);
@@ -276,15 +275,11 @@ void SIModeRegister::processBlockPhase1(MachineBasicBlock &MBB,
       // as we assume it has been inserted by a higher authority (this is
       // likely to be a very rare occurrence).
       unsigned Dst = TII->getNamedOperand(MI, AMDGPU::OpName::simm16)->getImm();
-      if (((Dst & AMDGPU::Hwreg::ID_MASK_) >> AMDGPU::Hwreg::ID_SHIFT_) !=
-          AMDGPU::Hwreg::ID_MODE)
+      using namespace AMDGPU::Hwreg;
+      auto [Id, Offset, Width] = HwregEncoding::decode(Dst);
+      if (Id != ID_MODE)
         continue;
 
-      unsigned Width = ((Dst & AMDGPU::Hwreg::WIDTH_M1_MASK_) >>
-                        AMDGPU::Hwreg::WIDTH_M1_SHIFT_) +
-                       1;
-      unsigned Offset =
-          (Dst & AMDGPU::Hwreg::OFFSET_MASK_) >> AMDGPU::Hwreg::OFFSET_SHIFT_;
       unsigned Mask = maskTrailingOnes<unsigned>(Width) << Offset;
 
       // If an InsertionPoint is set we will insert a setreg there.
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index dacdf7b..ce91e05 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1698,22 +1698,14 @@ int64_t getHwregId(const StringRef Name, const MCSubtargetInfo &STI) {
   return (Idx < 0) ? Idx : Opr[Idx].Encoding;
 }
 
-bool isValidHwreg(int64_t Id) {
-  return 0 <= Id && isUInt<ID_WIDTH_>(Id);
-}
+bool isValidHwreg(int64_t Id) { return 0 <= Id && isUInt<HwregId::Width>(Id); }
 
 bool isValidHwregOffset(int64_t Offset) {
-  return 0 <= Offset && isUInt<OFFSET_WIDTH_>(Offset);
+  return 0 <= Offset && isUInt<HwregOffset::Width>(Offset);
 }
 
 bool isValidHwregWidth(int64_t Width) {
-  return 0 <= (Width - 1) && isUInt<WIDTH_M1_WIDTH_>(Width - 1);
-}
-
-uint64_t encodeHwreg(uint64_t Id, uint64_t Offset, uint64_t Width) {
-  return (Id << ID_SHIFT_) |
-         (Offset << OFFSET_SHIFT_) |
-         ((Width - 1) << WIDTH_M1_SHIFT_);
+  return 0 <= (Width - 1) && isUInt<HwregSize::Width>(Width - 1);
 }
 
 StringRef getHwreg(unsigned Id, const MCSubtargetInfo &STI) {
@@ -1721,12 +1713,6 @@ StringRef getHwreg(unsigned Id, const MCSubtargetInfo &STI) {
   return (Idx < 0) ? "" : Opr[Idx].Name;
 }
 
-void decodeHwreg(unsigned Val, unsigned &Id, unsigned &Offset, unsigned &Width) {
-  Id = (Val & ID_MASK_) >> ID_SHIFT_;
-  Offset = (Val & OFFSET_MASK_) >> OFFSET_SHIFT_;
-  Width = ((Val & WIDTH_M1_MASK_) >> WIDTH_M1_SHIFT_) + 1;
-}
-
 } // namespace Hwreg
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index f35e7744..6826cd2 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -322,6 +322,35 @@ getNumVGPRBlocks(const MCSubtargetInfo *STI, unsigned NumSGPRs,
 
 } // end namespace IsaInfo
 
+// Represents a field in an encoded value.
+template <unsigned HighBit, unsigned LowBit, unsigned D = 0>
+struct EncodingField {
+  static_assert(HighBit >= LowBit, "Invalid bit range!");
+  static constexpr unsigned Offset = LowBit;
+  static constexpr unsigned Width = HighBit - LowBit + 1;
+
+  using ValueType = unsigned;
+  static constexpr ValueType Default = D;
+
+  ValueType Value;
+  constexpr EncodingField(ValueType Value) : Value(Value) {}
+
+  constexpr uint64_t encode() const { return Value; }
+  static ValueType decode(uint64_t Encoded) { return Encoded; }
+};
+
+// A helper for encoding and decoding multiple fields.
+template <typename... Fields> struct EncodingFields {
+  static constexpr uint64_t encode(Fields... Values) {
+    return ((Values.encode() << Values.Offset) | ...);
+  }
+
+  static std::tuple<typename Fields::ValueType...> decode(uint64_t Encoded) {
+    return {Fields::decode((Encoded >> Fields::Offset) &
+                           maxUIntN(Fields::Width))...};
+  }
+};
+
 LLVM_READONLY
 int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx);
 
@@ -870,15 +899,6 @@ struct Waitcnt {
       : LoadCnt(LoadCnt), ExpCnt(ExpCnt), DsCnt(DsCnt), StoreCnt(StoreCnt),
         SampleCnt(SampleCnt), BvhCnt(BvhCnt), KmCnt(KmCnt) {}
 
-  static Waitcnt allZero(bool Extended, bool HasStorecnt) {
-    return Extended ? Waitcnt(0, 0, 0, 0, 0, 0, 0)
-                    : Waitcnt(0, 0, 0, HasStorecnt ? 0 : ~0u);
-  }
-
-  static Waitcnt allZeroExceptVsCnt(bool Extended) {
-    return Extended ? Waitcnt(0, 0, 0, ~0u, 0, 0, 0) : Waitcnt(0, 0, 0, ~0u);
-  }
-
   bool hasWait() const { return StoreCnt != ~0u || hasWaitExceptStoreCnt(); }
 
   bool hasWaitExceptStoreCnt() const {
@@ -1030,6 +1050,17 @@ unsigned encodeStorecntDscnt(const IsaVersion &Version, const Waitcnt &Decoded);
 
 namespace Hwreg {
 
+using HwregId = EncodingField<5, 0>;
+using HwregOffset = EncodingField<10, 6>;
+
+struct HwregSize : EncodingField<15, 11, 32> {
+  using EncodingField::EncodingField;
+  constexpr uint64_t encode() const { return Value - 1; }
+  static ValueType decode(uint64_t Encoded) { return Encoded + 1; }
+};
+
+using HwregEncoding = EncodingFields<HwregId, HwregOffset, HwregSize>;
+
 LLVM_READONLY
 int64_t getHwregId(const StringRef Name, const MCSubtargetInfo &STI);
 
@@ -1043,13 +1074,8 @@ LLVM_READNONE
 bool isValidHwregWidth(int64_t Width);
 
 LLVM_READNONE
-uint64_t encodeHwreg(uint64_t Id, uint64_t Offset, uint64_t Width);
-
-LLVM_READNONE
 StringRef getHwreg(unsigned Id, const MCSubtargetInfo &STI);
 
-void decodeHwreg(unsigned Val, unsigned &Id, unsigned &Offset, unsigned &Width);
-
 } // namespace Hwreg
 
 namespace DepCtr {
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 99f8e8e..f5424cf 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -380,9 +380,9 @@ class VOP_MOVREL<RegisterOperand Src1RC> : VOPProfile<[untyped, i32, untyped, un
   let OutsDPP = (outs Src0RC32:$vdst);
   let InsDPP16 = (ins Src0RC32:$old, Src0RC32:$src0,
                       dpp_ctrl:$dpp_ctrl, DppRowMask:$row_mask,
-                      DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl, DppFI:$fi);
+                      DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl, Dpp16FI:$fi);
   let AsmDPP16 = getAsmDPP16<1, 1, 0>.ret;
-  let InsDPP8 = (ins Src0RC32:$old, Src0RC32:$src0, dpp8:$dpp8, DppFI:$fi);
+  let InsDPP8 = (ins Src0RC32:$old, Src0RC32:$src0, dpp8:$dpp8, Dpp8FI:$fi);
   let AsmDPP8 = getAsmDPP8<1, 1, 0>.ret;
 
   let OutsVOP3DPP = (outs Src0RC64:$vdst);
@@ -749,7 +749,7 @@ class VOP1_DPP16<bits<8> op, VOP1_DPP_Pseudo ps, int subtarget, VOPProfile p = p
 class VOP1_DPP16_Gen<bits<8> op, VOP1_DPP_Pseudo ps, GFXGen Gen, VOPProfile p = ps.Pfl> :
     VOP1_DPP16 <op, ps, Gen.Subtarget, p> {
   let AssemblerPredicate = Gen.AssemblerPredicate;
-  let DecoderNamespace = "DPP"#Gen.DecoderNamespace;
+  let DecoderNamespace = Gen.DecoderNamespace;
 }
 
 class VOP1_DPP8<bits<8> op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl> :
@@ -770,7 +770,7 @@ class VOP1_DPP8<bits<8> op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl> :
 class VOP1_DPP8_Gen<bits<8> op, VOP1_Pseudo ps, GFXGen Gen, VOPProfile p = ps.Pfl> :
     VOP1_DPP8<op, ps, p> {
   let AssemblerPredicate = Gen.AssemblerPredicate;
-  let DecoderNamespace = "DPP8"#Gen.DecoderNamespace;
+  let DecoderNamespace = Gen.DecoderNamespace;
 }
 
 //===----------------------------------------------------------------------===//
@@ -816,7 +816,7 @@ multiclass VOP1_Real_dpp_with_name<GFXGen Gen, bits<9> op, string opName,
                                    string asmName> {
   defvar ps = !cast<VOP1_Pseudo>(opName#"_e32");
   let AsmString = asmName # ps.Pfl.AsmDPP16,
-      DecoderNamespace = "DPP" # Gen.DecoderNamespace #
+      DecoderNamespace = Gen.DecoderNamespace #
                          !if(ps.Pfl.IsRealTrue16, "", "_FAKE16") in {
     defm NAME : VOP1_Real_dpp<Gen, op, opName>;
   }
@@ -831,7 +831,7 @@ multiclass VOP1_Real_dpp8_with_name<GFXGen Gen, bits<9> op, string opName,
                                     string asmName> {
   defvar ps = !cast<VOP1_Pseudo>(opName#"_e32");
   let AsmString = asmName # ps.Pfl.AsmDPP8,
-      DecoderNamespace = "DPP8" # Gen.DecoderNamespace #
+      DecoderNamespace = Gen.DecoderNamespace #
                          !if(ps.Pfl.IsRealTrue16, "", "_FAKE16") in {
     defm NAME : VOP1_Real_dpp8<Gen, op, opName>;
   }
@@ -994,9 +994,7 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in {
   }
   multiclass VOP1_Real_dpp8_gfx10<bits<9> op> {
     if !cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExt32BitDPP then
-    def _dpp8_gfx10 : VOP1_DPP8<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32")> {
-      let DecoderNamespace = "DPP8";
-    }
+    def _dpp8_gfx10 : VOP1_DPP8<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32")>;
   }
 } // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10"
 
@@ -1192,16 +1190,14 @@ class VOP1_DPPe <bits<8> op, VOP1_DPP_Pseudo ps, VOPProfile P = ps.Pfl> :
   let Inst{31-25} = 0x3f; //encoding
 }
 
-multiclass VOP1Only_Real_vi <bits<10> op> {
-  let AssemblerPredicate = isGFX8GFX9, DecoderNamespace = "GFX8" in {
+let AssemblerPredicate = isGFX8GFX9, DecoderNamespace = "GFX8" in {
+  multiclass VOP1Only_Real_vi <bits<10> op> {
     def _vi :
       VOP1_Real<!cast<VOP1_Pseudo>(NAME), SIEncodingFamily.VI>,
       VOP1e<op{7-0}, !cast<VOP1_Pseudo>(NAME).Pfl>;
   }
-}
 
-multiclass VOP1_Real_e32e64_vi <bits<10> op> {
-  let AssemblerPredicate = isGFX8GFX9, DecoderNamespace = "GFX8" in {
+  multiclass VOP1_Real_e32e64_vi <bits<10> op> {
     def _e32_vi :
       VOP1_Real<!cast<VOP1_Pseudo>(NAME#"_e32"), SIEncodingFamily.VI>,
       VOP1e<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32").Pfl>;
@@ -1389,44 +1385,41 @@ def : GCNPat <
 // GFX9
 //===----------------------------------------------------------------------===//
 
-multiclass VOP1_Real_gfx9 <bits<10> op> {
-  let AssemblerPredicate = isGFX9Only, DecoderNamespace = "GFX9" in {
+let AssemblerPredicate = isGFX9Only, DecoderNamespace = "GFX9" in {
+  multiclass VOP1_Real_gfx9 <bits<10> op> {
     defm NAME : VOP1_Real_e32e64_vi <op>;
-  }
-
-  if !cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9 then
-  def _sdwa_gfx9 :
-    VOP_SDWA9_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>,
-    VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
-
-  if !cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtDPP then
-    def _dpp_gfx9 :
-      VOP_DPP_Real<!cast<VOP1_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX9>,
-      VOP1_DPPe<op{7-0}, !cast<VOP1_DPP_Pseudo>(NAME#"_dpp")>;
-
-}
 
-multiclass VOP1_Real_NoDstSel_SDWA_gfx9 <bits<10> op> {
-  let AssemblerPredicate = isGFX9Only, DecoderNamespace = "GFX9" in {
-    defm NAME : VOP1_Real_e32e64_vi <op>;
+    if !cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9 then
+    def _sdwa_gfx9 :
+      VOP_SDWA9_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>,
+      VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
+
+    if !cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtDPP then
+      def _dpp_gfx9 :
+        VOP_DPP_Real<!cast<VOP1_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX9>,
+        VOP1_DPPe<op{7-0}, !cast<VOP1_DPP_Pseudo>(NAME#"_dpp")>;
   }
 
-  if !cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9 then
-  def _sdwa_gfx9 :
-    VOP_SDWA9_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>,
-    VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl> {
-      let Inst{42-40} = 6;
-    }
+  multiclass VOP1_Real_NoDstSel_SDWA_gfx9 <bits<10> op> {
+    defm NAME : VOP1_Real_e32e64_vi <op>;
 
-  if !cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtDPP then
-    def _dpp_gfx9 :
-      VOP_DPP_Real<!cast<VOP1_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX9>,
-      VOP1_DPPe<op{7-0}, !cast<VOP1_DPP_Pseudo>(NAME#"_dpp")>;
+    if !cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9 then
+    def _sdwa_gfx9 :
+      VOP_SDWA9_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>,
+      VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl> {
+        let Inst{42-40} = 6;
+      }
+
+    if !cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtDPP then
+      def _dpp_gfx9 :
+        VOP_DPP_Real<!cast<VOP1_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX9>,
+        VOP1_DPPe<op{7-0}, !cast<VOP1_DPP_Pseudo>(NAME#"_dpp")>;
+  }
 }
 
 defm V_SCREEN_PARTITION_4SE_B32 : VOP1_Real_gfx9 <0x37>;
 
-let AssemblerPredicate = isGFX940Plus, DecoderNamespace = "GFX9" in
+let AssemblerPredicate = isGFX940Plus in
 defm V_MOV_B64 : VOP1_Real_gfx9 <0x38>;
 
 let OtherPredicates = [HasFP8ConversionInsts] in {
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index 4437d5f..13fe79b 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -430,7 +430,7 @@ class VOP_MAC <ValueType vt0, ValueType vt1=vt0> : VOPProfile <[vt0, vt1, vt1, v
                     getVregSrcForVT<Src2VT>.ret:$src2, // stub argument
                     dpp_ctrl:$dpp_ctrl, DppRowMask:$row_mask,
                     DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl);
-  let InsDPP16 = !con(InsDPP, (ins DppFI:$fi));
+  let InsDPP16 = !con(InsDPP, (ins Dpp16FI:$fi));
   let InsVOP3Base = getInsVOP3Base<Src0VOP3DPP, Src1VOP3DPP, RegisterOperand<VGPR_32>, 3,
                        0, HasModifiers, HasModifiers, HasOMod,
                        Src0ModVOP3DPP, Src1ModVOP3DPP, Src2Mod, HasOpSel>.ret;
@@ -447,7 +447,7 @@ class VOP_MAC <ValueType vt0, ValueType vt1=vt0> : VOPProfile <[vt0, vt1, vt1, v
   let InsDPP8 = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0,
                      Src1ModDPP:$src1_modifiers, Src1DPP:$src1,
                      getVregSrcForVT<Src2VT>.ret:$src2, // stub argument
-                     dpp8:$dpp8, DppFI:$fi);
+                     dpp8:$dpp8, Dpp8FI:$fi);
   let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0,
                      Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1,
                      getVregSrcForVT<Src2VT>.ret:$src2, // stub argument
@@ -500,7 +500,7 @@ def VOP_MAC_F16_t16 : VOP_MAC <f16> {
   let InsDPP8 = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0,
                      Src1ModDPP:$src1_modifiers, Src1DPP:$src1,
                      getVregSrcForVT<Src2VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret:$src2, // stub argument
-                     dpp8:$dpp8, DppFI:$fi);
+                     dpp8:$dpp8, Dpp8FI:$fi);
   let Src2Mod = FP32InputMods; // dummy unused modifiers
   let Src2RC64 = VGPRSrc_32;   // stub argument
 }
@@ -552,11 +552,11 @@ def VOP2b_I32_I1_I32_I32 : VOPProfile<[i32, i32, i32, untyped], /*EnableClamp=*/
                     Src1DPP:$src1,
                     dpp_ctrl:$dpp_ctrl, DppRowMask:$row_mask,
                     DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl);
-  let InsDPP16 = !con(InsDPP, (ins DppFI:$fi));
+  let InsDPP16 = !con(InsDPP, (ins Dpp16FI:$fi));
   let InsDPP8 = (ins DstRCDPP:$old,
                     Src0DPP:$src0,
                     Src1DPP:$src1,
-                    dpp8:$dpp8, DppFI:$fi);
+                    dpp8:$dpp8, Dpp8FI:$fi);
   let Outs32 = (outs DstRC:$vdst);
   let Outs64 = (outs DstRC:$vdst, VOPDstS64orS32:$sdst);
   let OutsVOP3DPP = Outs64;
@@ -594,11 +594,11 @@ def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1], /*EnableClamp=*/1>
                     Src1DPP:$src1,
                     dpp_ctrl:$dpp_ctrl, DppRowMask:$row_mask,
                     DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl);
-  let InsDPP16 = !con(InsDPP, (ins DppFI:$fi));
+  let InsDPP16 = !con(InsDPP, (ins Dpp16FI:$fi));
   let InsDPP8 = (ins DstRCDPP:$old,
                      Src0DPP:$src0,
                      Src1DPP:$src1,
-                     dpp8:$dpp8, DppFI:$fi);
+                     dpp8:$dpp8, Dpp8FI:$fi);
 
   let HasExt = 1;
   let HasExtDPP = 1;
@@ -645,11 +645,11 @@ class VOP2e_SGPR<list<ValueType> ArgVT> : VOPProfile<ArgVT> {
                     FPVRegInputMods:$src1_modifiers, Src1DPP:$src1,
                     dpp_ctrl:$dpp_ctrl, DppRowMask:$row_mask,
                     DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl);
-  let InsDPP16 = !con(InsDPP, (ins DppFI:$fi));
+  let InsDPP16 = !con(InsDPP, (ins Dpp16FI:$fi));
   let InsDPP8 = (ins DstRCDPP:$old,
                      FPVRegInputMods:$src0_modifiers, Src0DPP:$src0,
                      FPVRegInputMods:$src1_modifiers, Src1DPP:$src1,
-                     dpp8:$dpp8, DppFI:$fi);
+                     dpp8:$dpp8, Dpp8FI:$fi);
 
   let Src0ModVOP3DPP = FPVRegInputMods;
   let Src1ModVOP3DPP = FPVRegInputMods;
@@ -1273,7 +1273,7 @@ class VOP2_DPP16_Gen<bits<6> op, VOP2_DPP_Pseudo ps, GFXGen Gen,
     VOP2_DPP16<op, ps, Gen.Subtarget, opName, p> {
   let AssemblerPredicate = Gen.AssemblerPredicate;
   let OtherPredicates = !if(ps.Pfl.IsRealTrue16, [UseRealTrue16Insts], []);
-  let DecoderNamespace = "DPP"#Gen.DecoderNamespace#
+  let DecoderNamespace = Gen.DecoderNamespace#
                          !if(ps.Pfl.IsRealTrue16, "", "_FAKE16");
 }
 
@@ -1302,7 +1302,7 @@ class VOP2_DPP8_Gen<bits<6> op, VOP2_Pseudo ps, GFXGen Gen,
     VOP2_DPP8<op, ps, p> {
   let AssemblerPredicate = Gen.AssemblerPredicate;
   let OtherPredicates = !if(ps.Pfl.IsRealTrue16, [UseRealTrue16Insts], []);
-  let DecoderNamespace = "DPP8"#Gen.DecoderNamespace#
+  let DecoderNamespace = Gen.DecoderNamespace#
                          !if(ps.Pfl.IsRealTrue16, "", "_FAKE16");
 }
 
@@ -1748,9 +1748,7 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in {
   }
   multiclass VOP2_Real_dpp8_gfx10<bits<6> op> {
     if !cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExt32BitDPP then
-    def _dpp8_gfx10 : VOP2_DPP8<op, !cast<VOP2_Pseudo>(NAME#"_e32")> {
-      let DecoderNamespace = "DPP8";
-    }
+    def _dpp8_gfx10 : VOP2_DPP8<op, !cast<VOP2_Pseudo>(NAME#"_e32")>;
   }
 
   //===------------------------- VOP2 (with name) -------------------------===//
@@ -1797,7 +1795,6 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in {
     def _dpp8_gfx10 : VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32")> {
       VOP2_Pseudo ps = !cast<VOP2_Pseudo>(opName#"_e32");
       let AsmString = asmName # ps.Pfl.AsmDPP8;
-      let DecoderNamespace = "DPP8";
     }
   }
 
@@ -1876,7 +1873,6 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in {
       VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32")> {
         string AsmDPP8 = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP8;
         let AsmString = asmName # !subst(", vcc", "", AsmDPP8);
-        let DecoderNamespace = "DPP8";
       }
     if !cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExt32BitDPP then
     def _dpp8_w32_gfx10 :
@@ -2231,7 +2227,7 @@ multiclass VOP2_SDWA9_Real <bits<6> op> {
     VOP2_SDWA9Ae <op{5-0}, !cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
 }
 
-let AssemblerPredicate = isGFX8Only in {
+let AssemblerPredicate = isGFX8Only, DecoderNamespace = "GFX8" in {
 
 multiclass VOP2be_Real_e32e64_vi_only <bits<6> op, string OpName, string AsmName> {
   def _e32_vi :
@@ -2239,14 +2235,12 @@ multiclass VOP2be_Real_e32e64_vi_only <bits<6> op, string OpName, string AsmName
     VOP2e<op{5-0}, !cast<VOP2_Pseudo>(OpName#"_e32").Pfl> {
       VOP2_Pseudo ps = !cast<VOP2_Pseudo>(OpName#"_e32");
       let AsmString = AsmName # ps.AsmOperands;
-      let DecoderNamespace = "GFX8";
     }
   def _e64_vi :
     VOP3_Real<!cast<VOP3_Pseudo>(OpName#"_e64"), SIEncodingFamily.VI>,
     VOP3be_vi <{0, 1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(OpName#"_e64").Pfl> {
       VOP3_Pseudo ps = !cast<VOP3_Pseudo>(OpName#"_e64");
       let AsmString = AsmName # ps.AsmOperands;
-      let DecoderNamespace = "GFX8";
     }
   if !cast<VOP2_Pseudo>(OpName#"_e32").Pfl.HasExtSDWA then
     def _sdwa_vi :
@@ -2263,9 +2257,10 @@ multiclass VOP2be_Real_e32e64_vi_only <bits<6> op, string OpName, string AsmName
         let AsmString = AsmName # ps.AsmOperands;
       }
 }
-}
 
-let AssemblerPredicate = isGFX9Only in {
+} // End AssemblerPredicate = isGFX8Only, DecoderNamespace = "GFX8"
+
+let AssemblerPredicate = isGFX9Only, DecoderNamespace = "GFX9" in {
 
 multiclass VOP2be_Real_e32e64_gfx9 <bits<6> op, string OpName, string AsmName> {
   def _e32_gfx9 :
@@ -2273,14 +2268,12 @@ multiclass VOP2be_Real_e32e64_gfx9 <bits<6> op, string OpName, string AsmName> {
     VOP2e<op{5-0}, !cast<VOP2_Pseudo>(OpName#"_e32").Pfl> {
       VOP2_Pseudo ps = !cast<VOP2_Pseudo>(OpName#"_e32");
       let AsmString = AsmName # ps.AsmOperands;
-      let DecoderNamespace = "GFX9";
     }
   def _e64_gfx9 :
     VOP3_Real<!cast<VOP3_Pseudo>(OpName#"_e64"), SIEncodingFamily.GFX9>,
     VOP3be_vi <{0, 1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(OpName#"_e64").Pfl> {
       VOP3_Pseudo ps = !cast<VOP3_Pseudo>(OpName#"_e64");
       let AsmString = AsmName # ps.AsmOperands;
-      let DecoderNamespace = "GFX9";
     }
   if !cast<VOP2_Pseudo>(OpName#"_e32").Pfl.HasExtSDWA9 then
     def _sdwa_gfx9 :
@@ -2295,21 +2288,16 @@ multiclass VOP2be_Real_e32e64_gfx9 <bits<6> op, string OpName, string AsmName> {
       VOP2_DPPe<op, !cast<VOP2_DPP_Pseudo>(OpName#"_dpp")> {
         VOP2_DPP_Pseudo ps = !cast<VOP2_DPP_Pseudo>(OpName#"_dpp");
         let AsmString = AsmName # ps.AsmOperands;
-        let DecoderNamespace = "GFX9";
       }
 }
 
 multiclass VOP2_Real_e32e64_gfx9 <bits<6> op> {
   def _e32_gfx9 :
     VOP2_Real<!cast<VOP2_Pseudo>(NAME#"_e32"), SIEncodingFamily.GFX9>,
-    VOP2e<op{5-0}, !cast<VOP2_Pseudo>(NAME#"_e32").Pfl>{
-      let DecoderNamespace = "GFX9";
-    }
+    VOP2e<op{5-0}, !cast<VOP2_Pseudo>(NAME#"_e32").Pfl>;
   def _e64_gfx9 :
     VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.GFX9>,
-    VOP3e_vi <{0, 1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> {
-      let DecoderNamespace = "GFX9";
-    }
+    VOP3e_vi <{0, 1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
   if !cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9 then
     def _sdwa_gfx9 :
       VOP_SDWA9_Real <!cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa")>,
@@ -2318,12 +2306,10 @@ multiclass VOP2_Real_e32e64_gfx9 <bits<6> op> {
   if !cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP then
     def _dpp_gfx9 :
       VOP_DPP_Real<!cast<VOP2_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX9>,
-      VOP2_DPPe<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp")> {
-        let DecoderNamespace = "GFX9";
-      }
+      VOP2_DPPe<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp")>;
 }
 
-} // AssemblerPredicate = isGFX9Only
+} // End AssemblerPredicate = isGFX9Only, DecoderNamespace = "GFX9"
 
 multiclass VOP2_Real_e32e64_vi <bits<6> op> :
   Base_VOP2_Real_e32e64_vi<op>, VOP2_SDWA_Real<op>, VOP2_SDWA9_Real<op> {
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 396ae9c..7198a40 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -532,11 +532,11 @@ def VOP3_CVT_PK_F8_F32_Profile : VOP3_Profile<VOP_I32_F32_F32, VOP3_OPSEL> {
                           FP32InputMods:$src1_modifiers, Src1VOP3DPP:$src1,
                           VGPR_32:$vdst_in, op_sel0:$op_sel,
                           dpp_ctrl:$dpp_ctrl, DppRowMask:$row_mask,
-                          DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl, DppFI:$fi);
+                          DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl, Dpp16FI:$fi);
   let InsVOP3DPP8 = (ins VGPR_32:$old,
                          FP32InputMods:$src0_modifiers, Src0VOP3DPP:$src0,
                          FP32InputMods:$src1_modifiers, Src1VOP3DPP:$src1,
-                         VGPR_32:$vdst_in, op_sel0:$op_sel, dpp8:$dpp8, DppFI:$fi);
+                         VGPR_32:$vdst_in, op_sel0:$op_sel, dpp8:$dpp8, Dpp8FI:$fi);
 
   let HasClamp = 0;
   let HasExtVOP3DPP = 1;
@@ -553,12 +553,12 @@ def VOP3_CVT_SR_F8_F32_Profile : VOP3_Profile<VOPProfile<[i32, f32, i32, f32]>,
                           FP32InputMods:$src1_modifiers, Src1VOP3DPP:$src1,
                           FP32InputMods:$src2_modifiers, VGPR_32:$src2,
                           op_sel0:$op_sel, dpp_ctrl:$dpp_ctrl, DppRowMask:$row_mask,
-                          DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl, DppFI:$fi);
+                          DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl, Dpp16FI:$fi);
   let InsVOP3DPP8 = (ins VGPR_32:$old,
                          FP32InputMods:$src0_modifiers, Src0VOP3DPP:$src0,
                          FP32InputMods:$src1_modifiers, Src1VOP3DPP:$src1,
                          FP32InputMods:$src2_modifiers, VGPR_32:$src2,
-                         op_sel0:$op_sel, dpp8:$dpp8, DppFI:$fi);
+                         op_sel0:$op_sel, dpp8:$dpp8, Dpp8FI:$fi);
   let HasClamp = 0;
   let HasSrc2 = 0;
   let HasSrc2Mods = 1;
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 74f451b..ac3c8f9 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -461,13 +461,13 @@ def VOP3P_DOTF8_Profile : VOP3P_Profile<VOPProfile <[f32, i32, i32, f32]>,
 
   let InsVOP3DPP8 = (ins DstRC:$old, VGPR_32:$src0, VRegSrc_32:$src1,
                          PackedF16InputMods:$src2_modifiers, VRegSrc_32:$src2,
-                         neg_lo0:$neg_lo, neg_hi0:$neg_hi, dpp8:$dpp8, DppFI:$fi);
+                         neg_lo0:$neg_lo, neg_hi0:$neg_hi, dpp8:$dpp8, Dpp8FI:$fi);
 
   let InsVOP3DPP16 = (ins DstRC:$old, VGPR_32:$src0, VRegSrc_32:$src1,
                           PackedF16InputMods:$src2_modifiers, VRegSrc_32:$src2,
                           neg_lo0:$neg_lo, neg_hi0:$neg_hi, dpp_ctrl:$dpp_ctrl,
                           DppRowMask:$row_mask, DppBankMask:$bank_mask,
-                          DppBoundCtrl:$bound_ctrl, DppFI:$fi);
+                          DppBoundCtrl:$bound_ctrl, Dpp16FI:$fi);
 }
 
 multiclass VOP3PDOTF8Inst <string OpName, SDPatternOperator intrinsic_node> {
@@ -1353,6 +1353,7 @@ class VOP3P_DPP16<bits<7> op, VOP_DPP_Pseudo ps, int subtarget,
   let AssemblerPredicate = HasDPP16;
   let SubtargetPredicate = HasDPP16;
   let OtherPredicates = ps.OtherPredicates;
+  let IsPacked = ps.IsPacked;
 }
 
 class VOP3P_DPP8_Base<bits<7> op, VOP_Pseudo ps, string opName = ps.OpName>
@@ -1362,6 +1363,7 @@ class VOP3P_DPP8_Base<bits<7> op, VOP_Pseudo ps, string opName = ps.OpName>
   let SchedRW = ps.SchedRW;
   let Uses = ps.Uses;
   let OtherPredicates = ps.OtherPredicates;
+  let IsPacked = ps.IsPacked;
 }
 
 //===----------------------------------------------------------------------===//
@@ -1486,7 +1488,7 @@ multiclass VOP3P_Real_dpp<GFXGen Gen, bits<7> op, string backing_ps_name = NAME,
       : VOP3P_DPP16<op, !cast<VOP_DPP_Pseudo>(backing_ps_name #"_dpp"),
                     Gen.Subtarget> {
     let AsmString = asmName #ps.Pfl.AsmVOP3DPP16;
-    let DecoderNamespace = "DPP"#Gen.DecoderNamespace;
+    let DecoderNamespace = Gen.DecoderNamespace;
     let AssemblerPredicate = Gen.AssemblerPredicate;
   }
 }
@@ -1496,7 +1498,7 @@ multiclass VOP3P_Real_dpp8<GFXGen Gen, bits<7> op, string backing_ps_name = NAME
   defvar ps = !cast<VOP3P_Pseudo>(backing_ps_name);
   def _dpp8#Gen.Suffix : VOP3P_DPP8_Base<op, ps> {
     let AsmString = asmName #ps.Pfl.AsmVOP3DPP8;
-    let DecoderNamespace = "DPP8"#Gen.DecoderNamespace;
+    let DecoderNamespace = Gen.DecoderNamespace;
     let AssemblerPredicate = Gen.AssemblerPredicate;
   }
 }
@@ -1613,7 +1615,7 @@ multiclass VOP3P_Real_MFMA_gfx940_aliases<string NameFrom, string NameTo, string
 multiclass VOP3P_Real_MFMA_gfx940<bits<7> op, string Name = !cast<VOP3_Pseudo>(NAME#"_e64").Mnemonic,
                                   VOP3_Pseudo PS_ACD = !cast<VOP3_Pseudo>(NAME # "_e64"),
                                   VOP3_Pseudo PS_VCD = !cast<VOP3_Pseudo>(NAME # "_vgprcd" # "_e64")> {
-  let SubtargetPredicate = isGFX940Plus,
+  let AssemblerPredicate = isGFX940Plus,
       DecoderNamespace = "GFX940",
       AsmString = Name # PS_ACD.AsmOperands, Constraints = "" in {
   def _gfx940_acd : VOP3P_Real<PS_ACD, SIEncodingFamily.GFX940>,
diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
index fe52a0e..e5e8244 100644
--- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
@@ -222,6 +222,8 @@ class VOPCInstAlias <VOP3_Pseudo ps, Instruction inst,
 
   let AsmVariantName = AMDGPUAsmVariants.Default;
   let SubtargetPredicate = AssemblerPredicate;
+
+  string DecoderNamespace; // dummy
 }
 
 multiclass VOPCInstAliases <string old_name, string Arch, string real_name = old_name, string mnemonic_from = real_name> {
@@ -766,7 +768,7 @@ class VOPC_Class_Profile<list<SchedReadWrite> sched, ValueType src0VT, ValueType
   let AsmDPP = "$src0_modifiers, $src1 $dpp_ctrl$row_mask$bank_mask$bound_ctrl";
   let AsmDPP16 = AsmDPP#"$fi";
     let InsDPP = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0, Src1DPP:$src1, dpp_ctrl:$dpp_ctrl, DppRowMask:$row_mask, DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl);
-  let InsDPP16 = !con(InsDPP, (ins DppFI:$fi));
+  let InsDPP16 = !con(InsDPP, (ins Dpp16FI:$fi));
   // DPP8 forbids modifiers and can inherit from VOPC_Profile
 
   let Ins64 = (ins Src0Mod:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1);
@@ -1331,196 +1333,176 @@ class VOPC64_DPP8_NoDst<bits<10> op, VOP_Pseudo ps, string opName = ps.OpName>
 //===----------------------------------------------------------------------===//
 
 multiclass VOPC_Real_Base<GFXGen Gen, bits<9> op> {
-  let AssemblerPredicate = Gen.AssemblerPredicate in {
+  let AssemblerPredicate = Gen.AssemblerPredicate, DecoderNamespace = Gen.DecoderNamespace in {
     defvar ps32 = !cast<VOPC_Pseudo>(NAME#"_e32");
     defvar ps64 = !cast<VOP3_Pseudo>(NAME#"_e64");
-    let DecoderNamespace = Gen.DecoderNamespace in {
-      def _e32#Gen.Suffix : VOPC_Real<ps32, Gen.Subtarget>,
-                            VOPCe<op{7-0}>;
-      def _e64#Gen.Suffix : VOP3_Real<ps64, Gen.Subtarget>,
-                            VOP3a_gfx11_gfx12<{0, op}, ps64.Pfl> {
-        // Encoding used for VOPC instructions encoded as VOP3 differs from
-        // VOP3e by destination name (sdst) as VOPC doesn't have vector dst.
-        bits<8> sdst;
-        let Inst{7-0} = sdst;
-      }
-    } // End DecoderNamespace = Gen.DecoderNamespace
+    def _e32#Gen.Suffix : VOPC_Real<ps32, Gen.Subtarget>,
+                          VOPCe<op{7-0}>;
+    def _e64#Gen.Suffix : VOP3_Real<ps64, Gen.Subtarget>,
+                          VOP3a_gfx11_gfx12<{0, op}, ps64.Pfl> {
+      // Encoding used for VOPC instructions encoded as VOP3 differs from
+      // VOP3e by destination name (sdst) as VOPC doesn't have vector dst.
+      bits<8> sdst;
+      let Inst{7-0} = sdst;
+    }
 
     defm : VOPCInstAliases<NAME, !substr(Gen.Suffix,1)>;
 
     if ps32.Pfl.HasExtDPP then {
       defvar psDPP = !cast<VOP_DPP_Pseudo>(NAME #"_e32" #"_dpp");
       defvar AsmDPP = ps32.Pfl.AsmDPP16;
-      let DecoderNamespace = "DPP"#Gen.DecoderNamespace in {
-        def _e32_dpp#Gen.Suffix : VOPC_DPP16_SIMC<op{7-0}, psDPP, Gen.Subtarget>;
-        def _e32_dpp_w32#Gen.Suffix : VOPC_DPP16<op{7-0}, psDPP> {
-          let AsmString = psDPP.OpName # " vcc_lo, " # AsmDPP;
-          let isAsmParserOnly = 1;
-          let WaveSizePredicate = isWave32;
-        }
-        def _e32_dpp_w64#Gen.Suffix : VOPC_DPP16<op{7-0}, psDPP> {
-          let AsmString = psDPP.OpName # " vcc, " # AsmDPP;
-          let isAsmParserOnly = 1;
-          let WaveSizePredicate = isWave64;
-        }
+      def _e32_dpp#Gen.Suffix : VOPC_DPP16_SIMC<op{7-0}, psDPP, Gen.Subtarget>;
+      def _e32_dpp_w32#Gen.Suffix : VOPC_DPP16<op{7-0}, psDPP> {
+        let AsmString = psDPP.OpName # " vcc_lo, " # AsmDPP;
+        let isAsmParserOnly = 1;
+        let WaveSizePredicate = isWave32;
+      }
+      def _e32_dpp_w64#Gen.Suffix : VOPC_DPP16<op{7-0}, psDPP> {
+        let AsmString = psDPP.OpName # " vcc, " # AsmDPP;
+        let isAsmParserOnly = 1;
+        let WaveSizePredicate = isWave64;
       }
       defvar AsmDPP8 = ps32.Pfl.AsmDPP8;
-      let DecoderNamespace = "DPP8"#Gen.DecoderNamespace in {
-        def _e32_dpp8#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32>;
-        def _e32_dpp8_w32#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32> {
-          let AsmString = ps32.OpName # " vcc_lo, " # AsmDPP8;
-          let isAsmParserOnly = 1;
-          let WaveSizePredicate = isWave32;
-        }
-        def _e32_dpp8_w64#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32> {
-          let AsmString = ps32.OpName # " vcc, " # AsmDPP8;
-          let isAsmParserOnly = 1;
-          let WaveSizePredicate = isWave64;
-        }
+      def _e32_dpp8#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32>;
+      def _e32_dpp8_w32#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32> {
+        let AsmString = ps32.OpName # " vcc_lo, " # AsmDPP8;
+        let isAsmParserOnly = 1;
+        let WaveSizePredicate = isWave32;
+      }
+      def _e32_dpp8_w64#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32> {
+        let AsmString = ps32.OpName # " vcc, " # AsmDPP8;
+        let isAsmParserOnly = 1;
+        let WaveSizePredicate = isWave64;
       }
     }
     if ps64.Pfl.HasExtVOP3DPP then {
       defvar psDPP = !cast<VOP_DPP_Pseudo>(NAME #"_e64" #"_dpp");
       defvar AsmDPP = ps64.Pfl.AsmVOP3DPP16;
-      let DecoderNamespace = "DPP"#Gen.DecoderNamespace in {
-        def _e64_dpp#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP>,
-                                  SIMCInstr<psDPP.PseudoInstr, Gen.Subtarget>;
-        def _e64_dpp_w32#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP> {
-          let AsmString = psDPP.OpName # " vcc_lo, " # AsmDPP;
-          let isAsmParserOnly = 1;
-          let WaveSizePredicate = isWave32;
-        }
-        def _e64_dpp_w64#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP> {
-          let AsmString = psDPP.OpName # " vcc, " # AsmDPP;
-          let isAsmParserOnly = 1;
-          let WaveSizePredicate = isWave64;
-        }
+      def _e64_dpp#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP>,
+                                SIMCInstr<psDPP.PseudoInstr, Gen.Subtarget>;
+      def _e64_dpp_w32#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP> {
+        let AsmString = psDPP.OpName # " vcc_lo, " # AsmDPP;
+        let isAsmParserOnly = 1;
+        let WaveSizePredicate = isWave32;
+      }
+      def _e64_dpp_w64#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP> {
+        let AsmString = psDPP.OpName # " vcc, " # AsmDPP;
+        let isAsmParserOnly = 1;
+        let WaveSizePredicate = isWave64;
       }
       defvar AsmDPP8 = ps64.Pfl.AsmVOP3DPP8;
-      let DecoderNamespace = "DPP8"#Gen.DecoderNamespace in {
-        def _e64_dpp8#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64>;
-        def _e64_dpp8_w32#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64> {
-          let AsmString = ps32.OpName # " vcc_lo, " # AsmDPP8;
-          let isAsmParserOnly = 1;
-          let WaveSizePredicate = isWave32;
-        }
-        def _e64_dpp8_w64#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64> {
-          let AsmString = ps32.OpName # " vcc, " # AsmDPP8;
-          let isAsmParserOnly = 1;
-          let WaveSizePredicate = isWave64;
-        }
+      def _e64_dpp8#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64>;
+      def _e64_dpp8_w32#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64> {
+        let AsmString = ps32.OpName # " vcc_lo, " # AsmDPP8;
+        let isAsmParserOnly = 1;
+        let WaveSizePredicate = isWave32;
+      }
+      def _e64_dpp8_w64#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64> {
+        let AsmString = ps32.OpName # " vcc, " # AsmDPP8;
+        let isAsmParserOnly = 1;
+        let WaveSizePredicate = isWave64;
       }
     }
-  } // AssemblerPredicate = Gen.AssemblerPredicate
+  } // AssemblerPredicate = Gen.AssemblerPredicate, DecoderNamespace = Gen.DecoderNamespace
 }
 
 multiclass VOPC_Real_with_name<GFXGen Gen, bits<9> op, string OpName,
                                string asm_name, string pseudo_mnemonic = ""> {
-  let AssemblerPredicate = Gen.AssemblerPredicate in {
+  let AssemblerPredicate = Gen.AssemblerPredicate, DecoderNamespace = Gen.DecoderNamespace in {
     defvar ps32 = !cast<VOPC_Pseudo>(OpName#"_e32");
     defvar ps64 = !cast<VOP3_Pseudo>(OpName#"_e64");
-    let DecoderNamespace = Gen.DecoderNamespace in {
-      def _e32#Gen.Suffix :
-        // 32 and 64 bit forms of the instruction have _e32 and _e64
-        // respectively appended to their assembly mnemonic.
-        // _e64 is printed as part of the VOPDstS64orS32 operand, whereas
-        // the destination-less 32bit forms add it to the asmString here.
-        VOPC_Real<ps32, Gen.Subtarget, asm_name#"_e32">,
-        VOPCe<op{7-0}>,
-        MnemonicAlias<!if(!empty(pseudo_mnemonic), ps32.Mnemonic,
-                          pseudo_mnemonic),
-                      asm_name, ps32.AsmVariantName>,
-        Requires<[Gen.AssemblerPredicate]>;
-      def _e64#Gen.Suffix :
-            VOP3_Real<ps64, Gen.Subtarget, asm_name>,
-            VOP3a_gfx11_gfx12<{0, op}, ps64.Pfl>,
-            MnemonicAlias<!if(!empty(pseudo_mnemonic), ps64.Mnemonic,
-                              pseudo_mnemonic),
-                          asm_name, ps64.AsmVariantName>,
-            Requires<[Gen.AssemblerPredicate]> {
-        // Encoding used for VOPC instructions encoded as VOP3 differs from
-        // VOP3e by destination name (sdst) as VOPC doesn't have vector dst.
-        bits<8> sdst;
-        let Inst{7-0} = sdst;
-      }
-    } // End DecoderNamespace = Gen.DecoderNamespace
+    def _e32#Gen.Suffix :
+      // 32 and 64 bit forms of the instruction have _e32 and _e64
+      // respectively appended to their assembly mnemonic.
+      // _e64 is printed as part of the VOPDstS64orS32 operand, whereas
+      // the destination-less 32bit forms add it to the asmString here.
+      VOPC_Real<ps32, Gen.Subtarget, asm_name#"_e32">,
+      VOPCe<op{7-0}>,
+      MnemonicAlias<!if(!empty(pseudo_mnemonic), ps32.Mnemonic,
+                        pseudo_mnemonic),
+                    asm_name, ps32.AsmVariantName>,
+      Requires<[Gen.AssemblerPredicate]>;
+    def _e64#Gen.Suffix :
+          VOP3_Real<ps64, Gen.Subtarget, asm_name>,
+          VOP3a_gfx11_gfx12<{0, op}, ps64.Pfl>,
+          MnemonicAlias<!if(!empty(pseudo_mnemonic), ps64.Mnemonic,
+                            pseudo_mnemonic),
+                        asm_name, ps64.AsmVariantName>,
+          Requires<[Gen.AssemblerPredicate]> {
+      // Encoding used for VOPC instructions encoded as VOP3 differs from
+      // VOP3e by destination name (sdst) as VOPC doesn't have vector dst.
+      bits<8> sdst;
+      let Inst{7-0} = sdst;
+    }
 
     defm : VOPCInstAliases<OpName, !substr(Gen.Suffix, 1), NAME, asm_name>;
 
     if ps32.Pfl.HasExtDPP then {
       defvar psDPP = !cast<VOP_DPP_Pseudo>(OpName #"_e32" #"_dpp");
       defvar AsmDPP = ps32.Pfl.AsmDPP16;
-      let DecoderNamespace = "DPP"#Gen.DecoderNamespace in {
-        def _e32_dpp#Gen.Suffix : VOPC_DPP16_SIMC<op{7-0}, psDPP,
-                                                  Gen.Subtarget, asm_name>;
-        def _e32_dpp_w32#Gen.Suffix
-            : VOPC_DPP16<op{7-0}, psDPP, asm_name> {
-          let AsmString = asm_name # " vcc_lo, " # AsmDPP;
-          let isAsmParserOnly = 1;
-          let WaveSizePredicate = isWave32;
-        }
-        def _e32_dpp_w64#Gen.Suffix
-            : VOPC_DPP16<op{7-0}, psDPP, asm_name> {
-          let AsmString = asm_name # " vcc, " # AsmDPP;
-          let isAsmParserOnly = 1;
-          let WaveSizePredicate = isWave64;
-        }
+      def _e32_dpp#Gen.Suffix : VOPC_DPP16_SIMC<op{7-0}, psDPP,
+                                                Gen.Subtarget, asm_name>;
+      def _e32_dpp_w32#Gen.Suffix
+          : VOPC_DPP16<op{7-0}, psDPP, asm_name> {
+        let AsmString = asm_name # " vcc_lo, " # AsmDPP;
+        let isAsmParserOnly = 1;
+        let WaveSizePredicate = isWave32;
+      }
+      def _e32_dpp_w64#Gen.Suffix
+          : VOPC_DPP16<op{7-0}, psDPP, asm_name> {
+        let AsmString = asm_name # " vcc, " # AsmDPP;
+        let isAsmParserOnly = 1;
+        let WaveSizePredicate = isWave64;
       }
       defvar AsmDPP8 = ps32.Pfl.AsmDPP8;
-      let DecoderNamespace = "DPP8"#Gen.DecoderNamespace in {
-        def _e32_dpp8#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32, asm_name>;
-        def _e32_dpp8_w32#Gen.Suffix
-            : VOPC_DPP8<op{7-0}, ps32, asm_name> {
-          let AsmString = asm_name # " vcc_lo, " # AsmDPP8;
-          let isAsmParserOnly = 1;
-          let WaveSizePredicate = isWave32;
-        }
-        def _e32_dpp8_w64#Gen.Suffix
-            : VOPC_DPP8<op{7-0}, ps32, asm_name> {
-          let AsmString = asm_name # " vcc, " # AsmDPP8;
-          let isAsmParserOnly = 1;
-          let WaveSizePredicate = isWave64;
-        }
+      def _e32_dpp8#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32, asm_name>;
+      def _e32_dpp8_w32#Gen.Suffix
+          : VOPC_DPP8<op{7-0}, ps32, asm_name> {
+        let AsmString = asm_name # " vcc_lo, " # AsmDPP8;
+        let isAsmParserOnly = 1;
+        let WaveSizePredicate = isWave32;
+      }
+      def _e32_dpp8_w64#Gen.Suffix
+          : VOPC_DPP8<op{7-0}, ps32, asm_name> {
+        let AsmString = asm_name # " vcc, " # AsmDPP8;
+        let isAsmParserOnly = 1;
+        let WaveSizePredicate = isWave64;
       }
     }
 
     if ps64.Pfl.HasExtVOP3DPP then {
       defvar psDPP = !cast<VOP_DPP_Pseudo>(OpName #"_e64" #"_dpp");
       defvar AsmDPP = ps64.Pfl.AsmVOP3DPP16;
-      let DecoderNamespace = "DPP"#Gen.DecoderNamespace in {
-        def _e64_dpp#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP, asm_name>,
-                                  SIMCInstr<psDPP.PseudoInstr, Gen.Subtarget>;
-        def _e64_dpp_w32#Gen.Suffix
-            : VOPC64_DPP16_Dst<{0, op}, psDPP, asm_name> {
-          let AsmString = asm_name # " vcc_lo, " # AsmDPP;
-          let isAsmParserOnly = 1;
-          let WaveSizePredicate = isWave32;
-        }
-        def _e64_dpp_w64#Gen.Suffix
-            : VOPC64_DPP16_Dst<{0, op}, psDPP, asm_name> {
-          let AsmString = asm_name # " vcc, " # AsmDPP;
-          let isAsmParserOnly = 1;
-          let WaveSizePredicate = isWave64;
-        }
+      def _e64_dpp#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP, asm_name>,
+                                SIMCInstr<psDPP.PseudoInstr, Gen.Subtarget>;
+      def _e64_dpp_w32#Gen.Suffix
+          : VOPC64_DPP16_Dst<{0, op}, psDPP, asm_name> {
+        let AsmString = asm_name # " vcc_lo, " # AsmDPP;
+        let isAsmParserOnly = 1;
+        let WaveSizePredicate = isWave32;
+      }
+      def _e64_dpp_w64#Gen.Suffix
+          : VOPC64_DPP16_Dst<{0, op}, psDPP, asm_name> {
+        let AsmString = asm_name # " vcc, " # AsmDPP;
+        let isAsmParserOnly = 1;
+        let WaveSizePredicate = isWave64;
       }
       defvar AsmDPP8 = ps64.Pfl.AsmVOP3DPP8;
-      let DecoderNamespace = "DPP8"#Gen.DecoderNamespace in {
-        def _e64_dpp8#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64, asm_name>;
-        def _e64_dpp8_w32#Gen.Suffix
-            : VOPC64_DPP8_Dst<{0, op}, ps64, asm_name> {
-          let AsmString = asm_name # " vcc_lo, " # AsmDPP8;
-          let isAsmParserOnly = 1;
-          let WaveSizePredicate = isWave32;
-        }
-        def _e64_dpp8_w64#Gen.Suffix
-            : VOPC64_DPP8_Dst<{0, op}, ps64, asm_name> {
-          let AsmString = asm_name # " vcc, " # AsmDPP8;
-          let isAsmParserOnly = 1;
-          let WaveSizePredicate = isWave64;
-        }
+      def _e64_dpp8#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64, asm_name>;
+      def _e64_dpp8_w32#Gen.Suffix
+          : VOPC64_DPP8_Dst<{0, op}, ps64, asm_name> {
+        let AsmString = asm_name # " vcc_lo, " # AsmDPP8;
+        let isAsmParserOnly = 1;
+        let WaveSizePredicate = isWave32;
+      }
+      def _e64_dpp8_w64#Gen.Suffix
+          : VOPC64_DPP8_Dst<{0, op}, ps64, asm_name> {
+        let AsmString = asm_name # " vcc, " # AsmDPP8;
+        let isAsmParserOnly = 1;
+        let WaveSizePredicate = isWave64;
       }
     }
-  } // AssemblerPredicate = Gen.AssemblerPredicate
+  } // End AssemblerPredicate = Gen.AssemblerPredicate, DecoderNamespace = Gen.DecoderNamespace
 }
 
 multiclass VOPC_Real_t16<GFXGen Gen, bits<9> op, string asm_name,
@@ -1528,123 +1510,103 @@ multiclass VOPC_Real_t16<GFXGen Gen, bits<9> op, string asm_name,
   VOPC_Real_with_name<Gen, op, OpName, asm_name, pseudo_mnemonic>;
 
 multiclass VOPCX_Real<GFXGen Gen, bits<9> op> {
-  let AssemblerPredicate = Gen.AssemblerPredicate in {
+  let AssemblerPredicate = Gen.AssemblerPredicate, DecoderNamespace = Gen.DecoderNamespace in {
     defvar ps32 = !cast<VOPC_Pseudo>(NAME#"_nosdst_e32");
     defvar ps64 = !cast<VOP3_Pseudo>(NAME#"_nosdst_e64");
-    let DecoderNamespace = Gen.DecoderNamespace in {
-      def _e32#Gen.Suffix :
-        VOPC_Real<ps32, Gen.Subtarget>,
-        VOPCe<op{7-0}> {
-          let AsmString = !subst("_nosdst", "", ps32.PseudoInstr)
-                          # " " # ps32.AsmOperands;
-        }
-      def _e64#Gen.Suffix :
-        VOP3_Real<ps64, Gen.Subtarget>,
-        VOP3a_gfx11_gfx12<{0, op}, ps64.Pfl> {
-          let Inst{7-0} = ?; // sdst
-          let AsmString = !subst("_nosdst", "", ps64.Mnemonic)
-                          # "{_e64} " # ps64.AsmOperands;
-        }
-    } // End DecoderNamespace = Gen.DecoderNamespace
+    def _e32#Gen.Suffix :
+      VOPC_Real<ps32, Gen.Subtarget>,
+      VOPCe<op{7-0}> {
+        let AsmString = !subst("_nosdst", "", ps32.PseudoInstr)
+                        # " " # ps32.AsmOperands;
+    }
+    def _e64#Gen.Suffix :
+      VOP3_Real<ps64, Gen.Subtarget>,
+      VOP3a_gfx11_gfx12<{0, op}, ps64.Pfl> {
+        let Inst{7-0} = ?; // sdst
+        let AsmString = !subst("_nosdst", "", ps64.Mnemonic)
+                        # "{_e64} " # ps64.AsmOperands;
+    }
 
     defm : VOPCXInstAliases<NAME, !substr(Gen.Suffix, 1)>;
 
     if ps32.Pfl.HasExtDPP then {
       defvar psDPP = !cast<VOP_DPP_Pseudo>(NAME #"_nosdst_e32" #"_dpp");
       defvar AsmDPP = ps32.Pfl.AsmDPP16;
-      let DecoderNamespace = "DPP"#Gen.DecoderNamespace in {
-        def _e32_dpp#Gen.Suffix
-            : VOPC_DPP16_SIMC<op{7-0}, psDPP, Gen.Subtarget> {
-          let AsmString = !subst("_nosdst", "", psDPP.OpName) # " " # AsmDPP;
-        }
+      def _e32_dpp#Gen.Suffix
+          : VOPC_DPP16_SIMC<op{7-0}, psDPP, Gen.Subtarget> {
+        let AsmString = !subst("_nosdst", "", psDPP.OpName) # " " # AsmDPP;
       }
       defvar AsmDPP8 = ps32.Pfl.AsmDPP8;
-      let DecoderNamespace = "DPP8"#Gen.DecoderNamespace in {
-        def _e32_dpp8#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32> {
-          let AsmString = !subst("_nosdst", "", ps32.OpName) # " " # AsmDPP8;
-        }
+      def _e32_dpp8#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32> {
+        let AsmString = !subst("_nosdst", "", ps32.OpName) # " " # AsmDPP8;
       }
     }
 
     if ps64.Pfl.HasExtVOP3DPP then {
       defvar psDPP = !cast<VOP_DPP_Pseudo>(NAME #"_nosdst_e64" #"_dpp");
       defvar AsmDPP = ps64.Pfl.AsmVOP3DPP16;
-      let DecoderNamespace = "DPP"#Gen.DecoderNamespace in {
-        def _e64_dpp#Gen.Suffix
-            : VOPC64_DPP16_NoDst<{0, op}, psDPP>,
-              SIMCInstr<psDPP.PseudoInstr, Gen.Subtarget> {
-          let AsmString = !subst("_nosdst", "", psDPP.OpName)
-                          # "{_e64_dpp} " # AsmDPP;
-        }
+      def _e64_dpp#Gen.Suffix
+          : VOPC64_DPP16_NoDst<{0, op}, psDPP>,
+            SIMCInstr<psDPP.PseudoInstr, Gen.Subtarget> {
+        let AsmString = !subst("_nosdst", "", psDPP.OpName)
+                        # "{_e64_dpp} " # AsmDPP;
       }
       defvar AsmDPP8 = ps64.Pfl.AsmVOP3DPP8;
-      let DecoderNamespace = "DPP8"#Gen.DecoderNamespace in {
-        def _e64_dpp8#Gen.Suffix : VOPC64_DPP8_NoDst<{0, op}, ps64> {
-          let AsmString = !subst("_nosdst", "", ps64.OpName)
-                          # "{_e64_dpp} " # AsmDPP8;
-        }
+      def _e64_dpp8#Gen.Suffix : VOPC64_DPP8_NoDst<{0, op}, ps64> {
+        let AsmString = !subst("_nosdst", "", ps64.OpName)
+                        # "{_e64_dpp} " # AsmDPP8;
       }
     }
-  } // AssemblerPredicate = Gen.AssemblerPredicate
+  } // End AssemblerPredicate = Gen.AssemblerPredicate, DecoderNamespace = Gen.DecoderNamespace
 }
 
 multiclass VOPCX_Real_with_name<GFXGen Gen, bits<9> op, string OpName,
       string asm_name, string pseudo_mnemonic = ""> {
-  let AssemblerPredicate = Gen.AssemblerPredicate in {
+  let AssemblerPredicate = Gen.AssemblerPredicate, DecoderNamespace = Gen.DecoderNamespace in {
     defvar ps32 = !cast<VOPC_Pseudo>(OpName#"_nosdst_e32");
     defvar ps64 = !cast<VOP3_Pseudo>(OpName#"_nosdst_e64");
-    let DecoderNamespace = Gen.DecoderNamespace in {
-      def _e32#Gen.Suffix
-          : VOPC_Real<ps32, Gen.Subtarget, asm_name>,
-            MnemonicAlias<!if(!empty(pseudo_mnemonic), !subst("_nosdst", "", ps32.Mnemonic),
-                              pseudo_mnemonic),
-                          asm_name, ps32.AsmVariantName>,
-            Requires<[Gen.AssemblerPredicate]>,
-            VOPCe<op{7-0}> {
-        let AsmString = asm_name # "{_e32} " # ps32.AsmOperands;
-      }
-      def _e64#Gen.Suffix
-          : VOP3_Real<ps64, Gen.Subtarget, asm_name>,
-            MnemonicAlias<!if(!empty(pseudo_mnemonic), !subst("_nosdst", "", ps64.Mnemonic),
-                              pseudo_mnemonic),
-                          asm_name, ps64.AsmVariantName>,
-            Requires<[Gen.AssemblerPredicate]>,
-            VOP3a_gfx11_gfx12<{0, op}, ps64.Pfl> {
-        let Inst{7-0} = ? ; // sdst
-        let AsmString = asm_name # "{_e64} " # ps64.AsmOperands;
-      }
-    } // End DecoderNamespace = Gen.DecoderNamespace
+    def _e32#Gen.Suffix
+        : VOPC_Real<ps32, Gen.Subtarget, asm_name>,
+          MnemonicAlias<!if(!empty(pseudo_mnemonic), !subst("_nosdst", "", ps32.Mnemonic),
+                            pseudo_mnemonic),
+                        asm_name, ps32.AsmVariantName>,
+          Requires<[Gen.AssemblerPredicate]>,
+          VOPCe<op{7-0}> {
+      let AsmString = asm_name # "{_e32} " # ps32.AsmOperands;
+    }
+    def _e64#Gen.Suffix
+        : VOP3_Real<ps64, Gen.Subtarget, asm_name>,
+          MnemonicAlias<!if(!empty(pseudo_mnemonic), !subst("_nosdst", "", ps64.Mnemonic),
+                            pseudo_mnemonic),
+                        asm_name, ps64.AsmVariantName>,
+          Requires<[Gen.AssemblerPredicate]>,
+          VOP3a_gfx11_gfx12<{0, op}, ps64.Pfl> {
+      let Inst{7-0} = ? ; // sdst
+      let AsmString = asm_name # "{_e64} " # ps64.AsmOperands;
+    }
 
     defm : VOPCXInstAliases<OpName, !substr(Gen.Suffix, 1), NAME, asm_name>;
 
     if ps32.Pfl.HasExtDPP then {
       defvar psDPP = !cast<VOP_DPP_Pseudo>(OpName#"_nosdst_e32"#"_dpp");
-      let DecoderNamespace = "DPP"#Gen.DecoderNamespace in {
-        def _e32_dpp#Gen.Suffix : VOPC_DPP16_SIMC<op{7-0}, psDPP,
-                                              Gen.Subtarget, asm_name>;
-      }
-      let DecoderNamespace = "DPP8"#Gen.DecoderNamespace in {
-        def _e32_dpp8#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32, asm_name>;
-      }
+      def _e32_dpp#Gen.Suffix : VOPC_DPP16_SIMC<op{7-0}, psDPP,
+                                            Gen.Subtarget, asm_name>;
+      def _e32_dpp8#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32, asm_name>;
     }
     if ps64.Pfl.HasExtVOP3DPP then {
       defvar psDPP = !cast<VOP_DPP_Pseudo>(OpName#"_nosdst_e64"#"_dpp");
       defvar AsmDPP = ps64.Pfl.AsmVOP3DPP16;
-      let DecoderNamespace = "DPP"#Gen.DecoderNamespace in {
-        def _e64_dpp#Gen.Suffix
-            : VOPC64_DPP16_NoDst<{0, op}, psDPP, asm_name>,
-              SIMCInstr<psDPP.PseudoInstr, Gen.Subtarget> {
-          let AsmString = asm_name # "{_e64_dpp} " # AsmDPP;
-        }
+      def _e64_dpp#Gen.Suffix
+          : VOPC64_DPP16_NoDst<{0, op}, psDPP, asm_name>,
+            SIMCInstr<psDPP.PseudoInstr, Gen.Subtarget> {
+        let AsmString = asm_name # "{_e64_dpp} " # AsmDPP;
       }
       defvar AsmDPP8 = ps64.Pfl.AsmVOP3DPP8;
-      let DecoderNamespace = "DPP8"#Gen.DecoderNamespace in {
-        def _e64_dpp8#Gen.Suffix : VOPC64_DPP8_NoDst<{0, op}, ps64, asm_name> {
-          let AsmString = asm_name # "{_e64_dpp} " # AsmDPP8;
-        }
+      def _e64_dpp8#Gen.Suffix : VOPC64_DPP8_NoDst<{0, op}, ps64, asm_name> {
+        let AsmString = asm_name # "{_e64_dpp} " # AsmDPP8;
       }
     }
-  } // AssemblerPredicate = Gen.AssemblerPredicate
+  } // End AssemblerPredicate = Gen.AssemblerPredicate, DecoderNamespace = Gen.DecoderNamespace
 }
 
 multiclass VOPCX_Real_t16<GFXGen Gen, bits<9> op, string asm_name,
@@ -1873,21 +1835,19 @@ defm V_CMPX_CLASS_F64     : VOPCX_Real_gfx11_gfx12<0x0ff>;
 // GFX10.
 //===----------------------------------------------------------------------===//
 
-let AssemblerPredicate = isGFX10Only in {
+let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in {
   multiclass VOPC_Real_gfx10<bits<9> op> {
-    let DecoderNamespace = "GFX10" in {
-      def _e32_gfx10 :
-        VOPC_Real<!cast<VOPC_Pseudo>(NAME#"_e32"), SIEncodingFamily.GFX10>,
-        VOPCe<op{7-0}>;
-      def _e64_gfx10 :
-        VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.GFX10>,
-        VOP3a_gfx10<{0, op}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> {
-        // Encoding used for VOPC instructions encoded as VOP3 differs from
-        // VOP3e by destination name (sdst) as VOPC doesn't have vector dst.
-        bits<8> sdst;
-        let Inst{7-0} = sdst;
-      }
-    } // End DecoderNamespace = "GFX10"
+    def _e32_gfx10 :
+      VOPC_Real<!cast<VOPC_Pseudo>(NAME#"_e32"), SIEncodingFamily.GFX10>,
+      VOPCe<op{7-0}>;
+    def _e64_gfx10 :
+      VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.GFX10>,
+      VOP3a_gfx10<{0, op}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> {
+      // Encoding used for VOPC instructions encoded as VOP3 differs from
+      // VOP3e by destination name (sdst) as VOPC doesn't have vector dst.
+      bits<8> sdst;
+      let Inst{7-0} = sdst;
+    }
 
     if !cast<VOPC_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9 then
     def _sdwa_gfx10 :
@@ -1898,22 +1858,20 @@ let AssemblerPredicate = isGFX10Only in {
   }
 
   multiclass VOPCX_Real_gfx10<bits<9> op> {
-    let DecoderNamespace = "GFX10" in {
-      def _e32_gfx10 :
-        VOPC_Real<!cast<VOPC_Pseudo>(NAME#"_nosdst_e32"), SIEncodingFamily.GFX10>,
-        VOPCe<op{7-0}> {
-          let AsmString = !subst("_nosdst", "", !cast<VOPC_Pseudo>(NAME#"_nosdst_e32").PseudoInstr)
-                          # " " # !cast<VOPC_Pseudo>(NAME#"_nosdst_e32").AsmOperands;
-        }
-
-      def _e64_gfx10 :
-        VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_nosdst_e64"), SIEncodingFamily.GFX10>,
-        VOP3a_gfx10<{0, op}, !cast<VOP3_Pseudo>(NAME#"_nosdst_e64").Pfl> {
-          let Inst{7-0} = ?; // sdst
-          let AsmString = !subst("_nosdst", "", !cast<VOP3_Pseudo>(NAME#"_nosdst_e64").Mnemonic)
-                          # "{_e64} " # !cast<VOP3_Pseudo>(NAME#"_nosdst_e64").AsmOperands;
-        }
-    } // End DecoderNamespace = "GFX10"
+    def _e32_gfx10 :
+      VOPC_Real<!cast<VOPC_Pseudo>(NAME#"_nosdst_e32"), SIEncodingFamily.GFX10>,
+      VOPCe<op{7-0}> {
+        let AsmString = !subst("_nosdst", "", !cast<VOPC_Pseudo>(NAME#"_nosdst_e32").PseudoInstr)
+                        # " " # !cast<VOPC_Pseudo>(NAME#"_nosdst_e32").AsmOperands;
+    }
+
+    def _e64_gfx10 :
+      VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_nosdst_e64"), SIEncodingFamily.GFX10>,
+      VOP3a_gfx10<{0, op}, !cast<VOP3_Pseudo>(NAME#"_nosdst_e64").Pfl> {
+        let Inst{7-0} = ?; // sdst
+        let AsmString = !subst("_nosdst", "", !cast<VOP3_Pseudo>(NAME#"_nosdst_e64").Mnemonic)
+                        # "{_e64} " # !cast<VOP3_Pseudo>(NAME#"_nosdst_e64").AsmOperands;
+    }
 
     if !cast<VOPC_Pseudo>(NAME#"_nosdst_e32").Pfl.HasExtSDWA9 then
     def _sdwa_gfx10 :
@@ -1925,7 +1883,7 @@ let AssemblerPredicate = isGFX10Only in {
 
     defm : VOPCXInstAliases<NAME, "gfx10">;
   }
-} // End AssemblerPredicate = isGFX10Only
+} // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10"
 
 defm V_CMP_LT_I16     : VOPC_Real_gfx10<0x089>;
 defm V_CMP_EQ_I16     : VOPC_Real_gfx10<0x08a>;
@@ -1990,25 +1948,23 @@ defm V_CMPX_TRU_F16   : VOPCX_Real_gfx10<0x0ff>;
 // GFX6, GFX7, GFX10.
 //===----------------------------------------------------------------------===//
 
-let AssemblerPredicate = isGFX6GFX7 in {
+let AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" in {
   multiclass VOPC_Real_gfx6_gfx7<bits<9> op> {
-    let DecoderNamespace = "GFX6GFX7" in {
-      def _e32_gfx6_gfx7 :
-        VOPC_Real<!cast<VOPC_Pseudo>(NAME#"_e32"), SIEncodingFamily.SI>,
-        VOPCe<op{7-0}>;
-      def _e64_gfx6_gfx7 :
-        VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.SI>,
-        VOP3a_gfx6_gfx7<op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> {
-        // Encoding used for VOPC instructions encoded as VOP3 differs from
-        // VOP3e by destination name (sdst) as VOPC doesn't have vector dst.
-        bits<8> sdst;
-        let Inst{7-0} = sdst;
-      }
-    } // End DecoderNamespace = "GFX6GFX7"
+    def _e32_gfx6_gfx7 :
+      VOPC_Real<!cast<VOPC_Pseudo>(NAME#"_e32"), SIEncodingFamily.SI>,
+      VOPCe<op{7-0}>;
+    def _e64_gfx6_gfx7 :
+      VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.SI>,
+      VOP3a_gfx6_gfx7<op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> {
+      // Encoding used for VOPC instructions encoded as VOP3 differs from
+      // VOP3e by destination name (sdst) as VOPC doesn't have vector dst.
+      bits<8> sdst;
+      let Inst{7-0} = sdst;
+    }
 
     defm : VOPCInstAliases<NAME, "gfx6_gfx7">;
   }
-} // End AssemblerPredicate = isGFX6GFX7
+} // End AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7"
 
 multiclass VOPC_Real_gfx6_gfx7_gfx10<bits<9> op> :
   VOPC_Real_gfx6_gfx7<op>, VOPC_Real_gfx10<op>;
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index 801afab..80d7d96 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -818,6 +818,7 @@ class VOP_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[],
   let VALU = 1;
   let DPP = 1;
   let Size = 8;
+  let IsPacked = P.IsPacked;
 
   let ReadsModeReg = !or(P.DstVT.isFP, P.Src0VT.isFP);
 
@@ -835,7 +836,7 @@ class VOP_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[],
                                         AMDGPUAsmVariants.Disable);
   let Constraints = !if(P.NumSrcArgs, P.TieRegDPP # " = $vdst", "");
   let DisableEncoding = !if(P.NumSrcArgs, P.TieRegDPP, "");
-  let DecoderNamespace = "DPP";
+  let DecoderNamespace = "GFX8";
 
   VOPProfile Pfl = P;
 }
@@ -906,7 +907,7 @@ class VOP_DPP_Base <string OpName, VOPProfile P,
                                         AMDGPUAsmVariants.Disable);
   let Constraints = !if(P.NumSrcArgs, P.TieRegDPP # " = $vdst", "");
   let DisableEncoding = !if(P.NumSrcArgs, P.TieRegDPP, "");
-  let DecoderNamespace = "DPP";
+  let DecoderNamespace = "GFX8";
 }
 
 class VOP_DPP <string OpName, VOPProfile P, bit IsDPP16,
@@ -1350,7 +1351,7 @@ class VOP3_DPP16_Gen<bits<10> op, VOP_DPP_Pseudo ps, GFXGen Gen,
   VOP3_DPP16 <op, ps, Gen.Subtarget, opName> {
   let AssemblerPredicate = Gen.AssemblerPredicate;
   let True16Predicate = !if(ps.Pfl.IsRealTrue16, UseRealTrue16Insts, NoTrue16Predicate);
-  let DecoderNamespace = "DPP"#Gen.DecoderNamespace#
+  let DecoderNamespace = Gen.DecoderNamespace#
                          !if(ps.Pfl.IsRealTrue16, "", "_FAKE16");
 }
 
@@ -1463,7 +1464,7 @@ multiclass VOP3_Real_dpp_with_name<GFXGen Gen, bits<10> op, string opName,
 multiclass VOP3_Real_dpp8_Base<GFXGen Gen, bits<10> op, string opName = NAME> {
   defvar ps = !cast<VOP3_Pseudo>(opName#"_e64");
   def _e64_dpp8#Gen.Suffix : Base_VOP3_DPP8<op, ps> {
-    let DecoderNamespace = "DPP8"#Gen.DecoderNamespace;
+    let DecoderNamespace = Gen.DecoderNamespace;
     let AssemblerPredicate = Gen.AssemblerPredicate;
   }
 }
@@ -1473,7 +1474,7 @@ multiclass VOP3Dot_Real_dpp8_Base<GFXGen Gen, bits<10> op, string opName = NAME>
   def _e64_dpp8#Gen.Suffix : Base_VOP3_DPP8<op, ps> {
     let Inst{11} = ?;
     let Inst{12} = ?;
-    let DecoderNamespace = "DPP8"#Gen.DecoderNamespace;
+    let DecoderNamespace = Gen.DecoderNamespace;
     let AssemblerPredicate = Gen.AssemblerPredicate;
   }
 }
@@ -1482,7 +1483,7 @@ multiclass VOP3_Real_dpp8_with_name<GFXGen Gen, bits<10> op, string opName,
                                     string asmName> {
   defvar ps = !cast<VOP3_Pseudo>(opName#"_e64");
   let AsmString = asmName # ps.Pfl.AsmVOP3DPP8,
-      DecoderNamespace = "DPP8"#Gen.DecoderNamespace#
+      DecoderNamespace = Gen.DecoderNamespace#
                          !if(ps.Pfl.IsRealTrue16, "", "_FAKE16"),
       True16Predicate = !if(ps.Pfl.IsRealTrue16, UseRealTrue16Insts,
                             NoTrue16Predicate) in {
@@ -1505,7 +1506,7 @@ multiclass VOP3be_Real_dpp<GFXGen Gen, bits<10> op, string opName,
   defvar dpp_ps = !cast<VOP_DPP_Pseudo>(opName #"_e64" #"_dpp");
   def _e64_dpp#Gen.Suffix : Base_VOP3b_DPP16<op, dpp_ps, asmName>,
                             SIMCInstr<dpp_ps.PseudoInstr, Gen.Subtarget> {
-    let DecoderNamespace = "DPP"#Gen.DecoderNamespace;
+    let DecoderNamespace = Gen.DecoderNamespace;
     let AssemblerPredicate = Gen.AssemblerPredicate;
   }
 }
@@ -1514,7 +1515,7 @@ multiclass VOP3be_Real_dpp8<GFXGen Gen, bits<10> op, string opName,
                             string asmName> {
   defvar ps = !cast<VOP3_Pseudo>(opName #"_e64");
   def _e64_dpp8#Gen.Suffix : VOP3b_DPP8_Base<op, ps, asmName> {
-    let DecoderNamespace = "DPP8"#Gen.DecoderNamespace;
+    let DecoderNamespace = Gen.DecoderNamespace;
     let AssemblerPredicate = Gen.AssemblerPredicate;
   }
 }
diff --git a/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp b/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp
index c5199aab..00a29f8 100644
--- a/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp
@@ -25,42 +25,6 @@
 using namespace llvm;
 using namespace LegalizeActions;
 
-/// FIXME: The following static functions are SizeChangeStrategy functions
-/// that are meant to temporarily mimic the behaviour of the old legalization
-/// based on doubling/halving non-legal types as closely as possible. This is
-/// not entirly possible as only legalizing the types that are exactly a power
-/// of 2 times the size of the legal types would require specifying all those
-/// sizes explicitly.
-/// In practice, not specifying those isn't a problem, and the below functions
-/// should disappear quickly as we add support for legalizing non-power-of-2
-/// sized types further.
-static void addAndInterleaveWithUnsupported(
-    LegacyLegalizerInfo::SizeAndActionsVec &result,
-    const LegacyLegalizerInfo::SizeAndActionsVec &v) {
-  for (unsigned i = 0; i < v.size(); ++i) {
-    result.push_back(v[i]);
-    if (i + 1 < v[i].first && i + 1 < v.size() &&
-        v[i + 1].first != v[i].first + 1)
-      result.push_back({v[i].first + 1, LegacyLegalizeActions::Unsupported});
-  }
-}
-
-static LegacyLegalizerInfo::SizeAndActionsVec
-widen_8_16(const LegacyLegalizerInfo::SizeAndActionsVec &v) {
-  assert(v.size() >= 1);
-  assert(v[0].first > 17);
-  LegacyLegalizerInfo::SizeAndActionsVec result = {
-      {1, LegacyLegalizeActions::Unsupported},
-      {8, LegacyLegalizeActions::WidenScalar},
-      {9, LegacyLegalizeActions::Unsupported},
-      {16, LegacyLegalizeActions::WidenScalar},
-      {17, LegacyLegalizeActions::Unsupported}};
-  addAndInterleaveWithUnsupported(result, v);
-  auto Largest = result.back().first;
-  result.push_back({Largest + 1, LegacyLegalizeActions::Unsupported});
-  return result;
-}
-
 static bool AEABI(const ARMSubtarget &ST) {
   return ST.isTargetAEABI() || ST.isTargetGNUAEABI() || ST.isTargetMuslAEABI();
 }
@@ -118,15 +82,14 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) {
         .libcallFor({s32})
         .clampScalar(0, s32, s32);
 
-  for (unsigned Op : {G_SREM, G_UREM}) {
-    LegacyInfo.setLegalizeScalarToDifferentSizeStrategy(Op, 0, widen_8_16);
-    if (HasHWDivide)
-      LegacyInfo.setAction({Op, s32}, LegacyLegalizeActions::Lower);
-    else if (AEABI(ST))
-      LegacyInfo.setAction({Op, s32}, LegacyLegalizeActions::Custom);
-    else
-      LegacyInfo.setAction({Op, s32}, LegacyLegalizeActions::Libcall);
-  }
+  auto &REMBuilder =
+      getActionDefinitionsBuilder({G_SREM, G_UREM}).minScalar(0, s32);
+  if (HasHWDivide)
+    REMBuilder.lowerFor({s32});
+  else if (AEABI(ST))
+    REMBuilder.customFor({s32});
+  else
+    REMBuilder.libcallFor({s32});
 
   getActionDefinitionsBuilder(G_INTTOPTR)
       .legalFor({{p0, s32}})
@@ -202,8 +165,7 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) {
 
     LoadStoreBuilder.maxScalar(0, s32);
 
-    for (auto Ty : {s32, s64})
-      LegacyInfo.setAction({G_FNEG, Ty}, LegacyLegalizeActions::Lower);
+    getActionDefinitionsBuilder(G_FNEG).lowerFor({s32, s64});
 
     getActionDefinitionsBuilder(G_FCONSTANT).customFor({s32, s64});
 
diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td
index 5215813..8a3454c 100644
--- a/llvm/lib/Target/DirectX/DXIL.td
+++ b/llvm/lib/Target/DirectX/DXIL.td
@@ -35,30 +35,18 @@ def BinaryUintCategory : DXILOpCategory<"Binary uint">;
 def UnaryFloatCategory : DXILOpCategory<"Unary float">;
 def ComputeIDCategory : DXILOpCategory<"Compute/Mesh/Amplification shader">;
 
-// Following are the scalar types supported by DXIL operations and are synonymous
-// to llvm_*_ty defined for readability and ease of use in the context of this file.
-
-def voidTy  : LLVMType<isVoid>;
-
-// Floating point types
-def f16Ty   : LLVMType<f16>;
-def f32Ty   : LLVMType<f32>;
-def f64Ty   : LLVMType<f64>;
-
-// Integer types
-def i1Ty   : LLVMType<i1>;
-def i8Ty   : LLVMType<i8>;
-def i16Ty  : LLVMType<i16>;
-def i32Ty  : LLVMType<i32>;
-def i64Ty  : LLVMType<i64>;
+// Represent as any pointer type with an option to change to a qualified pointer
+// type with address space specified.
+def dxil_handle_ty  : LLVMAnyPointerType;
+def dxil_cbuffer_ty : LLVMAnyPointerType;
+def dxil_resource_ty : LLVMAnyPointerType;
 
 // The parameter description for a DXIL operation
-class DXILOpParameter<int pos, string type, string name, string doc,
+class DXILOpParameter<int pos, LLVMType type, string name, string doc,
                  bit isConstant = 0, string enumName = "",
                  int maxValue = 0> {
   int Pos = pos;               // Position in parameter list
-  string Type = type;          // LLVM type name, $o for overload, $r for resource
-                               // type, $cb for legacy cbuffer, $u4 for u4 struct
+  LLVMType ParamType = type;   // Parameter type
   string Name = name;          // Short, unique parameter name
   string Doc = doc;            // Description of this parameter
   bit IsConstant = isConstant; // Whether this parameter requires a constant value in the IR
@@ -108,55 +96,55 @@ class DXILOperation<string name, int opCode, DXILOpClass opClass, DXILOpCategory
 class LLVMIntrinsic<Intrinsic llvm_intrinsic_> { Intrinsic llvm_intrinsic = llvm_intrinsic_; }
 
 def Sin : DXILOperation<"Sin", 13, UnaryClass, UnaryFloatCategory, "returns sine(theta) for theta in radians.",
-  [f16Ty,f32Ty], ReadNone,
+  [llvm_half_ty, llvm_float_ty], ReadNone,
   [
-    DXILOpParameter<0, "$o", "", "operation result">,
-    DXILOpParameter<1, "i32", "opcode", "DXIL opcode">,
-    DXILOpParameter<2, "$o", "value", "input value">
+    DXILOpParameter<0, llvm_anyfloat_ty, "", "operation result">,
+    DXILOpParameter<1, llvm_i32_ty, "opcode", "DXIL opcode">,
+    DXILOpParameter<2, llvm_anyfloat_ty, "value", "input value">
   ],
   ["floats"]>,
   LLVMIntrinsic<int_sin>;
 
-def UMax : DXILOperation< "UMax", 39,  BinaryClass,  BinaryUintCategory, "unsigned integer maximum. UMax(a,b) = a > b ? a : b",
-    [i16Ty,i32Ty,i64Ty],  ReadNone,
+def UMax : DXILOperation< "UMax", 39, BinaryClass, BinaryUintCategory, "unsigned integer maximum. UMax(a,b) = a > b ? a : b",
+    [llvm_i16_ty, llvm_i32_ty, llvm_i64_ty], ReadNone,
   [
-    DXILOpParameter<0,  "$o",  "",  "operation result">,
-    DXILOpParameter<1,  "i32",  "opcode",  "DXIL opcode">,
-    DXILOpParameter<2,  "$o",  "a",  "input value">,
-    DXILOpParameter<3,  "$o",  "b",  "input value">
+    DXILOpParameter<0, llvm_anyint_ty, "", "operation result">,
+    DXILOpParameter<1, llvm_i32_ty, "opcode", "DXIL opcode">,
+    DXILOpParameter<2, llvm_anyint_ty, "a", "input value">,
+    DXILOpParameter<3, llvm_anyint_ty, "b", "input value">
   ],
   ["uints"]>,
   LLVMIntrinsic<int_umax>;
 
-def ThreadId : DXILOperation< "ThreadId", 93,  ThreadIdClass, ComputeIDCategory, "reads the thread ID", [i32Ty],  ReadNone,
+def ThreadId : DXILOperation< "ThreadId", 93, ThreadIdClass, ComputeIDCategory, "reads the thread ID", [llvm_i32_ty], ReadNone,
   [
-    DXILOpParameter<0,  "i32",  "",  "thread ID component">,
-    DXILOpParameter<1,  "i32",  "opcode",  "DXIL opcode">,
-    DXILOpParameter<2,  "i32",  "component",  "component to read (x,y,z)">
+    DXILOpParameter<0, llvm_i32_ty, "", "thread ID component">,
+    DXILOpParameter<1, llvm_i32_ty, "opcode", "DXIL opcode">,
+    DXILOpParameter<2, llvm_i32_ty, "component", "component to read (x,y,z)">
   ]>,
   LLVMIntrinsic<int_dx_thread_id>;
 
-def GroupId : DXILOperation< "GroupId", 94,  GroupIdClass, ComputeIDCategory, "reads the group ID (SV_GroupID)", [i32Ty],  ReadNone,
+def GroupId : DXILOperation< "GroupId", 94, GroupIdClass, ComputeIDCategory, "reads the group ID (SV_GroupID)", [llvm_i32_ty], ReadNone,
   [
-    DXILOpParameter<0,  "i32",  "",  "group ID component">,
-    DXILOpParameter<1,  "i32",  "opcode",  "DXIL opcode">,
-    DXILOpParameter<2,  "i32",  "component",  "component to read">
+    DXILOpParameter<0, llvm_i32_ty, "", "group ID component">,
+    DXILOpParameter<1, llvm_i32_ty, "opcode", "DXIL opcode">,
+    DXILOpParameter<2, llvm_i32_ty, "component", "component to read">
   ]>,
   LLVMIntrinsic<int_dx_group_id>;
 
-def ThreadIdInGroup : DXILOperation< "ThreadIdInGroup", 95,  ThreadIdInGroupClass, ComputeIDCategory,
-  "reads the thread ID within the group (SV_GroupThreadID)", [i32Ty],  ReadNone,
+def ThreadIdInGroup : DXILOperation< "ThreadIdInGroup", 95, ThreadIdInGroupClass, ComputeIDCategory,
+  "reads the thread ID within the group (SV_GroupThreadID)", [llvm_i32_ty], ReadNone,
   [
-    DXILOpParameter<0,  "i32",  "",  "thread ID in group component">,
-    DXILOpParameter<1,  "i32",  "opcode",  "DXIL opcode">,
-    DXILOpParameter<2,  "i32",  "component",  "component to read (x,y,z)">
+    DXILOpParameter<0, llvm_i32_ty, "", "thread ID in group component">,
+    DXILOpParameter<1, llvm_i32_ty, "opcode", "DXIL opcode">,
+    DXILOpParameter<2, llvm_i32_ty, "component", "component to read (x,y,z)">
   ]>,
   LLVMIntrinsic<int_dx_thread_id_in_group>;
 
-def FlattenedThreadIdInGroup : DXILOperation< "FlattenedThreadIdInGroup", 96,  FlattenedThreadIdInGroupClass, ComputeIDCategory,
-   "provides a flattened index for a given thread within a given group (SV_GroupIndex)", [i32Ty],  ReadNone,
+def FlattenedThreadIdInGroup : DXILOperation< "FlattenedThreadIdInGroup", 96, FlattenedThreadIdInGroupClass, ComputeIDCategory,
+   "provides a flattened index for a given thread within a given group (SV_GroupIndex)", [llvm_i32_ty], ReadNone,
   [
-    DXILOpParameter<0,  "i32",  "",  "result">,
-    DXILOpParameter<1,  "i32",  "opcode",  "DXIL opcode">
+    DXILOpParameter<0, llvm_i32_ty, "", "result">,
+    DXILOpParameter<1, llvm_i32_ty, "opcode", "DXIL opcode">
   ]>,
   LLVMIntrinsic<int_dx_flattened_thread_id_in_group>;
diff --git a/llvm/lib/Target/Hexagon/CMakeLists.txt b/llvm/lib/Target/Hexagon/CMakeLists.txt
index 76f99b4..2870f0b 100644
--- a/llvm/lib/Target/Hexagon/CMakeLists.txt
+++ b/llvm/lib/Target/Hexagon/CMakeLists.txt
@@ -33,6 +33,7 @@ add_llvm_target(HexagonCodeGen
   HexagonFrameLowering.cpp
   HexagonGenExtract.cpp
   HexagonGenInsert.cpp
+  HexagonGenMemAbsolute.cpp
   HexagonGenMux.cpp
   HexagonGenPredicate.cpp
   HexagonHardwareLoops.cpp
@@ -50,6 +51,7 @@ add_llvm_target(HexagonCodeGen
   HexagonOptAddrMode.cpp
   HexagonOptimizeSZextends.cpp
   HexagonPeephole.cpp
+  HexagonPostIncOpt.cpp
   HexagonRDFOpt.cpp
   HexagonRegisterInfo.cpp
   HexagonSelectionDAGInfo.cpp
@@ -60,6 +62,7 @@ add_llvm_target(HexagonCodeGen
   HexagonTargetMachine.cpp
   HexagonTargetObjectFile.cpp
   HexagonTargetTransformInfo.cpp
+  HexagonTfrCleanup.cpp
   HexagonVectorCombine.cpp
   HexagonVectorLoopCarriedReuse.cpp
   HexagonVectorPrint.cpp
diff --git a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp
index 6024d9f..3b8234c 100644
--- a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp
@@ -1957,7 +1957,8 @@ bool BitSimplification::genStoreUpperHalf(MachineInstr *MI) {
     return false;
   const BitTracker::RegisterCell &RC = BT.lookup(RS.Reg);
   RegHalf H;
-  if (!matchHalf(0, RC, 0, H))
+  unsigned B = (RS.Sub == Hexagon::isub_hi) ? 32 : 0;
+  if (!matchHalf(0, RC, B, H))
     return false;
   if (H.Low)
     return false;
diff --git a/llvm/lib/Target/Hexagon/HexagonGenMemAbsolute.cpp b/llvm/lib/Target/Hexagon/HexagonGenMemAbsolute.cpp
new file mode 100644
index 0000000..afd4963
--- /dev/null
+++ b/llvm/lib/Target/Hexagon/HexagonGenMemAbsolute.cpp
@@ -0,0 +1,274 @@
+//===--- HexagonGenMemAbsolute.cpp - Generate Load/Store Set Absolute ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// This pass traverses through all the basic blocks in a function and converts
+// an indexed load/store with offset "0" to a absolute-set load/store
+// instruction as long as the use of the register in the new instruction
+// dominates the rest of the uses and there are more than 2 uses.
+
+#include "HexagonTargetMachine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+
+#define DEBUG_TYPE "hexagon-abs"
+
+using namespace llvm;
+
+STATISTIC(HexagonNumLoadAbsConversions,
+          "Number of Load instructions converted to absolute-set form");
+STATISTIC(HexagonNumStoreAbsConversions,
+          "Number of Store instructions converted to absolute-set form");
+
+namespace llvm {
+FunctionPass *createHexagonGenMemAbsolute();
+void initializeHexagonGenMemAbsolutePass(PassRegistry &Registry);
+} // namespace llvm
+
+namespace {
+
+class HexagonGenMemAbsolute : public MachineFunctionPass {
+  const HexagonInstrInfo *TII;
+  MachineRegisterInfo *MRI;
+  const TargetRegisterInfo *TRI;
+
+public:
+  static char ID;
+  HexagonGenMemAbsolute() : MachineFunctionPass(ID), TII(0), MRI(0), TRI(0) {
+    initializeHexagonGenMemAbsolutePass(*PassRegistry::getPassRegistry());
+  }
+
+  StringRef getPassName() const override {
+    return "Hexagon Generate Load/Store Set Absolute Address Instruction";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    MachineFunctionPass::getAnalysisUsage(AU);
+    AU.addRequired<MachineDominatorTree>();
+    AU.addPreserved<MachineDominatorTree>();
+  }
+
+  bool runOnMachineFunction(MachineFunction &Fn) override;
+
+private:
+  static bool isValidIndexedLoad(int &Opcode, int &NewOpcode);
+  static bool isValidIndexedStore(int &Opcode, int &NewOpcode);
+};
+} // namespace
+
+char HexagonGenMemAbsolute::ID = 0;
+
+INITIALIZE_PASS(HexagonGenMemAbsolute, "hexagon-gen-load-absolute",
+                "Hexagon Generate Load/Store Set Absolute Address Instruction",
+                false, false)
+
+bool HexagonGenMemAbsolute::runOnMachineFunction(MachineFunction &Fn) {
+  if (skipFunction(Fn.getFunction()))
+    return false;
+
+  TII = Fn.getSubtarget<HexagonSubtarget>().getInstrInfo();
+  MRI = &Fn.getRegInfo();
+  TRI = Fn.getRegInfo().getTargetRegisterInfo();
+
+  MachineDominatorTree &MDT = getAnalysis<MachineDominatorTree>();
+
+  // Loop over all of the basic blocks
+  for (MachineFunction::iterator MBBb = Fn.begin(), MBBe = Fn.end();
+       MBBb != MBBe; ++MBBb) {
+    MachineBasicBlock *MBB = &*MBBb;
+    // Traverse the basic block
+    for (MachineBasicBlock::iterator MII = MBB->begin(); MII != MBB->end();
+         ++MII) {
+      MachineInstr *MI = &*MII;
+      int Opc = MI->getOpcode();
+      if (Opc != Hexagon::CONST32 && Opc != Hexagon::A2_tfrsi)
+        continue;
+
+      const MachineOperand &MO = MI->getOperand(0);
+      if (!MO.isReg() || !MO.isDef())
+        continue;
+
+      unsigned DstReg = MO.getReg();
+      if (MRI->use_nodbg_empty(DstReg))
+        continue;
+
+      typedef MachineRegisterInfo::use_nodbg_iterator use_iterator;
+      use_iterator NextUseMI = MRI->use_nodbg_begin(DstReg);
+
+      MachineInstr *NextMI = NextUseMI->getParent();
+      int NextOpc = NextMI->getOpcode();
+      int NewOpc;
+      bool IsLoad = isValidIndexedLoad(NextOpc, NewOpc);
+
+      if (!IsLoad && !isValidIndexedStore(NextOpc, NewOpc))
+        continue;
+
+      // Base and Offset positions for load and store instructions
+      // Load R(dest), R(base), Imm -> R(dest) = mem(R(base) + Imm)
+      // Store R(base), Imm, R (src) -> mem(R(base) + Imm) = R(src)
+      unsigned BaseRegPos, ImmPos, RegPos;
+      if (!TII->getBaseAndOffsetPosition(*NextMI, BaseRegPos, ImmPos))
+        continue;
+      RegPos = IsLoad ? 0 : 2;
+
+      bool IsGlobal = MI->getOperand(1).isGlobal();
+      if (!MI->getOperand(1).isImm() && !IsGlobal)
+        continue;
+
+      const MachineOperand *BaseOp = nullptr;
+      int64_t Offset;
+      bool Scalable;
+      TII->getMemOperandWithOffset(*NextMI, BaseOp, Offset, Scalable, TRI);
+
+      // Ensure BaseOp is non-null and register type.
+      if (!BaseOp || !BaseOp->isReg())
+        continue;
+
+      if (Scalable)
+        continue;
+
+      unsigned BaseReg = BaseOp->getReg();
+      if ((DstReg != BaseReg) || (Offset != 0))
+        continue;
+
+      const MachineOperand &MO0 = NextMI->getOperand(RegPos);
+
+      if (!MO0.isReg())
+        continue;
+
+      unsigned LoadStoreReg = MO0.getReg();
+
+      // Store: Bail out if the src and base are same (def and use on same
+      // register).
+      if (LoadStoreReg == BaseReg)
+        continue;
+
+      // Insert the absolute-set instruction "I" only if the use of the
+      // BaseReg in "I" dominates the rest of the uses of BaseReg and if
+      // there are more than 2 uses of this BaseReg.
+      bool Dominates = true;
+      unsigned Counter = 0;
+      for (use_iterator I = NextUseMI, E = MRI->use_nodbg_end(); I != E; ++I) {
+        Counter++;
+        if (!MDT.dominates(NextMI, I->getParent()))
+          Dominates = false;
+      }
+
+      if ((!Dominates) || (Counter < 3))
+        continue;
+
+      // If we reach here, we have met all the conditions required for the
+      // replacement of the absolute instruction.
+      LLVM_DEBUG({
+        dbgs() << "Found a pair of instructions for absolute-set "
+               << (IsLoad ? "load" : "store") << "\n";
+        dbgs() << *MI;
+        dbgs() << *NextMI;
+      });
+      MachineBasicBlock *ParentBlock = NextMI->getParent();
+      MachineInstrBuilder MIB;
+      if (IsLoad) { // Insert absolute-set load instruction
+        ++HexagonNumLoadAbsConversions;
+        MIB = BuildMI(*ParentBlock, NextMI, NextMI->getDebugLoc(),
+                      TII->get(NewOpc), LoadStoreReg)
+                  .addReg(DstReg, RegState::Define);
+      } else { // Insert absolute-set store instruction
+        ++HexagonNumStoreAbsConversions;
+        MIB = BuildMI(*ParentBlock, NextMI, NextMI->getDebugLoc(),
+                      TII->get(NewOpc), DstReg);
+      }
+
+      MachineOperand ImmOperand = MI->getOperand(1);
+      if (IsGlobal)
+        MIB.addGlobalAddress(ImmOperand.getGlobal(), ImmOperand.getOffset(),
+                             ImmOperand.getTargetFlags());
+      else
+        MIB.addImm(ImmOperand.getImm());
+
+      if (IsLoad)
+        MIB->getOperand(0).setSubReg(MO0.getSubReg());
+      else
+        MIB.addReg(LoadStoreReg, 0, MO0.getSubReg());
+
+      LLVM_DEBUG(dbgs() << "Replaced with " << *MIB << "\n");
+      // Erase the instructions that got replaced.
+      MII = MBB->erase(MI);
+      --MII;
+      NextMI->getParent()->erase(NextMI);
+    }
+  }
+
+  return true;
+}
+
+bool HexagonGenMemAbsolute::isValidIndexedLoad(int &Opc, int &NewOpc) {
+
+  bool Result = true;
+  switch (Opc) {
+  case Hexagon::L2_loadrb_io:
+    NewOpc = Hexagon::L4_loadrb_ap;
+    break;
+  case Hexagon::L2_loadrh_io:
+    NewOpc = Hexagon::L4_loadrh_ap;
+    break;
+  case Hexagon::L2_loadri_io:
+    NewOpc = Hexagon::L4_loadri_ap;
+    break;
+  case Hexagon::L2_loadrd_io:
+    NewOpc = Hexagon::L4_loadrd_ap;
+    break;
+  case Hexagon::L2_loadruh_io:
+    NewOpc = Hexagon::L4_loadruh_ap;
+    break;
+  case Hexagon::L2_loadrub_io:
+    NewOpc = Hexagon::L4_loadrub_ap;
+    break;
+  default:
+    Result = false;
+  }
+
+  return Result;
+}
+
+bool HexagonGenMemAbsolute::isValidIndexedStore(int &Opc, int &NewOpc) {
+
+  bool Result = true;
+  switch (Opc) {
+  case Hexagon::S2_storerd_io:
+    NewOpc = Hexagon::S4_storerd_ap;
+    break;
+  case Hexagon::S2_storeri_io:
+    NewOpc = Hexagon::S4_storeri_ap;
+    break;
+  case Hexagon::S2_storerh_io:
+    NewOpc = Hexagon::S4_storerh_ap;
+    break;
+  case Hexagon::S2_storerb_io:
+    NewOpc = Hexagon::S4_storerb_ap;
+    break;
+  default:
+    Result = false;
+  }
+
+  return Result;
+}
+
+//===----------------------------------------------------------------------===//
+//                         Public Constructor Functions
+//===----------------------------------------------------------------------===//
+
+FunctionPass *llvm::createHexagonGenMemAbsolute() {
+  return new HexagonGenMemAbsolute();
+}
diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
index 619c7dc..91cc930 100644
--- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
@@ -1655,6 +1655,13 @@ bool HexagonInstrInfo::isPostIncrement(const MachineInstr &MI) const {
   return getAddrMode(MI) == HexagonII::PostInc;
 }
 
+bool HexagonInstrInfo::isPostIncWithImmOffset(const MachineInstr &MI) const {
+  unsigned BasePos, OffsetPos;
+  if (!getBaseAndOffsetPosition(MI, BasePos, OffsetPos))
+    return false;
+  return isPostIncrement(MI) && MI.getOperand(OffsetPos).isImm();
+}
+
 // Returns true if an instruction is predicated irrespective of the predicate
 // sense. For example, all of the following will return true.
 // if (p0) R1 = add(R2, R3)
@@ -2436,6 +2443,55 @@ bool HexagonInstrInfo::isLoopN(const MachineInstr &MI) const {
          Opcode == Hexagon::J2_loop1rext;
 }
 
+bool HexagonInstrInfo::isCircBufferInstr(const MachineInstr &MI) const {
+  switch (MI.getOpcode()) {
+  default:
+    return false;
+  case Hexagon::L2_loadalignb_pci:
+  case Hexagon::L2_loadalignb_pcr:
+  case Hexagon::L2_loadalignh_pci:
+  case Hexagon::L2_loadalignh_pcr:
+  case Hexagon::L2_loadbsw2_pci:
+  case Hexagon::L2_loadbsw2_pcr:
+  case Hexagon::L2_loadbsw4_pci:
+  case Hexagon::L2_loadbsw4_pcr:
+  case Hexagon::L2_loadbzw2_pci:
+  case Hexagon::L2_loadbzw2_pcr:
+  case Hexagon::L2_loadbzw4_pci:
+  case Hexagon::L2_loadbzw4_pcr:
+  case Hexagon::L2_loadrb_pci:
+  case Hexagon::L2_loadrb_pcr:
+  case Hexagon::L2_loadrd_pci:
+  case Hexagon::L2_loadrd_pcr:
+  case Hexagon::L2_loadrh_pci:
+  case Hexagon::L2_loadrh_pcr:
+  case Hexagon::L2_loadri_pci:
+  case Hexagon::L2_loadri_pcr:
+  case Hexagon::L2_loadrub_pci:
+  case Hexagon::L2_loadrub_pcr:
+  case Hexagon::L2_loadruh_pci:
+  case Hexagon::L2_loadruh_pcr:
+  case Hexagon::S2_storerbnew_pci:
+  case Hexagon::S2_storerbnew_pcr:
+  case Hexagon::S2_storerb_pci:
+  case Hexagon::S2_storerb_pcr:
+  case Hexagon::S2_storerd_pci:
+  case Hexagon::S2_storerd_pcr:
+  case Hexagon::S2_storerf_pci:
+  case Hexagon::S2_storerf_pcr:
+  case Hexagon::S2_storerhnew_pci:
+  case Hexagon::S2_storerhnew_pcr:
+  case Hexagon::S2_storerh_pci:
+  case Hexagon::S2_storerh_pcr:
+  case Hexagon::S2_storerinew_pci:
+  case Hexagon::S2_storerinew_pcr:
+  case Hexagon::S2_storeri_pci:
+  case Hexagon::S2_storeri_pcr:
+    return true;
+  }
+  return false;
+}
+
 bool HexagonInstrInfo::isMemOp(const MachineInstr &MI) const {
   switch (MI.getOpcode()) {
     default: return false;
diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.h b/llvm/lib/Target/Hexagon/HexagonInstrInfo.h
index e496995..65783c5 100644
--- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.h
+++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.h
@@ -434,6 +434,8 @@ public:
   bool predCanBeUsedAsDotNew(const MachineInstr &MI, Register PredReg) const;
   bool PredOpcodeHasJMP_c(unsigned Opcode) const;
   bool predOpcodeHasNot(ArrayRef<MachineOperand> Cond) const;
+  bool isPostIncWithImmOffset(const MachineInstr &MI) const;
+  bool isCircBufferInstr(const MachineInstr &MI) const;
 
   unsigned getAddrMode(const MachineInstr &MI) const;
   MachineOperand *getBaseAndOffset(const MachineInstr &MI, int64_t &Offset,
diff --git a/llvm/lib/Target/Hexagon/HexagonPostIncOpt.cpp b/llvm/lib/Target/Hexagon/HexagonPostIncOpt.cpp
new file mode 100644
index 0000000..4c845f2
--- /dev/null
+++ b/llvm/lib/Target/Hexagon/HexagonPostIncOpt.cpp
@@ -0,0 +1,689 @@
+//===-- HexagonPostIncOpt.cpp - Hexagon Post Increment Optimization Pass --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// Convert post-inc addressing mode into base-offset addressing mode.
+// Ex:
+// original loop:
+// v1 = phi(v0, v3)
+// v2,v3 = post_load v1, 4
+
+// Often, unroller creates below form of post-increments:
+// v1 = phi(v0, v3')
+// v2,v3  = post_load v1, 4
+// v2',v3'= post_load v3, 4
+
+// This can be optimized in two ways
+
+// 1.
+// v1 = phi(v0, v3')
+// v2,v3' = post_load v1, 8
+// v2' = load v3', -4
+//
+// 2.
+// v1 = phi(v0, v3')
+// v2,v3' = post_load v1, 8
+// v2' = load v1, 4
+//
+// Option 2 is favored as we can packetize two memory operations in a single
+// packet. However, this is not always favorable due to memory dependences
+// and in cases where we form a bigger chain of post-increment ops that will
+// create more spills as we can not execute post-increment ops with out
+// executing base-offset instructions.
+//===----------------------------------------------------------------------===//
+#include "HexagonInstrInfo.h"
+#include "HexagonSubtarget.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "hexagon-postincopt"
+
+static cl::opt<unsigned> PostIncChainThreshold(
+    "post-inc-chain-threshold", cl::Hidden, cl::init(4),
+    cl::desc("Limit the number of post-inc instructions in a chain."));
+
+static cl::opt<bool> PreferPostIncStore(
+    "prefer-post-inc-store", cl::Hidden, cl::init(true),
+    cl::desc("Prefer post-inc store in a list of loads and stores."));
+
+namespace llvm {
+void initializeHexagonPostIncOptPass(PassRegistry &);
+FunctionPass *createHexagonPostIncOpt();
+} // namespace llvm
+
+namespace {
+
+class HexagonPostIncOpt : public MachineFunctionPass {
+  MachineLoopInfo *MLI = nullptr;
+  const HexagonInstrInfo *HII = nullptr;
+  const TargetRegisterInfo *TRI = nullptr;
+  const MachineRegisterInfo *MRI = nullptr;
+  const HexagonSubtarget *HST = nullptr;
+
+public:
+  static char ID;
+
+  HexagonPostIncOpt() : MachineFunctionPass(ID) {
+    initializeHexagonPostIncOptPass(*PassRegistry::getPassRegistry());
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<AAResultsWrapperPass>();
+    AU.addPreserved<AAResultsWrapperPass>();
+    AU.addRequired<MachineLoopInfo>();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  StringRef getPassName() const override { return "Hexagon Post-Inc-Opt Pass"; }
+
+  bool runOnMachineFunction(MachineFunction &Fn) override;
+
+private:
+  bool translatePostIncsInLoop(MachineBasicBlock &MBB);
+  void replacePostIncWithBaseOffset(MachineBasicBlock &MBB) const;
+  void replacePostIncWithBaseOffset(MachineInstr &MI) const;
+  bool isPostIncInsn(MachineInstr &MI) const;
+  void foldAdds(MachineBasicBlock &MBB) const;
+  void updateBaseAndOffset(MachineInstr &MI, MachineInstr &AddMI) const;
+  void removeDeadInstructions(MachineBasicBlock &MBB) const;
+
+  void generatePostInc(MachineBasicBlock &MBB);
+  bool canReplaceWithPostInc(MachineInstr *MI, MachineInstr *AddMI) const;
+  void replaceWithPostInc(MachineInstr *MI, MachineInstr *AddMI) const;
+
+  bool isValidOffset(const MachineInstr &MI, int64_t Offset) const;
+  bool isValidPostIncValue(const MachineInstr &MI, int IncVal) const;
+};
+
+class HexagonPostIncOptSchedDAG : public ScheduleDAGInstrs {
+  HexagonPostIncOpt &Pass;
+
+public:
+  HexagonPostIncOptSchedDAG(HexagonPostIncOpt &P, MachineFunction &MF,
+                            MachineLoopInfo *MLI)
+      : ScheduleDAGInstrs(MF, MLI, false), Pass(P){};
+  void schedule() override;
+  ScheduleDAGTopologicalSort &getTopo() { return Topo; };
+};
+
+} // End anonymous namespace.
+
+char HexagonPostIncOpt::ID = 0;
+
+INITIALIZE_PASS_BEGIN(HexagonPostIncOpt, DEBUG_TYPE,
+                      "Hexagon Post-Inc-Opt Pass", false, false)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_END(HexagonPostIncOpt, DEBUG_TYPE, "Hexagon Post-Inc-Opt Pass",
+                    false, false)
+
+/// Return true if MIA dominates MIB.
+static bool dominates(MachineInstr *MIA, MachineInstr *MIB) {
+  if (MIA->getParent() != MIB->getParent())
+    return false; // Don't know since machine dominator tree is out of date.
+
+  MachineBasicBlock *MBB = MIA->getParent();
+  MachineBasicBlock::iterator I = MBB->instr_begin();
+  // Iterate over the basic block until MIA or MIB is found.
+  for (; &*I != MIA && &*I != MIB; ++I)
+    ;
+
+  // MIA dominates MIB if MIA is found first.
+  return &*I == MIA;
+}
+
+// Return the Phi register value that comes from the loop block.
+static unsigned getLoopPhiReg(MachineInstr *Phi, MachineBasicBlock *LoopBB) {
+  for (unsigned i = 1, e = Phi->getNumOperands(); i != e; i += 2)
+    if (Phi->getOperand(i + 1).getMBB() == LoopBB)
+      return Phi->getOperand(i).getReg();
+  return UINT_MAX;
+}
+
+static bool isAddWithImmValue(const MachineInstr &MI) {
+  // FIXME: For now, only deal with adds that have strict immediate values.
+  // Some A2_addi instructions can be of the form.
+  // %338:intregs = A2_addi %7:intregs, @_ZL7phs_tbl + 16
+  return MI.getOpcode() == Hexagon::A2_addi && MI.getOperand(2).isImm();
+}
+
+// Compute the number of 'real' instructions in the basic block by
+// ignoring terminators.
+static unsigned getBasicBlockSize(MachineBasicBlock &MBB) {
+  unsigned size = 0;
+  for (auto &I : make_range(MBB.begin(), MBB.getFirstTerminator()))
+    if (!I.isDebugInstr())
+      size++;
+  return size;
+}
+
+// Setup Post increment Schedule DAG.
+static void initPISchedDAG(HexagonPostIncOptSchedDAG &PIDAG,
+                           MachineBasicBlock &MBB) {
+  PIDAG.startBlock(&MBB);
+  PIDAG.enterRegion(&MBB, MBB.begin(), MBB.getFirstTerminator(),
+                    getBasicBlockSize(MBB));
+  // Build the graph.
+  PIDAG.schedule();
+  // exitRegion() is an empty function in base class. So, safe to call it here.
+  PIDAG.exitRegion();
+}
+
+// Check if post-increment candidate has any memory dependence on any
+// instruction in the chain.
+static bool hasMemoryDependency(SUnit *PostIncSU,
+                                SmallVector<MachineInstr *, 4> &UseList) {
+
+  // FIXME: Fine tune the order dependence. Probably can only consider memory
+  // related OrderKind.
+  for (auto &Dep : PostIncSU->Succs)
+    if (Dep.getKind() == SDep::Order)
+      if (std::find(UseList.begin(), UseList.end(),
+                    Dep.getSUnit()->getInstr()) != UseList.end())
+        return true;
+
+  return false;
+}
+
+// Fold an add with immediate into either an add or a load or a store.
+void HexagonPostIncOpt::foldAdds(MachineBasicBlock &MBB) const {
+  LLVM_DEBUG(dbgs() << "#Fold add instructions in this block.\n");
+  for (auto &MI : make_range(MBB.getFirstNonPHI(), MBB.getFirstTerminator())) {
+    if (!isAddWithImmValue(MI))
+      continue;
+    unsigned DefReg = MI.getOperand(0).getReg();
+    unsigned AddReg = MI.getOperand(1).getReg();
+    int64_t AddImm = MI.getOperand(2).getImm();
+
+    SmallVector<MachineInstr *, 4> UseList;
+    // Gather the uses of add instruction's def reg.
+    for (auto &MO : make_range(MRI->use_begin(DefReg), MRI->use_end())) {
+      MachineInstr *UseMI = MO.getParent();
+      // Deal with only the instuctions that belong to this block.
+      // If we cross this block, the generation of post-increment logic
+      // will not be able to transform to post-inc due to dominance.
+      if (UseMI->getParent() == &MBB)
+        UseList.push_back(UseMI);
+    }
+
+    if (UseList.empty())
+      continue;
+
+    LLVM_DEBUG({
+      dbgs() << "Current instruction considered for folding \n";
+      MI.dump();
+    });
+
+    for (auto UseMI : UseList) {
+      if (isAddWithImmValue(*UseMI)) {
+        int64_t NewImm = AddImm + UseMI->getOperand(2).getImm();
+        // Fold if the new immediate is with in the range.
+        if (HII->isValidOffset(UseMI->getOpcode(), NewImm, TRI, false)) {
+          LLVM_DEBUG({
+            UseMI->dump();
+            dbgs() << "\t is folded in to \n";
+          });
+          UseMI->getOperand(1).setReg(AddReg);
+          UseMI->getOperand(2).setImm(NewImm);
+          LLVM_DEBUG(UseMI->dump());
+        }
+      } else if (HII->isBaseImmOffset(*UseMI)) {
+        LLVM_DEBUG({
+          UseMI->dump();
+          dbgs() << "\t is folded in to \n";
+        });
+        updateBaseAndOffset(*UseMI, MI);
+        LLVM_DEBUG(UseMI->dump());
+      }
+      LLVM_DEBUG(dbgs() << "\n");
+    }
+  }
+  removeDeadInstructions(MBB);
+  LLVM_DEBUG(dbgs() << "#End of the fold instructions logic.\n");
+}
+
+void HexagonPostIncOpt::updateBaseAndOffset(MachineInstr &MI,
+                                            MachineInstr &AddMI) const {
+  assert(HII->isBaseImmOffset(MI));
+  unsigned BasePos, OffsetPos;
+  if (!HII->getBaseAndOffsetPosition(MI, BasePos, OffsetPos))
+    return;
+
+  MachineOperand &OffsetOp = MI.getOperand(OffsetPos);
+  MachineOperand &BaseOp = MI.getOperand(BasePos);
+
+  if (BaseOp.getReg() != AddMI.getOperand(0).getReg())
+    return;
+
+  unsigned IncBase = AddMI.getOperand(1).getReg();
+  int64_t IncValue = AddMI.getOperand(2).getImm();
+
+  int64_t NewOffset = OffsetOp.getImm() + IncValue;
+  if (!isValidOffset(MI, NewOffset))
+    return;
+
+  OffsetOp.setImm(NewOffset);
+  BaseOp.setReg(IncBase);
+}
+
+void HexagonPostIncOpt::removeDeadInstructions(MachineBasicBlock &MBB) const {
+  // For MBB, check that the value defined by each instruction is used.
+  // If not, delete it.
+  for (MachineBasicBlock::reverse_instr_iterator MI = MBB.instr_rbegin(),
+                                                 ME = MBB.instr_rend();
+       MI != ME;) {
+    // From DeadMachineInstructionElem. Don't delete inline assembly.
+    if (MI->isInlineAsm()) {
+      ++MI;
+      continue;
+    }
+    bool SawStore = false;
+    // Check if it's safe to remove the instruction due to side effects.
+    if (!MI->isSafeToMove(nullptr, SawStore)) {
+      ++MI;
+      continue;
+    }
+    unsigned Uses = 0;
+    for (MachineInstr::mop_iterator MOI = MI->operands_begin(),
+                                    MOE = MI->operands_end();
+         MOI != MOE; ++MOI) {
+      if (!MOI->isReg() || !MOI->isDef())
+        continue;
+      unsigned reg = MOI->getReg();
+      // Assume physical registers are used.
+      if (Register::isPhysicalRegister(reg)) {
+        Uses++;
+        continue;
+      }
+      if (MRI->use_begin(reg) != MRI->use_end())
+        Uses++;
+    }
+    if (!Uses) {
+      MI++->eraseFromParent();
+      continue;
+    }
+    ++MI;
+  }
+}
+
+bool HexagonPostIncOpt::isPostIncInsn(MachineInstr &MI) const {
+  // Predicated post-increments are not yet handled. (ISel is not generating
+  // them yet). Circular buffer instructions should not be handled.
+  return (HII->isPostIncWithImmOffset(MI) && !HII->isPredicated(MI) &&
+          !HII->isCircBufferInstr(MI));
+}
+
+/// For instructions with a base and offset, return true if the new Offset
+/// is a valid value with the correct alignment.
+bool HexagonPostIncOpt::isValidOffset(const MachineInstr &MI,
+                                      int64_t Offset) const {
+  if (!HII->isValidOffset(MI.getOpcode(), Offset, TRI, false))
+    return false;
+  unsigned AlignMask = HII->getMemAccessSize(MI) - 1;
+  return (Offset & AlignMask) == 0;
+}
+
+bool HexagonPostIncOpt::isValidPostIncValue(const MachineInstr &MI,
+                                            int IncVal) const {
+  unsigned AlignMask = HII->getMemAccessSize(MI) - 1;
+  if ((IncVal & AlignMask) != 0)
+    return false;
+
+  // Number of total bits in the instruction used to encode Inc value.
+  unsigned IncBits = 4;
+  // For HVX instructions, the offset is 3.
+  if (HexagonII::isCVI(MI.getDesc()))
+    IncBits = 3;
+
+  IncBits += Log2_32(HII->getMemAccessSize(MI));
+  if (HII->getMemAccessSize(MI) > 8)
+    IncBits = 16;
+
+  int MinValidVal = -1U << (IncBits - 1);
+  int MaxValidVal = ~(-1U << (IncBits - 1));
+  return (IncVal >= MinValidVal && IncVal <= MaxValidVal);
+}
+
+void HexagonPostIncOptSchedDAG::schedule() {
+  AliasAnalysis *AA = &Pass.getAnalysis<AAResultsWrapperPass>().getAAResults();
+  buildSchedGraph(AA);
+}
+
+// Replace post-increment operations with base+offset counterpart.
+void HexagonPostIncOpt::replacePostIncWithBaseOffset(
+    MachineBasicBlock &MBB) const {
+  LLVM_DEBUG(dbgs() << "#Replacing post-increment instructions with "
+                       "base+offset counterparts.\n");
+
+  SmallVector<MachineInstr *, 4> MIList;
+  for (auto &MI : make_range(MBB.getFirstNonPHI(), MBB.getFirstTerminator())) {
+    // Check for eligible post-inc candidates.
+    if (!isPostIncInsn(MI))
+      continue;
+    MIList.push_back(&MI);
+  }
+
+  for (auto MI : MIList)
+    replacePostIncWithBaseOffset(*MI);
+
+  LLVM_DEBUG(dbgs() << "#Done with replacing post-increment instructions.\n");
+}
+
+void HexagonPostIncOpt::replacePostIncWithBaseOffset(MachineInstr &MI) const {
+  short NewOpcode = HII->changeAddrMode_pi_io(MI.getOpcode());
+  if (NewOpcode < 0)
+    return;
+
+  unsigned BasePos = 0, OffsetPos = 0;
+  if (!HII->getBaseAndOffsetPosition(MI, BasePos, OffsetPos))
+    return;
+  const MachineOperand &PostIncOffset = MI.getOperand(OffsetPos);
+  const MachineOperand &PostIncBase = MI.getOperand(BasePos);
+
+  MachineBasicBlock &MBB = *MI.getParent();
+  DebugLoc DL = MI.getDebugLoc();
+  MachineOperand *PostIncDest;
+  MachineInstrBuilder MIB;
+  if (MI.mayLoad()) {
+    PostIncDest = &MI.getOperand(1);
+    const MachineOperand &LDValue = MI.getOperand(0);
+    MIB = BuildMI(MBB, MI, DL, HII->get(NewOpcode));
+    MIB.add(LDValue).add(PostIncBase).addImm(0);
+  } else {
+    PostIncDest = &MI.getOperand(0);
+    const MachineOperand &STValue = MI.getOperand(3);
+    MIB = BuildMI(MBB, MI, DL, HII->get(NewOpcode));
+    MIB.add(PostIncBase).addImm(0).add(STValue);
+  }
+
+  // Transfer memoperands.
+  MIB->cloneMemRefs(*MBB.getParent(), MI);
+
+  // Create an add instruction for the post-inc addition of offset.
+  MachineInstrBuilder MIBA = BuildMI(MBB, MI, DL, HII->get(Hexagon::A2_addi));
+  MIBA.add(*PostIncDest).add(PostIncBase).add(PostIncOffset);
+
+  LLVM_DEBUG({
+    dbgs() << "\n";
+    MI.dump();
+    dbgs() << "\tis tranformed to \n";
+    MIB->dump();
+    MIBA->dump();
+    dbgs() << "\n\n";
+  });
+
+  MI.eraseFromParent();
+}
+
+void HexagonPostIncOpt::generatePostInc(MachineBasicBlock &MBB) {
+  LLVM_DEBUG(dbgs() << "# Generate Post-inc and update uses if needed.\n");
+  MachineBasicBlock::iterator MII = MBB.getFirstNonPHI();
+  MachineBasicBlock::iterator MIE = MBB.instr_begin();
+  bool isOK = true;
+  while (MII != MIE) {
+    MachineInstr *Phi = &*std::prev(MII);
+    MII = std::prev(MII);
+    unsigned LoopVal = getLoopPhiReg(Phi, &MBB);
+    if (LoopVal == UINT_MAX)
+      continue;
+    MachineInstr *LoopInst = MRI->getVRegDef(LoopVal);
+    if (!isAddWithImmValue(*LoopInst))
+      continue;
+
+    if (LoopInst->getOpcode() != Hexagon::A2_addi)
+      continue;
+
+    unsigned AddReg = LoopInst->getOperand(1).getReg();
+    int64_t AddImm = LoopInst->getOperand(2).getImm();
+    SmallVector<MachineInstr *, 4> UseList;
+    MachineInstr *PostIncCandidate = nullptr;
+
+    // Find the probable candidates for Post-increment instruction.
+    SmallVector<MachineInstr *, 4> CandList;
+    for (auto &MO : make_range(MRI->use_begin(AddReg), MRI->use_end())) {
+      MachineInstr *UseMI = MO.getParent();
+
+      if (UseMI == LoopInst)
+        continue;
+
+      if (!dominates(UseMI, LoopInst)) {
+        isOK = false;
+        break;
+      }
+      const MachineOperand *BaseOp = nullptr;
+      int64_t Offset;
+      bool OffsetIsScalable;
+      if (!HII->isBaseImmOffset(*UseMI) ||
+          !HII->getMemOperandWithOffset(*UseMI, BaseOp, Offset,
+                                        OffsetIsScalable, TRI)) {
+        isOK = false;
+        break;
+      }
+      int64_t NewOffset = Offset - AddImm;
+      if (!isValidOffset(*UseMI, NewOffset) || !BaseOp->isReg() ||
+          BaseOp->getReg() != AddReg) {
+        isOK = false;
+        break;
+      }
+      if (OffsetIsScalable) {
+        isOK = false;
+        break;
+      }
+      if (Offset == 0) {
+        // If you have stores in the chain, make sure they are in the beginning
+        // of the list. Eg: LD, LD, ST, ST will end up as LD, LD, PostInc_ST,
+        // ST.
+        if (UseMI->mayStore() && PreferPostIncStore)
+          CandList.insert(CandList.begin(), UseMI);
+        else
+          CandList.push_back(UseMI);
+        continue;
+      }
+      UseList.push_back(UseMI);
+    }
+
+    if (!isOK)
+      continue;
+
+    for (auto MI : CandList) {
+      if (!PostIncCandidate)
+        PostIncCandidate = MI;
+      // Push the rest of the list for updation.
+      else
+        UseList.push_back(MI);
+    }
+
+    // If a candidate is found, replace it with the post-inc instruction.
+    // Also, adjust offset for other uses as needed.
+    if (!PostIncCandidate || !canReplaceWithPostInc(PostIncCandidate, LoopInst))
+      continue;
+
+    // Logic to determine what the base register to be.
+    // There are two choices:
+    //   1. New address register after we updated the post-increment candidate.
+    //      v2,v3 = post_load v1, 4
+    //      v3 is the choice here.
+    //   2. The base register we used in post-increment candidate.
+    //      v2,v3 = post_load v1, 4
+    //      v1 is the choice here.
+    // Use v3  if there is a memory dependence between post-inc instruction and
+    // any other instruction in the chain.
+    // FIXME: We can do some complex DAG analysis based off height and depth and
+    // selectively update other instructions in the chain. Use v3 if there are
+    // more instructions in the chain, otherwise we will end up increasing the
+    // height of the DAG resulting in more spills. By default we have a
+    // threshold controlled by the option "post-inc-chain-threshold" which is
+    // set to 4. v1 is preferred as we can packetize two memory operations in a
+    // single packet in scalar core. But it heavily depends on the structure of
+    // DAG.
+    bool UpdateBaseToNew = false;
+
+    // Do not bother to build a DAG and analyze if the Use list is empty.
+    if (!UseList.empty()) {
+      MachineFunction *MF = MBB.getParent();
+      // Setup the Post-inc schedule DAG.
+      HexagonPostIncOptSchedDAG PIDAG(*this, *MF, MLI);
+      initPISchedDAG(PIDAG, MBB);
+      SUnit *SU = PIDAG.getSUnit(PostIncCandidate);
+      if (hasMemoryDependency(SU, UseList) ||
+          UseList.size() >= PostIncChainThreshold)
+        UpdateBaseToNew = true;
+    }
+
+    if (UpdateBaseToNew) {
+      LLVM_DEBUG(dbgs() << "The heuristic determines to update the uses of the "
+                           "base register of post-increment\n");
+      for (auto UseMI : UseList) {
+        if (!dominates(PostIncCandidate, UseMI))
+          continue;
+        unsigned BasePos, OffsetPos;
+        if (HII->getBaseAndOffsetPosition(*UseMI, BasePos, OffsetPos)) {
+          // New offset has already been validated; no need to do it again.
+          LLVM_DEBUG({
+            UseMI->dump();
+            dbgs() << "\t is transformed to \n";
+          });
+          int64_t NewOffset = UseMI->getOperand(OffsetPos).getImm() - AddImm;
+          UseMI->getOperand(OffsetPos).setImm(NewOffset);
+          UseMI->getOperand(BasePos).setReg(LoopVal);
+          LLVM_DEBUG(UseMI->dump());
+        }
+      }
+    }
+    replaceWithPostInc(PostIncCandidate, LoopInst);
+  }
+  LLVM_DEBUG(dbgs() << "# End of generation of Post-inc.\n");
+}
+
+bool HexagonPostIncOpt::canReplaceWithPostInc(MachineInstr *MI,
+                                              MachineInstr *AddMI) const {
+  if (HII->changeAddrMode_io_pi(MI->getOpcode()) < 0)
+    return false;
+  assert(AddMI->getOpcode() == Hexagon::A2_addi);
+  return isValidPostIncValue(*MI, AddMI->getOperand(2).getImm());
+}
+
+void HexagonPostIncOpt::replaceWithPostInc(MachineInstr *MI,
+                                           MachineInstr *AddMI) const {
+  short NewOpcode = HII->changeAddrMode_io_pi(MI->getOpcode());
+  assert(NewOpcode >= 0 &&
+         "Couldn't change base offset to post-increment form");
+
+  MachineBasicBlock &MBB = *MI->getParent();
+  DebugLoc DL = MI->getDebugLoc();
+  const MachineOperand &IncDest = AddMI->getOperand(0);
+  const MachineOperand &IncBase = AddMI->getOperand(1);
+  const MachineOperand &IncValue = AddMI->getOperand(2);
+  MachineInstrBuilder MIB;
+  LLVM_DEBUG({
+    dbgs() << "\n\n";
+    MI->dump();
+    dbgs() << "\t is tranformed to post-inc form of \n";
+  });
+
+  if (MI->mayLoad()) {
+    const MachineOperand &LDValue = MI->getOperand(0);
+    MIB = BuildMI(MBB, *MI, DL, HII->get(NewOpcode));
+    MIB.add(LDValue).add(IncDest).add(IncBase).add(IncValue);
+  } else {
+    const MachineOperand &STValue = MI->getOperand(2);
+    MIB = BuildMI(MBB, *MI, DL, HII->get(NewOpcode));
+    MIB.add(IncDest).add(IncBase).add(IncValue).add(STValue);
+  }
+
+  // Transfer memoperands.
+  MIB->cloneMemRefs(*MBB.getParent(), *MI);
+
+  LLVM_DEBUG({
+    MIB->dump();
+    dbgs() << "As a result this add instruction is erased.\n";
+    AddMI->dump();
+  });
+
+  MI->eraseFromParent();
+  AddMI->eraseFromParent();
+}
+
+bool HexagonPostIncOpt::translatePostIncsInLoop(MachineBasicBlock &MBB) {
+  // Algorithm:
+  // 1. Replace all the post-inc instructions with Base+Offset instruction and
+  // an add instruction in this block.
+  // 2. Fold all the adds in to respective uses.
+  // 3. Generate post-increment instructions and update the uses of the base
+  // register if needed based on constraints.
+
+  replacePostIncWithBaseOffset(MBB);
+  foldAdds(MBB);
+  generatePostInc(MBB);
+  return true;
+}
+
+bool HexagonPostIncOpt::runOnMachineFunction(MachineFunction &MF) {
+
+  // Skip pass if requested.
+  if (skipFunction(MF.getFunction()))
+    return false;
+
+  // Get Target Information.
+  MLI = &getAnalysis<MachineLoopInfo>();
+  HST = &MF.getSubtarget<HexagonSubtarget>();
+  TRI = HST->getRegisterInfo();
+  MRI = &MF.getRegInfo();
+  HII = HST->getInstrInfo();
+
+  // Skip this pass for TinyCore.
+  // Tiny core allwos partial post increment operations - This constraint can
+  // be imposed inside the pass. In a chain of post-increments, the first can
+  // be post-increment, rest can be adjusted to base+offset (these are
+  // inexpensive in most of the cases);
+  if (HST->isTinyCore())
+    return false;
+
+  LLVM_DEBUG({
+    dbgs() << "Begin: Hexagon Post-Inc-Opt Pass.\n";
+    dbgs() << "Function: " << MF.getName() << "\n";
+  });
+  bool Change = false;
+  std::vector<MachineBasicBlock *> MLBB;
+  for (auto &BB : MF) {
+    // Check if this Basic Block belongs to any loop.
+    auto *LI = MLI->getLoopFor(&BB);
+    // We only deal with inner-most loops that has one block.
+    if (LI && LI->getBlocks().size() == 1) {
+      MachineBasicBlock *MBB = LI->getHeader();
+      // Do not traverse blocks that are already visited.
+      if (std::find(MLBB.begin(), MLBB.end(), MBB) != MLBB.end())
+        continue;
+
+      MLBB.push_back(MBB);
+
+      LLVM_DEBUG(dbgs() << "\n\t Basic Block: " << MBB->getName() << "\n");
+      Change |= translatePostIncsInLoop(*MBB);
+    }
+  }
+  LLVM_DEBUG(dbgs() << "End: Hexagon Post-Inc-Opt Pass\n");
+  return Change;
+}
+
+FunctionPass *llvm::createHexagonPostIncOpt() {
+  return new HexagonPostIncOpt();
+}
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
index 7d4b420..a5ebd64 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
@@ -65,6 +65,10 @@ static cl::opt<bool> EnableExpandCondsets("hexagon-expand-condsets",
                                           cl::init(true), cl::Hidden,
                                           cl::desc("Early expansion of MUX"));
 
+static cl::opt<bool> EnableTfrCleanup("hexagon-tfr-cleanup", cl::init(true),
+                                      cl::Hidden,
+                                      cl::desc("Cleanup of TFRs/COPYs"));
+
 static cl::opt<bool> EnableEarlyIf("hexagon-eif", cl::init(true), cl::Hidden,
                                    cl::desc("Enable early if-conversion"));
 
@@ -92,6 +96,10 @@ static cl::opt<bool>
 static cl::opt<bool> DisableHSDR("disable-hsdr", cl::init(false), cl::Hidden,
   cl::desc("Disable splitting double registers"));
 
+static cl::opt<bool>
+    EnableGenMemAbs("hexagon-mem-abs", cl::init(true), cl::Hidden,
+                    cl::desc("Generate absolute set instructions"));
+
 static cl::opt<bool> EnableBitSimplify("hexagon-bit", cl::init(true),
   cl::Hidden, cl::desc("Bit simplification"));
 
@@ -121,6 +129,10 @@ static cl::opt<bool> EnableInstSimplify("hexagon-instsimplify", cl::Hidden,
                                         cl::init(true),
                                         cl::desc("Enable instsimplify"));
 
+static cl::opt<bool> DisableHexagonPostIncOpt(
+    "hexagon-postinc-opt", cl::Hidden,
+    cl::desc("Disable Hexagon post-increment optimization"));
+
 /// HexagonTargetMachineModule - Note that this is used on hosts that
 /// cannot link in a library unless there are references into the
 /// library.  In particular, it seems that it is not possible to get
@@ -145,20 +157,24 @@ SchedCustomRegistry("hexagon", "Run Hexagon's custom scheduler",
 
 namespace llvm {
   extern char &HexagonExpandCondsetsID;
+  extern char &HexagonTfrCleanupID;
   void initializeHexagonBitSimplifyPass(PassRegistry&);
   void initializeHexagonConstExtendersPass(PassRegistry&);
   void initializeHexagonConstPropagationPass(PassRegistry&);
   void initializeHexagonCopyToCombinePass(PassRegistry&);
   void initializeHexagonEarlyIfConversionPass(PassRegistry&);
   void initializeHexagonExpandCondsetsPass(PassRegistry&);
+  void initializeHexagonGenMemAbsolutePass(PassRegistry &);
   void initializeHexagonGenMuxPass(PassRegistry&);
   void initializeHexagonHardwareLoopsPass(PassRegistry&);
   void initializeHexagonLoopIdiomRecognizeLegacyPassPass(PassRegistry &);
   void initializeHexagonNewValueJumpPass(PassRegistry&);
   void initializeHexagonOptAddrModePass(PassRegistry&);
   void initializeHexagonPacketizerPass(PassRegistry&);
+  void initializeHexagonPostIncOptPass(PassRegistry &);
   void initializeHexagonRDFOptPass(PassRegistry&);
   void initializeHexagonSplitDoubleRegsPass(PassRegistry&);
+  void initializeHexagonTfrCleanupPass(PassRegistry &);
   void initializeHexagonVExtractPass(PassRegistry &);
   void initializeHexagonVectorCombineLegacyPass(PassRegistry&);
   void initializeHexagonVectorLoopCarriedReuseLegacyPassPass(PassRegistry &);
@@ -177,6 +193,7 @@ namespace llvm {
   FunctionPass *createHexagonFixupHwLoops();
   FunctionPass *createHexagonGenExtract();
   FunctionPass *createHexagonGenInsert();
+  FunctionPass *createHexagonGenMemAbsolute();
   FunctionPass *createHexagonGenMux();
   FunctionPass *createHexagonGenPredicate();
   FunctionPass *createHexagonHardwareLoops();
@@ -188,10 +205,12 @@ namespace llvm {
   FunctionPass *createHexagonOptimizeSZextends();
   FunctionPass *createHexagonPacketizer(bool Minimal);
   FunctionPass *createHexagonPeephole();
+  FunctionPass *createHexagonPostIncOpt();
   FunctionPass *createHexagonRDFOpt();
   FunctionPass *createHexagonSplitConst32AndConst64();
   FunctionPass *createHexagonSplitDoubleRegs();
   FunctionPass *createHexagonStoreWidening();
+  FunctionPass *createHexagonTfrCleanup();
   FunctionPass *createHexagonVectorCombineLegacyPass();
   FunctionPass *createHexagonVectorPrint();
   FunctionPass *createHexagonVExtract();
@@ -211,12 +230,14 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeHexagonTarget() {
   initializeHexagonConstPropagationPass(PR);
   initializeHexagonCopyToCombinePass(PR);
   initializeHexagonEarlyIfConversionPass(PR);
+  initializeHexagonGenMemAbsolutePass(PR);
   initializeHexagonGenMuxPass(PR);
   initializeHexagonHardwareLoopsPass(PR);
   initializeHexagonLoopIdiomRecognizeLegacyPassPass(PR);
   initializeHexagonNewValueJumpPass(PR);
   initializeHexagonOptAddrModePass(PR);
   initializeHexagonPacketizerPass(PR);
+  initializeHexagonPostIncOptPass(PR);
   initializeHexagonRDFOptPass(PR);
   initializeHexagonSplitDoubleRegsPass(PR);
   initializeHexagonVectorCombineLegacyPass(PR);
@@ -244,6 +265,8 @@ HexagonTargetMachine::HexagonTargetMachine(const Target &T, const Triple &TT,
           (HexagonNoOpt ? CodeGenOptLevel::None : OL)),
       TLOF(std::make_unique<HexagonTargetObjectFile>()) {
   initializeHexagonExpandCondsetsPass(*PassRegistry::getPassRegistry());
+  initializeHexagonTfrCleanupPass(*PassRegistry::getPassRegistry());
+  initializeHexagonPostIncOptPass(*PassRegistry::getPassRegistry());
   initAsmInfo();
 }
 
@@ -411,11 +434,20 @@ void HexagonPassConfig::addPreRegAlloc() {
       addPass(createHexagonConstExtenders());
     if (EnableExpandCondsets)
       insertPass(&RegisterCoalescerID, &HexagonExpandCondsetsID);
+    if (EnableTfrCleanup)
+      insertPass(&VirtRegRewriterID, &HexagonTfrCleanupID);
     if (!DisableStoreWidening)
       addPass(createHexagonStoreWidening());
+    if (EnableGenMemAbs)
+      addPass(createHexagonGenMemAbsolute());
     if (!DisableHardwareLoops)
       addPass(createHexagonHardwareLoops());
   }
+
+  if (TM->getOptLevel() >= CodeGenOptLevel::Aggressive)
+    if (!DisableHexagonPostIncOpt)
+      addPass(createHexagonPostIncOpt());
+
   if (TM->getOptLevel() >= CodeGenOptLevel::Default)
     addPass(&MachinePipelinerID);
 }
diff --git a/llvm/lib/Target/Hexagon/HexagonTfrCleanup.cpp b/llvm/lib/Target/Hexagon/HexagonTfrCleanup.cpp
new file mode 100644
index 0000000..a4b359a
--- /dev/null
+++ b/llvm/lib/Target/Hexagon/HexagonTfrCleanup.cpp
@@ -0,0 +1,324 @@
+//===------- HexagonTfrCleanup.cpp - Hexagon Transfer Cleanup Pass -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// This pass is to address a situation that appears after register allocaion
+// evey now and then, namely a register copy from a source that was defined
+// as an immediate value in the same block (usually just before the copy).
+//
+// Here is an example of actual code emitted that shows this problem:
+//
+//  .LBB0_5:
+//  {
+//    r5 = zxtb(r8)
+//    r6 = or(r6, ##12345)
+//  }
+//  {
+//    r3 = xor(r1, r2)
+//    r1 = #0               <-- r1 set to #0
+//  }
+//  {
+//    r7 = r1               <-- r7 set to r1
+//    r0 = zxtb(r3)
+//  }
+
+#define DEBUG_TYPE "tfr-cleanup"
+#include "HexagonTargetMachine.h"
+
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+namespace llvm {
+FunctionPass *createHexagonTfrCleanup();
+void initializeHexagonTfrCleanupPass(PassRegistry &);
+} // namespace llvm
+
+namespace {
+class HexagonTfrCleanup : public MachineFunctionPass {
+public:
+  static char ID;
+  HexagonTfrCleanup() : MachineFunctionPass(ID), HII(0), TRI(0) {
+    PassRegistry &R = *PassRegistry::getPassRegistry();
+    initializeHexagonTfrCleanupPass(R);
+  }
+  StringRef getPassName() const override { return "Hexagon TFR Cleanup"; }
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesAll();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+  const HexagonInstrInfo *HII;
+  const TargetRegisterInfo *TRI;
+
+  typedef DenseMap<unsigned, uint64_t> ImmediateMap;
+
+  bool isIntReg(unsigned Reg, bool &Is32);
+  void setReg(unsigned R32, uint32_t V32, ImmediateMap &IMap);
+  bool getReg(unsigned Reg, uint64_t &Val, ImmediateMap &IMap);
+  bool updateImmMap(MachineInstr *MI, ImmediateMap &IMap);
+  bool rewriteIfImm(MachineInstr *MI, ImmediateMap &IMap, SlotIndexes *Indexes);
+  bool eraseIfRedundant(MachineInstr *MI, SlotIndexes *Indexes);
+};
+} // namespace
+
+char HexagonTfrCleanup::ID = 0;
+
+namespace llvm {
+char &HexagonTfrCleanupID = HexagonTfrCleanup::ID;
+}
+
+bool HexagonTfrCleanup::isIntReg(unsigned Reg, bool &Is32) {
+  Is32 = Hexagon::IntRegsRegClass.contains(Reg);
+  return Is32 || Hexagon::DoubleRegsRegClass.contains(Reg);
+}
+
+// Assign given value V32 to the specified the register R32 in the map. Only
+// 32-bit registers are valid arguments.
+void HexagonTfrCleanup::setReg(unsigned R32, uint32_t V32, ImmediateMap &IMap) {
+  ImmediateMap::iterator F = IMap.find(R32);
+  if (F == IMap.end())
+    IMap.insert(std::make_pair(R32, V32));
+  else
+    F->second = V32;
+}
+
+// Retrieve a value of the provided register Reg and store it into Val.
+// Return "true" if a value was found, "false" otherwise.
+bool HexagonTfrCleanup::getReg(unsigned Reg, uint64_t &Val,
+                               ImmediateMap &IMap) {
+  bool Is32;
+  if (!isIntReg(Reg, Is32))
+    return false;
+
+  if (Is32) {
+    ImmediateMap::iterator F = IMap.find(Reg);
+    if (F == IMap.end())
+      return false;
+    Val = F->second;
+    return true;
+  }
+
+  // For 64-bit registers, compose the value from the values of its
+  // subregisters.
+  unsigned SubL = TRI->getSubReg(Reg, Hexagon::isub_lo);
+  unsigned SubH = TRI->getSubReg(Reg, Hexagon::isub_hi);
+  ImmediateMap::iterator FL = IMap.find(SubL), FH = IMap.find(SubH);
+  if (FL == IMap.end() || FH == IMap.end())
+    return false;
+  Val = (FH->second << 32) | FL->second;
+  return true;
+}
+
+// Process an instruction and record the relevant information in the imme-
+// diate map.
+bool HexagonTfrCleanup::updateImmMap(MachineInstr *MI, ImmediateMap &IMap) {
+  using namespace Hexagon;
+
+  if (MI->isCall()) {
+    IMap.clear();
+    return true;
+  }
+
+  // If this is an instruction that loads a constant into a register,
+  // record this information in IMap.
+  unsigned Opc = MI->getOpcode();
+  if (Opc == A2_tfrsi || Opc == A2_tfrpi) {
+    unsigned DefR = MI->getOperand(0).getReg();
+    bool Is32;
+    if (!isIntReg(DefR, Is32))
+      return false;
+    if (!MI->getOperand(1).isImm()) {
+      if (!Is32) {
+        IMap.erase(TRI->getSubReg(DefR, isub_lo));
+        IMap.erase(TRI->getSubReg(DefR, isub_hi));
+      } else {
+        IMap.erase(DefR);
+      }
+      return false;
+    }
+    uint64_t Val = MI->getOperand(1).getImm();
+    // If it's a 64-bit register, break it up into subregisters.
+    if (!Is32) {
+      uint32_t VH = (Val >> 32), VL = (Val & 0xFFFFFFFFU);
+      setReg(TRI->getSubReg(DefR, isub_lo), VL, IMap);
+      setReg(TRI->getSubReg(DefR, isub_hi), VH, IMap);
+    } else {
+      setReg(DefR, Val, IMap);
+    }
+    return true;
+  }
+
+  // Not a A2_tfr[sp]i. Invalidate all modified registers in IMap.
+  for (MachineInstr::mop_iterator Mo = MI->operands_begin(),
+                                  E = MI->operands_end();
+       Mo != E; ++Mo) {
+    if (Mo->isRegMask()) {
+      IMap.clear();
+      return true;
+    }
+    if (!Mo->isReg() || !Mo->isDef())
+      continue;
+    unsigned R = Mo->getReg();
+    for (MCRegAliasIterator AR(R, TRI, true); AR.isValid(); ++AR) {
+      ImmediateMap::iterator F = IMap.find(*AR);
+      if (F != IMap.end())
+        IMap.erase(F);
+    }
+  }
+  return true;
+}
+
+// Rewrite the instruction as A2_tfrsi/A2_tfrpi, it is a copy of a source that
+// has a known constant value.
+bool HexagonTfrCleanup::rewriteIfImm(MachineInstr *MI, ImmediateMap &IMap,
+                                     SlotIndexes *Indexes) {
+  using namespace Hexagon;
+  unsigned Opc = MI->getOpcode();
+  switch (Opc) {
+  case A2_tfr:
+  case A2_tfrp:
+  case COPY:
+    break;
+  default:
+    return false;
+  }
+
+  unsigned DstR = MI->getOperand(0).getReg();
+  unsigned SrcR = MI->getOperand(1).getReg();
+  bool Tmp, Is32;
+  if (!isIntReg(DstR, Is32) || !isIntReg(SrcR, Tmp))
+    return false;
+  assert(Tmp == Is32 && "Register size mismatch");
+  uint64_t Val;
+  bool Found = getReg(SrcR, Val, IMap);
+  if (!Found)
+    return false;
+
+  MachineBasicBlock &B = *MI->getParent();
+  DebugLoc DL = MI->getDebugLoc();
+  int64_t SVal = Is32 ? int32_t(Val) : Val;
+  auto &HST = B.getParent()->getSubtarget<HexagonSubtarget>();
+  MachineInstr *NewMI;
+  if (Is32)
+    NewMI = BuildMI(B, MI, DL, HII->get(A2_tfrsi), DstR).addImm(SVal);
+  else if (isInt<8>(SVal))
+    NewMI = BuildMI(B, MI, DL, HII->get(A2_tfrpi), DstR).addImm(SVal);
+  else if (isInt<8>(SVal >> 32) && isInt<8>(int32_t(Val & 0xFFFFFFFFLL)))
+    NewMI = BuildMI(B, MI, DL, HII->get(A2_combineii), DstR)
+                .addImm(int32_t(SVal >> 32))
+                .addImm(int32_t(Val & 0xFFFFFFFFLL));
+  else if (HST.isTinyCore())
+    // Disable generating CONST64 since it requires load resource.
+    return false;
+  else
+    NewMI = BuildMI(B, MI, DL, HII->get(CONST64), DstR).addImm(Val);
+
+  // Replace the MI to reuse the same slot index
+  if (Indexes)
+    Indexes->replaceMachineInstrInMaps(*MI, *NewMI);
+  MI->eraseFromParent();
+  return true;
+}
+
+// Remove the instruction if it is a self-assignment.
+bool HexagonTfrCleanup::eraseIfRedundant(MachineInstr *MI,
+                                         SlotIndexes *Indexes) {
+  unsigned Opc = MI->getOpcode();
+  unsigned DefR, SrcR;
+  bool IsUndef = false;
+  switch (Opc) {
+  case Hexagon::A2_tfr:
+    // Rd = Rd
+    DefR = MI->getOperand(0).getReg();
+    SrcR = MI->getOperand(1).getReg();
+    IsUndef = MI->getOperand(1).isUndef();
+    break;
+  case Hexagon::A2_tfrt:
+  case Hexagon::A2_tfrf:
+    // if ([!]Pu) Rd = Rd
+    DefR = MI->getOperand(0).getReg();
+    SrcR = MI->getOperand(2).getReg();
+    IsUndef = MI->getOperand(2).isUndef();
+    break;
+  default:
+    return false;
+  }
+  if (DefR != SrcR)
+    return false;
+  if (IsUndef) {
+    MachineBasicBlock &B = *MI->getParent();
+    DebugLoc DL = MI->getDebugLoc();
+    auto DefI = BuildMI(B, MI, DL, HII->get(TargetOpcode::IMPLICIT_DEF), DefR);
+    for (auto &Op : MI->operands())
+      if (Op.isReg() && Op.isDef() && Op.isImplicit())
+        DefI->addOperand(Op);
+  }
+
+  if (Indexes)
+    Indexes->removeMachineInstrFromMaps(*MI);
+  MI->eraseFromParent();
+  return true;
+}
+
+bool HexagonTfrCleanup::runOnMachineFunction(MachineFunction &MF) {
+  bool Changed = false;
+  // Map: 32-bit register -> immediate value.
+  // 64-bit registers are stored through their subregisters.
+  ImmediateMap IMap;
+  SlotIndexes *Indexes = this->getAnalysisIfAvailable<SlotIndexes>();
+
+  auto &HST = MF.getSubtarget<HexagonSubtarget>();
+  HII = HST.getInstrInfo();
+  TRI = HST.getRegisterInfo();
+
+  for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) {
+    MachineBasicBlock &B = *I;
+    MachineBasicBlock::iterator J, F, NextJ;
+    IMap.clear();
+    bool Inserted = false, Erased = false;
+    for (J = B.begin(), F = B.end(); J != F; J = NextJ) {
+      NextJ = std::next(J);
+      MachineInstr *MI = &*J;
+      bool E = eraseIfRedundant(MI, Indexes);
+      Erased |= E;
+      if (E)
+        continue;
+      Inserted |= rewriteIfImm(MI, IMap, Indexes);
+      MachineBasicBlock::iterator NewJ = std::prev(NextJ);
+      updateImmMap(&*NewJ, IMap);
+    }
+    bool BlockC = Inserted | Erased;
+    Changed |= BlockC;
+    if (BlockC && Indexes)
+      Indexes->repairIndexesInRange(&B, B.begin(), B.end());
+  }
+
+  return Changed;
+}
+
+//===----------------------------------------------------------------------===//
+//                         Public Constructor Functions
+//===----------------------------------------------------------------------===//
+INITIALIZE_PASS(HexagonTfrCleanup, "tfr-cleanup", "Hexagon TFR Cleanup", false,
+                false)
+
+FunctionPass *llvm::createHexagonTfrCleanup() {
+  return new HexagonTfrCleanup();
+}
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
index ca98269..9840412 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
@@ -18,6 +18,7 @@
 
 #include "HexagonDepITypes.h"
 #include "MCTargetDesc/HexagonMCTargetDesc.h"
+#include "llvm/MC/MCInstrDesc.h"
 
 namespace llvm {
 
@@ -48,7 +49,7 @@ namespace HexagonII {
 
   // MCInstrDesc TSFlags
   // *** Must match HexagonInstrFormat*.td ***
-  enum {
+  enum HexagonTSFlagsVal {
     // This 7-bit field describes the insn type.
     TypePos = 0,
     TypeMask = 0x7f,
@@ -173,6 +174,11 @@ namespace HexagonII {
     hasUnaryRestrictionMask = 0x1,
   };
 
+  inline unsigned getTSFlags(const MCInstrDesc &MID, HexagonTSFlagsVal Pos,
+                             unsigned Mask) {
+    return (MID.TSFlags >> Pos) & Mask;
+  }
+
   // *** The code above must match HexagonInstrFormat*.td *** //
 
   // Hexagon specific MO operand flag mask.
@@ -275,6 +281,10 @@ namespace HexagonII {
     INST_ICLASS_ALU32_3   = 0xf0000000
   };
 
+  inline bool isCVI(const MCInstrDesc &MID) {
+    return getTSFlags(MID, isCVIPos, isCVIMask) != 0;
+  }
+
   LLVM_ATTRIBUTE_UNUSED
   static unsigned getMemAccessSizeInBytes(MemAccessSize S) {
     switch (S) {
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index ded2f25..3ff8994 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -2135,6 +2135,21 @@ bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) {
                              NVPTX::StoreRetvalI8, NVPTX::StoreRetvalI16,
                              NVPTX::StoreRetvalI32, NVPTX::StoreRetvalI64,
                              NVPTX::StoreRetvalF32, NVPTX::StoreRetvalF64);
+    if (Opcode == NVPTX::StoreRetvalI8) {
+      // Fine tune the opcode depending on the size of the operand.
+      // This helps to avoid creating redundant COPY instructions in
+      // InstrEmitter::AddRegisterOperand().
+      switch (Ops[0].getSimpleValueType().SimpleTy) {
+      default:
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::StoreRetvalI8TruncI32;
+        break;
+      case MVT::i64:
+        Opcode = NVPTX::StoreRetvalI8TruncI64;
+        break;
+      }
+    }
     break;
   case 2:
     Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
@@ -2211,6 +2226,21 @@ bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) {
                                NVPTX::StoreParamI8, NVPTX::StoreParamI16,
                                NVPTX::StoreParamI32, NVPTX::StoreParamI64,
                                NVPTX::StoreParamF32, NVPTX::StoreParamF64);
+      if (Opcode == NVPTX::StoreParamI8) {
+        // Fine tune the opcode depending on the size of the operand.
+        // This helps to avoid creating redundant COPY instructions in
+        // InstrEmitter::AddRegisterOperand().
+        switch (Ops[0].getSimpleValueType().SimpleTy) {
+        default:
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::StoreParamI8TruncI32;
+          break;
+        case MVT::i64:
+          Opcode = NVPTX::StoreParamI8TruncI64;
+          break;
+        }
+      }
       break;
     case 2:
       Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 7d2fe78..66a1010 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -47,6 +47,7 @@
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/Value.h"
+#include "llvm/Support/Alignment.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CodeGen.h"
 #include "llvm/Support/CommandLine.h"
@@ -59,6 +60,7 @@
 #include <cmath>
 #include <cstdint>
 #include <iterator>
+#include <optional>
 #include <sstream>
 #include <string>
 #include <utility>
@@ -1529,6 +1531,105 @@ Align NVPTXTargetLowering::getArgumentAlignment(const CallBase *CB, Type *Ty,
   return DL.getABITypeAlign(Ty);
 }
 
+static bool adjustElementType(EVT &ElementType) {
+  switch (ElementType.getSimpleVT().SimpleTy) {
+  default:
+    return false;
+  case MVT::f16:
+  case MVT::bf16:
+    ElementType = MVT::i16;
+    return true;
+  case MVT::f32:
+  case MVT::v2f16:
+  case MVT::v2bf16:
+    ElementType = MVT::i32;
+    return true;
+  case MVT::f64:
+    ElementType = MVT::i64;
+    return true;
+  }
+}
+
+// Use byte-store when the param address of the argument value is unaligned.
+// This may happen when the return value is a field of a packed structure.
+//
+// This is called in LowerCall() when passing the param values.
+static SDValue LowerUnalignedStoreParam(SelectionDAG &DAG, SDValue Chain,
+                                        uint64_t Offset, EVT ElementType,
+                                        SDValue StVal, SDValue &InGlue,
+                                        unsigned ArgID, const SDLoc &dl) {
+  // Bit logic only works on integer types
+  if (adjustElementType(ElementType))
+    StVal = DAG.getNode(ISD::BITCAST, dl, ElementType, StVal);
+
+  // Store each byte
+  SDVTList StoreVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+  for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) {
+    // Shift the byte to the last byte position
+    SDValue ShiftVal = DAG.getNode(ISD::SRL, dl, ElementType, StVal,
+                                   DAG.getConstant(i * 8, dl, MVT::i32));
+    SDValue StoreOperands[] = {Chain, DAG.getConstant(ArgID, dl, MVT::i32),
+                               DAG.getConstant(Offset + i, dl, MVT::i32),
+                               ShiftVal, InGlue};
+    // Trunc store only the last byte by using
+    //     st.param.b8
+    // The register type can be larger than b8.
+    Chain = DAG.getMemIntrinsicNode(
+        NVPTXISD::StoreParam, dl, StoreVTs, StoreOperands, MVT::i8,
+        MachinePointerInfo(), Align(1), MachineMemOperand::MOStore);
+    InGlue = Chain.getValue(1);
+  }
+  return Chain;
+}
+
+// Use byte-load when the param adress of the returned value is unaligned.
+// This may happen when the returned value is a field of a packed structure.
+static SDValue
+LowerUnalignedLoadRetParam(SelectionDAG &DAG, SDValue &Chain, uint64_t Offset,
+                           EVT ElementType, SDValue &InGlue,
+                           SmallVectorImpl<SDValue> &TempProxyRegOps,
+                           const SDLoc &dl) {
+  // Bit logic only works on integer types
+  EVT MergedType = ElementType;
+  adjustElementType(MergedType);
+
+  // Load each byte and construct the whole value. Initial value to 0
+  SDValue RetVal = DAG.getConstant(0, dl, MergedType);
+  // LoadParamMemI8 loads into i16 register only
+  SDVTList LoadVTs = DAG.getVTList(MVT::i16, MVT::Other, MVT::Glue);
+  for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) {
+    SDValue LoadOperands[] = {Chain, DAG.getConstant(1, dl, MVT::i32),
+                              DAG.getConstant(Offset + i, dl, MVT::i32),
+                              InGlue};
+    // This will be selected to LoadParamMemI8
+    SDValue LdVal =
+        DAG.getMemIntrinsicNode(NVPTXISD::LoadParam, dl, LoadVTs, LoadOperands,
+                                MVT::i8, MachinePointerInfo(), Align(1));
+    SDValue TmpLdVal = LdVal.getValue(0);
+    Chain = LdVal.getValue(1);
+    InGlue = LdVal.getValue(2);
+
+    TmpLdVal = DAG.getNode(NVPTXISD::ProxyReg, dl,
+                           TmpLdVal.getSimpleValueType(), TmpLdVal);
+    TempProxyRegOps.push_back(TmpLdVal);
+
+    SDValue CMask = DAG.getConstant(255, dl, MergedType);
+    SDValue CShift = DAG.getConstant(i * 8, dl, MVT::i32);
+    // Need to extend the i16 register to the whole width.
+    TmpLdVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MergedType, TmpLdVal);
+    // Mask off the high bits. Leave only the lower 8bits.
+    // Do this because we are using loadparam.b8.
+    TmpLdVal = DAG.getNode(ISD::AND, dl, MergedType, TmpLdVal, CMask);
+    // Shift and merge
+    TmpLdVal = DAG.getNode(ISD::SHL, dl, MergedType, TmpLdVal, CShift);
+    RetVal = DAG.getNode(ISD::OR, dl, MergedType, RetVal, TmpLdVal);
+  }
+  if (ElementType != MergedType)
+    RetVal = DAG.getNode(ISD::BITCAST, dl, ElementType, RetVal);
+
+  return RetVal;
+}
+
 SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                                        SmallVectorImpl<SDValue> &InVals) const {
 
@@ -1680,17 +1781,6 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
       if (NeedAlign)
         PartAlign = commonAlignment(ArgAlign, CurOffset);
 
-      // New store.
-      if (VectorInfo[j] & PVF_FIRST) {
-        assert(StoreOperands.empty() && "Unfinished preceding store.");
-        StoreOperands.push_back(Chain);
-        StoreOperands.push_back(
-            DAG.getConstant(IsVAArg ? FirstVAArg : ParamCount, dl, MVT::i32));
-        StoreOperands.push_back(DAG.getConstant(
-            IsByVal ? CurOffset + VAOffset : (IsVAArg ? VAOffset : CurOffset),
-            dl, MVT::i32));
-      }
-
       SDValue StVal = OutVals[OIdx];
 
       MVT PromotedVT;
@@ -1723,6 +1813,35 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
         StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal);
       }
 
+      // If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a
+      // scalar store. In such cases, fall back to byte stores.
+      if (VectorInfo[j] == PVF_SCALAR && !IsVAArg && PartAlign.has_value() &&
+          PartAlign.value() <
+              DL.getABITypeAlign(EltVT.getTypeForEVT(*DAG.getContext()))) {
+        assert(StoreOperands.empty() && "Unfinished preceeding store.");
+        Chain = LowerUnalignedStoreParam(
+            DAG, Chain, IsByVal ? CurOffset + VAOffset : CurOffset, EltVT,
+            StVal, InGlue, ParamCount, dl);
+
+        // LowerUnalignedStoreParam took care of inserting the necessary nodes
+        // into the SDAG, so just move on to the next element.
+        if (!IsByVal)
+          ++OIdx;
+        continue;
+      }
+
+      // New store.
+      if (VectorInfo[j] & PVF_FIRST) {
+        assert(StoreOperands.empty() && "Unfinished preceding store.");
+        StoreOperands.push_back(Chain);
+        StoreOperands.push_back(
+            DAG.getConstant(IsVAArg ? FirstVAArg : ParamCount, dl, MVT::i32));
+
+        StoreOperands.push_back(DAG.getConstant(
+            IsByVal ? CurOffset + VAOffset : (IsVAArg ? VAOffset : CurOffset),
+            dl, MVT::i32));
+      }
+
       // Record the value to store.
       StoreOperands.push_back(StVal);
 
@@ -1923,6 +2042,14 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
 
   SmallVector<SDValue, 16> ProxyRegOps;
   SmallVector<std::optional<MVT>, 16> ProxyRegTruncates;
+  // An item of the vector is filled if the element does not need a ProxyReg
+  // operation on it and should be added to InVals as is. ProxyRegOps and
+  // ProxyRegTruncates contain empty/none items at the same index.
+  SmallVector<SDValue, 16> RetElts;
+  // A temporary ProxyReg operations inserted in `LowerUnalignedLoadRetParam()`
+  // to use the values of `LoadParam`s and to be replaced later then
+  // `CALLSEQ_END` is added.
+  SmallVector<SDValue, 16> TempProxyRegOps;
 
   // Generate loads from param memory/moves from registers for result
   if (Ins.size() > 0) {
@@ -1966,6 +2093,22 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
         EltType = MVT::i16;
       }
 
+      // If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a
+      // scalar load. In such cases, fall back to byte loads.
+      if (VectorInfo[i] == PVF_SCALAR && RetTy->isAggregateType() &&
+          EltAlign < DL.getABITypeAlign(
+                         TheLoadType.getTypeForEVT(*DAG.getContext()))) {
+        assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list.");
+        SDValue Ret = LowerUnalignedLoadRetParam(
+            DAG, Chain, Offsets[i], TheLoadType, InGlue, TempProxyRegOps, dl);
+        ProxyRegOps.push_back(SDValue());
+        ProxyRegTruncates.push_back(std::optional<MVT>());
+        RetElts.resize(i);
+        RetElts.push_back(Ret);
+
+        continue;
+      }
+
       // Record index of the very first element of the vector.
       if (VectorInfo[i] & PVF_FIRST) {
         assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list.");
@@ -2028,6 +2171,11 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   // will not get lost. Otherwise, during libcalls expansion, the nodes can become
   // dangling.
   for (unsigned i = 0; i < ProxyRegOps.size(); ++i) {
+    if (i < RetElts.size() && RetElts[i]) {
+      InVals.push_back(RetElts[i]);
+      continue;
+    }
+
     SDValue Ret = DAG.getNode(
       NVPTXISD::ProxyReg, dl,
       DAG.getVTList(ProxyRegOps[i].getSimpleValueType(), MVT::Other, MVT::Glue),
@@ -2044,6 +2192,18 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
     InVals.push_back(Ret);
   }
 
+  for (SDValue &T : TempProxyRegOps) {
+    SDValue Repl = DAG.getNode(
+        NVPTXISD::ProxyReg, dl,
+        DAG.getVTList(T.getSimpleValueType(), MVT::Other, MVT::Glue),
+        {Chain, T.getOperand(0), InGlue});
+    DAG.ReplaceAllUsesWith(T, Repl);
+    DAG.RemoveDeadNode(T.getNode());
+
+    Chain = Repl.getValue(1);
+    InGlue = Repl.getValue(2);
+  }
+
   // set isTailCall to false for now, until we figure out how to express
   // tail call optimization in PTX
   isTailCall = false;
@@ -3045,9 +3205,20 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
                           DAG.getConstant(Offsets[VecIdx], dl, PtrVT));
           Value *srcValue = Constant::getNullValue(PointerType::get(
               EltVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM));
+
+          const MaybeAlign PartAlign = [&]() -> MaybeAlign {
+            if (aggregateIsPacked)
+              return Align(1);
+            if (NumElts != 1)
+              return std::nullopt;
+            Align PartAlign =
+                (Offsets[parti] == 0 && PAL.getParamAlignment(i))
+                    ? PAL.getParamAlignment(i).value()
+                    : DL.getABITypeAlign(EltVT.getTypeForEVT(F->getContext()));
+            return commonAlignment(PartAlign, Offsets[parti]);
+          }();
           SDValue P = DAG.getLoad(VecVT, dl, Root, VecAddr,
-                                  MachinePointerInfo(srcValue),
-                                  MaybeAlign(aggregateIsPacked ? 1 : 0),
+                                  MachinePointerInfo(srcValue), PartAlign,
                                   MachineMemOperand::MODereferenceable |
                                       MachineMemOperand::MOInvariant);
           if (P.getNode())
@@ -3113,6 +3284,33 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
   return Chain;
 }
 
+// Use byte-store when the param adress of the return value is unaligned.
+// This may happen when the return value is a field of a packed structure.
+static SDValue LowerUnalignedStoreRet(SelectionDAG &DAG, SDValue Chain,
+                                      uint64_t Offset, EVT ElementType,
+                                      SDValue RetVal, const SDLoc &dl) {
+  // Bit logic only works on integer types
+  if (adjustElementType(ElementType))
+    RetVal = DAG.getNode(ISD::BITCAST, dl, ElementType, RetVal);
+
+  // Store each byte
+  for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) {
+    // Shift the byte to the last byte position
+    SDValue ShiftVal = DAG.getNode(ISD::SRL, dl, ElementType, RetVal,
+                                   DAG.getConstant(i * 8, dl, MVT::i32));
+    SDValue StoreOperands[] = {Chain, DAG.getConstant(Offset + i, dl, MVT::i32),
+                               ShiftVal};
+    // Trunc store only the last byte by using
+    //     st.param.b8
+    // The register type can be larger than b8.
+    Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetval, dl,
+                                    DAG.getVTList(MVT::Other), StoreOperands,
+                                    MVT::i8, MachinePointerInfo(), std::nullopt,
+                                    MachineMemOperand::MOStore);
+  }
+  return Chain;
+}
+
 SDValue
 NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                                  bool isVarArg,
@@ -3162,13 +3360,6 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
 
   SmallVector<SDValue, 6> StoreOperands;
   for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
-    // New load/store. Record chain and offset operands.
-    if (VectorInfo[i] & PVF_FIRST) {
-      assert(StoreOperands.empty() && "Orphaned operand list.");
-      StoreOperands.push_back(Chain);
-      StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32));
-    }
-
     SDValue OutVal = OutVals[i];
     SDValue RetVal = PromotedOutVals[i];
 
@@ -3182,6 +3373,32 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
       RetVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, RetVal);
     }
 
+    // If we have a PVF_SCALAR entry, it may not even be sufficiently aligned
+    // for a scalar store. In such cases, fall back to byte stores.
+    if (VectorInfo[i] == PVF_SCALAR && RetTy->isAggregateType()) {
+      EVT ElementType = ExtendIntegerRetVal ? MVT::i32 : VTs[i];
+      Align ElementTypeAlign =
+          DL.getABITypeAlign(ElementType.getTypeForEVT(RetTy->getContext()));
+      Align ElementAlign =
+          commonAlignment(DL.getABITypeAlign(RetTy), Offsets[i]);
+      if (ElementAlign < ElementTypeAlign) {
+        assert(StoreOperands.empty() && "Orphaned operand list.");
+        Chain = LowerUnalignedStoreRet(DAG, Chain, Offsets[i], ElementType,
+                                       RetVal, dl);
+
+        // The call to LowerUnalignedStoreRet inserted the necessary SDAG nodes
+        // into the graph, so just move on to the next element.
+        continue;
+      }
+    }
+
+    // New load/store. Record chain and offset operands.
+    if (VectorInfo[i] & PVF_FIRST) {
+      assert(StoreOperands.empty() && "Orphaned operand list.");
+      StoreOperands.push_back(Chain);
+      StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32));
+    }
+
     // Record the value to return.
     StoreOperands.push_back(RetVal);
 
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 55a1955..b3517ce 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -2738,6 +2738,8 @@ def StoreParamI32    : StoreParamInst<Int32Regs, ".b32">;
 
 def StoreParamI16    : StoreParamInst<Int16Regs, ".b16">;
 def StoreParamI8     : StoreParamInst<Int16Regs, ".b8">;
+def StoreParamI8TruncI32 : StoreParamInst<Int32Regs, ".b8">;
+def StoreParamI8TruncI64 : StoreParamInst<Int64Regs, ".b8">;
 def StoreParamV2I64  : StoreParamV2Inst<Int64Regs, ".b64">;
 def StoreParamV2I32  : StoreParamV2Inst<Int32Regs, ".b32">;
 def StoreParamV2I16  : StoreParamV2Inst<Int16Regs, ".b16">;
@@ -2757,6 +2759,8 @@ def StoreRetvalI64    : StoreRetvalInst<Int64Regs, ".b64">;
 def StoreRetvalI32    : StoreRetvalInst<Int32Regs, ".b32">;
 def StoreRetvalI16    : StoreRetvalInst<Int16Regs, ".b16">;
 def StoreRetvalI8     : StoreRetvalInst<Int16Regs, ".b8">;
+def StoreRetvalI8TruncI32 : StoreRetvalInst<Int32Regs, ".b8">;
+def StoreRetvalI8TruncI64 : StoreRetvalInst<Int64Regs, ".b8">;
 def StoreRetvalV2I64  : StoreRetvalV2Inst<Int64Regs, ".b64">;
 def StoreRetvalV2I32  : StoreRetvalV2Inst<Int32Regs, ".b32">;
 def StoreRetvalV2I16  : StoreRetvalV2Inst<Int16Regs, ".b16">;
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index 904f1d7..c922098 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -2062,8 +2062,10 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
     const RISCVTargetLowering &TLI = *Subtarget->getTargetLowering();
     MVT SubVecContainerVT = SubVecVT;
     // Establish the correct scalable-vector types for any fixed-length type.
-    if (SubVecVT.isFixedLengthVector())
+    if (SubVecVT.isFixedLengthVector()) {
+      assert(Idx == 0 && V.isUndef());
       SubVecContainerVT = TLI.getContainerForFixedLengthVector(SubVecVT);
+    }
     if (VT.isFixedLengthVector())
       VT = TLI.getContainerForFixedLengthVector(VT);
 
@@ -2115,8 +2117,10 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
     const RISCVTargetLowering &TLI = *Subtarget->getTargetLowering();
     MVT SubVecContainerVT = VT;
     // Establish the correct scalable-vector types for any fixed-length type.
-    if (VT.isFixedLengthVector())
+    if (VT.isFixedLengthVector()) {
+      assert(Idx == 0);
       SubVecContainerVT = TLI.getContainerForFixedLengthVector(VT);
+    }
     if (InVT.isFixedLengthVector())
       InVT = TLI.getContainerForFixedLengthVector(InVT);
 
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index f7275eb..540c2e7 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -691,7 +691,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
         ISD::VP_FP_TO_UINT,  ISD::VP_SETCC,       ISD::VP_SIGN_EXTEND,
         ISD::VP_ZERO_EXTEND, ISD::VP_TRUNCATE,    ISD::VP_SMIN,
         ISD::VP_SMAX,        ISD::VP_UMIN,        ISD::VP_UMAX,
-        ISD::VP_ABS, ISD::EXPERIMENTAL_VP_REVERSE, ISD::EXPERIMENTAL_VP_SPLICE};
+        ISD::VP_ABS, ISD::EXPERIMENTAL_VP_REVERSE, ISD::EXPERIMENTAL_VP_SPLICE,
+        ISD::VP_SADDSAT,     ISD::VP_UADDSAT,     ISD::VP_SSUBSAT,
+        ISD::VP_USUBSAT};
 
     static const unsigned FloatingPointVPOps[] = {
         ISD::VP_FADD,        ISD::VP_FSUB,        ISD::VP_FMUL,
@@ -830,7 +832,6 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
                          VT, Custom);
       setOperationAction({ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT}, VT,
                          Custom);
-      setOperationAction({ISD::LRINT, ISD::LLRINT}, VT, Custom);
       setOperationAction({ISD::AVGFLOORU, ISD::AVGCEILU, ISD::SADDSAT,
                           ISD::UADDSAT, ISD::SSUBSAT, ISD::USUBSAT},
                          VT, Legal);
@@ -956,6 +957,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
       // between vXf16 and vXf64 must be lowered as sequences which convert via
       // vXf32.
       setOperationAction({ISD::FP_ROUND, ISD::FP_EXTEND}, VT, Custom);
+      setOperationAction({ISD::LRINT, ISD::LLRINT}, VT, Custom);
       // Custom-lower insert/extract operations to simplify patterns.
       setOperationAction({ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT}, VT,
                          Custom);
@@ -3240,45 +3242,49 @@ static std::optional<uint64_t> getExactInteger(const APFloat &APF,
 // Note that this method will also match potentially unappealing index
 // sequences, like <i32 0, i32 50939494>, however it is left to the caller to
 // determine whether this is worth generating code for.
-static std::optional<VIDSequence> isSimpleVIDSequence(SDValue Op) {
-  unsigned NumElts = Op.getNumOperands();
+static std::optional<VIDSequence> isSimpleVIDSequence(SDValue Op,
+                                                      unsigned EltSizeInBits) {
   assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unexpected BUILD_VECTOR");
+  if (!cast<BuildVectorSDNode>(Op)->isConstant())
+    return std::nullopt;
   bool IsInteger = Op.getValueType().isInteger();
 
   std::optional<unsigned> SeqStepDenom;
   std::optional<int64_t> SeqStepNum, SeqAddend;
   std::optional<std::pair<uint64_t, unsigned>> PrevElt;
-  unsigned EltSizeInBits = Op.getValueType().getScalarSizeInBits();
-  for (unsigned Idx = 0; Idx < NumElts; Idx++) {
-    // Assume undef elements match the sequence; we just have to be careful
-    // when interpolating across them.
-    if (Op.getOperand(Idx).isUndef())
+  assert(EltSizeInBits >= Op.getValueType().getScalarSizeInBits());
+
+  // First extract the ops into a list of constant integer values. This may not
+  // be possible for floats if they're not all representable as integers.
+  SmallVector<std::optional<uint64_t>> Elts(Op.getNumOperands());
+  const unsigned OpSize = Op.getScalarValueSizeInBits();
+  for (auto [Idx, Elt] : enumerate(Op->op_values())) {
+    if (Elt.isUndef()) {
+      Elts[Idx] = std::nullopt;
       continue;
-
-    uint64_t Val;
+    }
     if (IsInteger) {
-      // The BUILD_VECTOR must be all constants.
-      if (!isa<ConstantSDNode>(Op.getOperand(Idx)))
-        return std::nullopt;
-      Val = Op.getConstantOperandVal(Idx) &
-            maskTrailingOnes<uint64_t>(EltSizeInBits);
+      Elts[Idx] = Elt->getAsZExtVal() & maskTrailingOnes<uint64_t>(OpSize);
     } else {
-      // The BUILD_VECTOR must be all constants.
-      if (!isa<ConstantFPSDNode>(Op.getOperand(Idx)))
-        return std::nullopt;
-      if (auto ExactInteger = getExactInteger(
-              cast<ConstantFPSDNode>(Op.getOperand(Idx))->getValueAPF(),
-              EltSizeInBits))
-        Val = *ExactInteger;
-      else
+      auto ExactInteger =
+          getExactInteger(cast<ConstantFPSDNode>(Elt)->getValueAPF(), OpSize);
+      if (!ExactInteger)
         return std::nullopt;
+      Elts[Idx] = *ExactInteger;
     }
+  }
+
+  for (auto [Idx, Elt] : enumerate(Elts)) {
+    // Assume undef elements match the sequence; we just have to be careful
+    // when interpolating across them.
+    if (!Elt)
+      continue;
 
     if (PrevElt) {
       // Calculate the step since the last non-undef element, and ensure
       // it's consistent across the entire sequence.
       unsigned IdxDiff = Idx - PrevElt->second;
-      int64_t ValDiff = SignExtend64(Val - PrevElt->first, EltSizeInBits);
+      int64_t ValDiff = SignExtend64(*Elt - PrevElt->first, EltSizeInBits);
 
       // A zero-value value difference means that we're somewhere in the middle
       // of a fractional step, e.g. <0,0,0*,0,1,1,1,1>. Wait until we notice a
@@ -3308,8 +3314,8 @@ static std::optional<VIDSequence> isSimpleVIDSequence(SDValue Op) {
     }
 
     // Record this non-undef element for later.
-    if (!PrevElt || PrevElt->first != Val)
-      PrevElt = std::make_pair(Val, Idx);
+    if (!PrevElt || PrevElt->first != *Elt)
+      PrevElt = std::make_pair(*Elt, Idx);
   }
 
   // We need to have logged a step for this to count as a legal index sequence.
@@ -3318,21 +3324,12 @@ static std::optional<VIDSequence> isSimpleVIDSequence(SDValue Op) {
 
   // Loop back through the sequence and validate elements we might have skipped
   // while waiting for a valid step. While doing this, log any sequence addend.
-  for (unsigned Idx = 0; Idx < NumElts; Idx++) {
-    if (Op.getOperand(Idx).isUndef())
+  for (auto [Idx, Elt] : enumerate(Elts)) {
+    if (!Elt)
       continue;
-    uint64_t Val;
-    if (IsInteger) {
-      Val = Op.getConstantOperandVal(Idx) &
-            maskTrailingOnes<uint64_t>(EltSizeInBits);
-    } else {
-      Val = *getExactInteger(
-          cast<ConstantFPSDNode>(Op.getOperand(Idx))->getValueAPF(),
-          EltSizeInBits);
-    }
     uint64_t ExpectedVal =
         (int64_t)(Idx * (uint64_t)*SeqStepNum) / *SeqStepDenom;
-    int64_t Addend = SignExtend64(Val - ExpectedVal, EltSizeInBits);
+    int64_t Addend = SignExtend64(*Elt - ExpectedVal, EltSizeInBits);
     if (!SeqAddend)
       SeqAddend = Addend;
     else if (Addend != SeqAddend)
@@ -3598,7 +3595,7 @@ static SDValue lowerBuildVectorOfConstants(SDValue Op, SelectionDAG &DAG,
   // Try and match index sequences, which we can lower to the vid instruction
   // with optional modifications. An all-undef vector is matched by
   // getSplatValue, above.
-  if (auto SimpleVID = isSimpleVIDSequence(Op)) {
+  if (auto SimpleVID = isSimpleVIDSequence(Op, Op.getScalarValueSizeInBits())) {
     int64_t StepNumerator = SimpleVID->StepNumerator;
     unsigned StepDenominator = SimpleVID->StepDenominator;
     int64_t Addend = SimpleVID->Addend;
@@ -3853,11 +3850,10 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
 
   // If we're compiling for an exact VLEN value, we can split our work per
   // register in the register group.
-  const unsigned MinVLen = Subtarget.getRealMinVLen();
-  const unsigned MaxVLen = Subtarget.getRealMaxVLen();
-  if (MinVLen == MaxVLen && VT.getSizeInBits().getKnownMinValue() > MinVLen) {
+  if (const auto VLen = Subtarget.getRealVLen();
+      VLen && VT.getSizeInBits().getKnownMinValue() > *VLen) {
     MVT ElemVT = VT.getVectorElementType();
-    unsigned ElemsPerVReg = MinVLen / ElemVT.getFixedSizeInBits();
+    unsigned ElemsPerVReg = *VLen / ElemVT.getFixedSizeInBits();
     EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT, Subtarget);
     MVT OneRegVT = MVT::getVectorVT(ElemVT, ElemsPerVReg);
     MVT M1VT = getContainerForFixedLengthVector(DAG, OneRegVT, Subtarget);
@@ -4768,9 +4764,8 @@ static SDValue lowerShuffleViaVRegSplitting(ShuffleVectorSDNode *SVN,
 
   // If we don't know exact data layout, not much we can do.  If this
   // is already m1 or smaller, no point in splitting further.
-  const unsigned MinVLen = Subtarget.getRealMinVLen();
-  const unsigned MaxVLen = Subtarget.getRealMaxVLen();
-  if (MinVLen != MaxVLen || VT.getSizeInBits().getFixedValue() <= MinVLen)
+  const auto VLen = Subtarget.getRealVLen();
+  if (!VLen || VT.getSizeInBits().getFixedValue() <= *VLen)
     return SDValue();
 
   // Avoid picking up bitrotate patterns which we have a linear-in-lmul
@@ -4781,7 +4776,7 @@ static SDValue lowerShuffleViaVRegSplitting(ShuffleVectorSDNode *SVN,
     return SDValue();
 
   MVT ElemVT = VT.getVectorElementType();
-  unsigned ElemsPerVReg = MinVLen / ElemVT.getFixedSizeInBits();
+  unsigned ElemsPerVReg = *VLen / ElemVT.getFixedSizeInBits();
   unsigned VRegsPerSrc = NumElts / ElemsPerVReg;
 
   SmallVector<std::pair<int, SmallVector<int>>>
@@ -5759,6 +5754,10 @@ static unsigned getRISCVVLOp(SDValue Op) {
   VP_CASE(SINT_TO_FP) // VP_SINT_TO_FP
   VP_CASE(UINT_TO_FP) // VP_UINT_TO_FP
   VP_CASE(BITREVERSE) // VP_BITREVERSE
+  VP_CASE(SADDSAT)    // VP_SADDSAT
+  VP_CASE(UADDSAT)    // VP_UADDSAT
+  VP_CASE(SSUBSAT)    // VP_SSUBSAT
+  VP_CASE(USUBSAT)    // VP_USUBSAT
   VP_CASE(BSWAP)      // VP_BSWAP
   VP_CASE(CTLZ)       // VP_CTLZ
   VP_CASE(CTTZ)       // VP_CTTZ
@@ -6798,6 +6797,10 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
   case ISD::VP_UDIV:
   case ISD::VP_SREM:
   case ISD::VP_UREM:
+  case ISD::VP_UADDSAT:
+  case ISD::VP_USUBSAT:
+  case ISD::VP_SADDSAT:
+  case ISD::VP_SSUBSAT:
     return lowerVPOp(Op, DAG);
   case ISD::VP_AND:
   case ISD::VP_OR:
@@ -7384,6 +7387,26 @@ SDValue RISCVTargetLowering::lowerSELECT(SDValue Op, SelectionDAG &DAG) const {
     if (SDValue V = combineSelectToBinOp(Op.getNode(), DAG, Subtarget))
       return V;
 
+    // (select c, c1, c2) -> (add (czero_nez c2 - c1, c), c1)
+    // (select c, c1, c2) -> (add (czero_eqz c1 - c2, c), c2)
+    if (isa<ConstantSDNode>(TrueV) && isa<ConstantSDNode>(FalseV)) {
+      const APInt &TrueVal = TrueV->getAsAPIntVal();
+      const APInt &FalseVal = FalseV->getAsAPIntVal();
+      const int TrueValCost = RISCVMatInt::getIntMatCost(
+          TrueVal, Subtarget.getXLen(), Subtarget, /*CompressionCost=*/true);
+      const int FalseValCost = RISCVMatInt::getIntMatCost(
+          FalseVal, Subtarget.getXLen(), Subtarget, /*CompressionCost=*/true);
+      bool IsCZERO_NEZ = TrueValCost <= FalseValCost;
+      SDValue LHSVal = DAG.getConstant(
+          IsCZERO_NEZ ? FalseVal - TrueVal : TrueVal - FalseVal, DL, VT);
+      SDValue RHSVal =
+          DAG.getConstant(IsCZERO_NEZ ? TrueVal : FalseVal, DL, VT);
+      SDValue CMOV =
+          DAG.getNode(IsCZERO_NEZ ? RISCVISD::CZERO_NEZ : RISCVISD::CZERO_EQZ,
+                      DL, VT, LHSVal, CondV);
+      return DAG.getNode(ISD::ADD, DL, VT, CMOV, RHSVal);
+    }
+
     // (select c, t, f) -> (or (czero_eqz t, c), (czero_nez f, c))
     // Unless we have the short forward branch optimization.
     if (!Subtarget.hasConditionalMoveFusion())
@@ -8313,15 +8336,13 @@ SDValue RISCVTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
   // constant index, we can always perform the extract in m1 (or
   // smaller) as we can determine the register corresponding to
   // the index in the register group.
-  const unsigned MinVLen = Subtarget.getRealMinVLen();
-  const unsigned MaxVLen = Subtarget.getRealMaxVLen();
+  const auto VLen = Subtarget.getRealVLen();
   if (auto *IdxC = dyn_cast<ConstantSDNode>(Idx);
-      IdxC && MinVLen == MaxVLen &&
-      VecVT.getSizeInBits().getKnownMinValue() > MinVLen) {
+      IdxC && VLen && VecVT.getSizeInBits().getKnownMinValue() > *VLen) {
     MVT M1VT = getLMUL1VT(ContainerVT);
     unsigned OrigIdx = IdxC->getZExtValue();
     EVT ElemVT = VecVT.getVectorElementType();
-    unsigned ElemsPerVReg = MinVLen / ElemVT.getFixedSizeInBits();
+    unsigned ElemsPerVReg = *VLen / ElemVT.getFixedSizeInBits();
     unsigned RemIdx = OrigIdx % ElemsPerVReg;
     unsigned SubRegIdx = OrigIdx / ElemsPerVReg;
     unsigned ExtractIdx =
@@ -9782,15 +9803,14 @@ SDValue RISCVTargetLowering::lowerEXTRACT_SUBVECTOR(SDValue Op,
   if (OrigIdx == 0)
     return Op;
 
-  const unsigned MinVLen = Subtarget.getRealMinVLen();
-  const unsigned MaxVLen = Subtarget.getRealMaxVLen();
+  const auto VLen = Subtarget.getRealVLen();
 
   // If the subvector vector is a fixed-length type and we don't know VLEN
   // exactly, we cannot use subregister manipulation to simplify the codegen; we
   // don't know which register of a LMUL group contains the specific subvector
   // as we only know the minimum register size. Therefore we must slide the
   // vector group down the full amount.
-  if (SubVecVT.isFixedLengthVector() && MinVLen != MaxVLen) {
+  if (SubVecVT.isFixedLengthVector() && !VLen) {
     MVT ContainerVT = VecVT;
     if (VecVT.isFixedLengthVector()) {
       ContainerVT = getContainerForFixedLengthVector(VecVT);
@@ -9837,8 +9857,8 @@ SDValue RISCVTargetLowering::lowerEXTRACT_SUBVECTOR(SDValue Op,
   // and decomposeSubvectorInsertExtractToSubRegs takes this into account. So if
   // we have a fixed length subvector, we need to adjust the index by 1/vscale.
   if (SubVecVT.isFixedLengthVector()) {
-    assert(MinVLen == MaxVLen);
-    unsigned Vscale = MinVLen / RISCV::RVVBitsPerBlock;
+    assert(VLen);
+    unsigned Vscale = *VLen / RISCV::RVVBitsPerBlock;
     auto Decompose =
         RISCVTargetLowering::decomposeSubvectorInsertExtractToSubRegs(
             VecVT, ContainerSubVecVT, OrigIdx / Vscale, TRI);
@@ -12872,6 +12892,7 @@ static SDValue performSUBCombine(SDNode *N, SelectionDAG &DAG,
   if (SDValue V = combineSubOfBoolean(N, DAG))
     return V;
 
+  EVT VT = N->getValueType(0);
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
   // fold (sub 0, (setcc x, 0, setlt)) -> (sra x, xlen - 1)
@@ -12879,7 +12900,6 @@ static SDValue performSUBCombine(SDNode *N, SelectionDAG &DAG,
       isNullConstant(N1.getOperand(1))) {
     ISD::CondCode CCVal = cast<CondCodeSDNode>(N1.getOperand(2))->get();
     if (CCVal == ISD::SETLT) {
-      EVT VT = N->getValueType(0);
       SDLoc DL(N);
       unsigned ShAmt = N0.getValueSizeInBits() - 1;
       return DAG.getNode(ISD::SRA, DL, VT, N1.getOperand(0),
@@ -12887,6 +12907,29 @@ static SDValue performSUBCombine(SDNode *N, SelectionDAG &DAG,
     }
   }
 
+  // sub (zext, zext) -> sext (sub (zext, zext))
+  //   where the sum of the extend widths match, and the inner zexts
+  //   add at least one bit.  (For profitability on rvv, we use a
+  //   power of two for both inner and outer extend.)
+  if (VT.isVector() && Subtarget.getTargetLowering()->isTypeLegal(VT) &&
+      N0.getOpcode() == N1.getOpcode() && N0.getOpcode() == ISD::ZERO_EXTEND &&
+      N0.hasOneUse() && N1.hasOneUse()) {
+    SDValue Src0 = N0.getOperand(0);
+    SDValue Src1 = N1.getOperand(0);
+    EVT SrcVT = Src0.getValueType();
+    if (Subtarget.getTargetLowering()->isTypeLegal(SrcVT) &&
+        SrcVT == Src1.getValueType() && SrcVT.getScalarSizeInBits() >= 8 &&
+        SrcVT.getScalarSizeInBits() < VT.getScalarSizeInBits() / 2) {
+      LLVMContext &C = *DAG.getContext();
+      EVT ElemVT = VT.getVectorElementType().getHalfSizedIntegerVT(C);
+      EVT NarrowVT = EVT::getVectorVT(C, ElemVT, VT.getVectorElementCount());
+      Src0 = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(Src0), NarrowVT, Src0);
+      Src1 = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(Src1), NarrowVT, Src1);
+      return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT,
+                         DAG.getNode(ISD::SUB, SDLoc(N), NarrowVT, Src0, Src1));
+    }
+  }
+
   // fold (sub x, (select lhs, rhs, cc, 0, y)) ->
   //      (select lhs, rhs, cc, x, (sub x, y))
   return combineSelectAndUse(N, N1, N0, DAG, /*AllOnes*/ false, Subtarget);
@@ -15978,7 +16021,10 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
 
     if (Index.getOpcode() == ISD::BUILD_VECTOR &&
         MGN->getExtensionType() == ISD::NON_EXTLOAD && isTypeLegal(VT)) {
-      if (std::optional<VIDSequence> SimpleVID = isSimpleVIDSequence(Index);
+      // The sequence will be XLenVT, not the type of Index. Tell
+      // isSimpleVIDSequence this so we avoid overflow.
+      if (std::optional<VIDSequence> SimpleVID =
+              isSimpleVIDSequence(Index, Subtarget.getXLen());
           SimpleVID && SimpleVID->StepDenominator == 1) {
         const int64_t StepNumerator = SimpleVID->StepNumerator;
         const int64_t Addend = SimpleVID->Addend;
diff --git a/llvm/lib/Target/RISCV/RISCVMakeCompressible.cpp b/llvm/lib/Target/RISCV/RISCVMakeCompressible.cpp
index ff21fe1..af864ba 100644
--- a/llvm/lib/Target/RISCV/RISCVMakeCompressible.cpp
+++ b/llvm/lib/Target/RISCV/RISCVMakeCompressible.cpp
@@ -143,19 +143,35 @@ static bool isCompressedReg(Register Reg) {
 // Return true if MI is a load for which there exists a compressed version.
 static bool isCompressibleLoad(const MachineInstr &MI) {
   const RISCVSubtarget &STI = MI.getMF()->getSubtarget<RISCVSubtarget>();
-  const unsigned Opcode = MI.getOpcode();
 
-  return Opcode == RISCV::LW || (!STI.is64Bit() && Opcode == RISCV::FLW) ||
-         Opcode == RISCV::LD || Opcode == RISCV::FLD;
+  switch (MI.getOpcode()) {
+  default:
+    return false;
+  case RISCV::LW:
+  case RISCV::LD:
+    return STI.hasStdExtCOrZca();
+  case RISCV::FLW:
+    return !STI.is64Bit() && STI.hasStdExtCOrZcfOrZce();
+  case RISCV::FLD:
+    return STI.hasStdExtCOrZcd();
+  }
 }
 
 // Return true if MI is a store for which there exists a compressed version.
 static bool isCompressibleStore(const MachineInstr &MI) {
   const RISCVSubtarget &STI = MI.getMF()->getSubtarget<RISCVSubtarget>();
-  const unsigned Opcode = MI.getOpcode();
 
-  return Opcode == RISCV::SW || (!STI.is64Bit() && Opcode == RISCV::FSW) ||
-         Opcode == RISCV::SD || Opcode == RISCV::FSD;
+  switch (MI.getOpcode()) {
+  default:
+    return false;
+  case RISCV::SW:
+  case RISCV::SD:
+    return STI.hasStdExtCOrZca();
+  case RISCV::FSW:
+    return !STI.is64Bit() && STI.hasStdExtCOrZcfOrZce();
+  case RISCV::FSD:
+    return STI.hasStdExtCOrZcd();
+  }
 }
 
 // Find a single register and/or large offset which, if compressible, would
@@ -324,8 +340,7 @@ bool RISCVMakeCompressibleOpt::runOnMachineFunction(MachineFunction &Fn) {
   const RISCVInstrInfo &TII = *STI.getInstrInfo();
 
   // This optimization only makes sense if compressed instructions are emitted.
-  // FIXME: Support Zca, Zcf, Zcd granularity.
-  if (!STI.hasStdExtC())
+  if (!STI.hasStdExtCOrZca())
     return false;
 
   for (MachineBasicBlock &MBB : Fn) {
diff --git a/llvm/lib/Target/RISCV/RISCVScheduleV.td b/llvm/lib/Target/RISCV/RISCVScheduleV.td
index d15cb61..0be681d 100644
--- a/llvm/lib/Target/RISCV/RISCVScheduleV.td
+++ b/llvm/lib/Target/RISCV/RISCVScheduleV.td
@@ -88,20 +88,25 @@ multiclass LMULWriteResMXVariant<string name, SchedPredicateBase Pred,
     let ReleaseAtCycles = noPredReleaseCycles;
   }
 
+  // Define SchedVars
+  def nameMX # PredSchedVar
+      : SchedVar<Pred, [!cast<SchedWriteRes>(NAME # nameMX # "_Pred")]>;
+  def nameMX # NoPredSchedVar
+      : SchedVar<NoSchedPred, [!cast<SchedWriteRes>(NAME # nameMX #"_NoPred")]>;
+  // Allow multiclass to refer to SchedVars -- need to have NAME prefix.
+  defvar PredSchedVar = !cast<SchedVar>(NAME # nameMX # PredSchedVar);
+  defvar NoPredSchedVar = !cast<SchedVar>(NAME # nameMX # NoPredSchedVar);
+
   // Tie behavior to predicate
-  def NAME # nameMX # "_Variant" : SchedWriteVariant<[
-    SchedVar<Pred, [!cast<SchedWriteRes>(NAME # nameMX # "_Pred")]>,
-    SchedVar<NoSchedPred, [!cast<SchedWriteRes>(NAME # nameMX # "_NoPred")]>
-  ]>;
+  def NAME # nameMX # "_Variant"
+      : SchedWriteVariant<[PredSchedVar, NoPredSchedVar]>;
   def : SchedAlias<
     !cast<SchedReadWrite>(nameMX),
     !cast<SchedReadWrite>(NAME # nameMX # "_Variant")>;
 
   if IsWorstCase then {
-    def NAME # name # "_WorstCase_Variant" : SchedWriteVariant<[
-      SchedVar<Pred, [!cast<SchedWriteRes>(NAME # nameMX # "_Pred")]>,
-      SchedVar<NoSchedPred, [!cast<SchedWriteRes>(NAME # nameMX # "_NoPred")]>
-    ]>;
+    def NAME # name # "_WorstCase_Variant"
+      : SchedWriteVariant<[PredSchedVar, NoPredSchedVar]>;
     def : SchedAlias<
       !cast<SchedReadWrite>(name # "_WorstCase"),
       !cast<SchedReadWrite>(NAME # name # "_WorstCase_Variant")>;
diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h
index 4b60d7a..9ebf278 100644
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.h
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h
@@ -143,6 +143,10 @@ public:
 #include "RISCVGenSubtargetInfo.inc"
 
   bool hasStdExtCOrZca() const { return HasStdExtC || HasStdExtZca; }
+  bool hasStdExtCOrZcd() const { return HasStdExtC || HasStdExtZcd; }
+  bool hasStdExtCOrZcfOrZce() const {
+    return HasStdExtC || HasStdExtZcf || HasStdExtZce;
+  }
   bool hasStdExtZvl() const { return ZvlLen != 0; }
   bool hasStdExtFOrZfinx() const { return HasStdExtF || HasStdExtZfinx; }
   bool hasStdExtDOrZdinx() const { return HasStdExtD || HasStdExtZdinx; }
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index adef40e..3e20e45 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -84,7 +84,7 @@ static cl::opt<bool> EnableRISCVDeadRegisterElimination(
 static cl::opt<bool>
     EnableSinkFold("riscv-enable-sink-fold",
                    cl::desc("Enable sinking and folding of instruction copies"),
-                   cl::init(false), cl::Hidden);
+                   cl::init(true), cl::Hidden);
 
 static cl::opt<bool>
     EnableLoopDataPrefetch("riscv-enable-loop-data-prefetch", cl::Hidden,
diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
index e6e3560..28a63b9 100644
--- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
+++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
@@ -619,7 +619,8 @@ class GroupBuiltin<string name, Op operation> {
                              !eq(operation, OpGroupNonUniformShuffleDown),
                              !eq(operation, OpGroupBroadcast),
                              !eq(operation, OpGroupNonUniformBroadcast),
-                             !eq(operation, OpGroupNonUniformBroadcastFirst));
+                             !eq(operation, OpGroupNonUniformBroadcastFirst),
+                             !eq(operation, OpGroupNonUniformRotateKHR));
   bit HasBoolArg = !or(!and(IsAllOrAny, !eq(IsAllEqual, false)), IsBallot, IsLogical);
 }
 
@@ -877,6 +878,10 @@ defm : DemangledGroupBuiltin<"group_non_uniform_scan_inclusive_logical_xors", Wo
 defm : DemangledGroupBuiltin<"group_non_uniform_scan_exclusive_logical_xors", WorkOrSub, OpGroupNonUniformLogicalXor>;
 defm : DemangledGroupBuiltin<"group_clustered_reduce_logical_xor", WorkOrSub, OpGroupNonUniformLogicalXor>;
 
+// cl_khr_subgroup_rotate / SPV_KHR_subgroup_rotate
+defm : DemangledGroupBuiltin<"group_rotate", OnlySub, OpGroupNonUniformRotateKHR>;
+defm : DemangledGroupBuiltin<"group_clustered_rotate", OnlySub, OpGroupNonUniformRotateKHR>;
+
 // cl_khr_work_group_uniform_arithmetic / SPV_KHR_uniform_group_instructions
 defm : DemangledGroupBuiltin<"group_reduce_imul", OnlyWork, OpGroupIMulKHR>;
 defm : DemangledGroupBuiltin<"group_reduce_mulu", OnlyWork, OpGroupIMulKHR>;
diff --git a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
index cc438b2..10569ef 100644
--- a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
@@ -150,7 +150,8 @@ getKernelArgTypeQual(const Function &F, unsigned ArgIdx) {
 
 static SPIRVType *getArgSPIRVType(const Function &F, unsigned ArgIdx,
                                   SPIRVGlobalRegistry *GR,
-                                  MachineIRBuilder &MIRBuilder) {
+                                  MachineIRBuilder &MIRBuilder,
+                                  const SPIRVSubtarget &ST) {
   // Read argument's access qualifier from metadata or default.
   SPIRV::AccessQualifier::AccessQualifier ArgAccessQual =
       getArgAccessQual(F, ArgIdx);
@@ -169,8 +170,8 @@ static SPIRVType *getArgSPIRVType(const Function &F, unsigned ArgIdx,
     if (MDTypeStr.ends_with("*"))
       ResArgType = GR->getOrCreateSPIRVTypeByName(
           MDTypeStr, MIRBuilder,
-          addressSpaceToStorageClass(
-              OriginalArgType->getPointerAddressSpace()));
+          addressSpaceToStorageClass(OriginalArgType->getPointerAddressSpace(),
+                                     ST));
     else if (MDTypeStr.ends_with("_t"))
       ResArgType = GR->getOrCreateSPIRVTypeByName(
           "opencl." + MDTypeStr.str(), MIRBuilder,
@@ -206,6 +207,10 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
   assert(GR && "Must initialize the SPIRV type registry before lowering args.");
   GR->setCurrentFunc(MIRBuilder.getMF());
 
+  // Get access to information about available extensions
+  const SPIRVSubtarget *ST =
+      static_cast<const SPIRVSubtarget *>(&MIRBuilder.getMF().getSubtarget());
+
   // Assign types and names to all args, and store their types for later.
   FunctionType *FTy = getOriginalFunctionType(F);
   SmallVector<SPIRVType *, 4> ArgTypeVRegs;
@@ -216,7 +221,7 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
       // TODO: handle the case of multiple registers.
       if (VRegs[i].size() > 1)
         return false;
-      auto *SpirvTy = getArgSPIRVType(F, i, GR, MIRBuilder);
+      auto *SpirvTy = getArgSPIRVType(F, i, GR, MIRBuilder, *ST);
       GR->assignSPIRVTypeToVReg(SpirvTy, VRegs[i][0], MIRBuilder.getMF());
       ArgTypeVRegs.push_back(SpirvTy);
 
@@ -318,10 +323,6 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
   if (F.hasName())
     buildOpName(FuncVReg, F.getName(), MIRBuilder);
 
-  // Get access to information about available extensions
-  const auto *ST =
-      static_cast<const SPIRVSubtarget *>(&MIRBuilder.getMF().getSubtarget());
-
   // Handle entry points and function linkage.
   if (isEntryPoint(F)) {
     const auto &STI = MIRBuilder.getMF().getSubtarget<SPIRVSubtarget>();
diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
index 47fec74..a1cb630 100644
--- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
@@ -709,7 +709,10 @@ SPIRVType *SPIRVGlobalRegistry::createSPIRVType(
     // TODO: change the implementation once opaque pointers are supported
     // in the SPIR-V specification.
     SpvElementType = getOrCreateSPIRVIntegerType(8, MIRBuilder);
-    auto SC = addressSpaceToStorageClass(PType->getAddressSpace());
+    // Get access to information about available extensions
+    const SPIRVSubtarget *ST =
+        static_cast<const SPIRVSubtarget *>(&MIRBuilder.getMF().getSubtarget());
+    auto SC = addressSpaceToStorageClass(PType->getAddressSpace(), *ST);
     // Null pointer means we have a loop in type definitions, make and
     // return corresponding OpTypeForwardPointer.
     if (SpvElementType == nullptr) {
diff --git a/llvm/lib/Target/SPIRV/SPIRVISelLowering.h b/llvm/lib/Target/SPIRV/SPIRVISelLowering.h
index f317b26..d34f802 100644
--- a/llvm/lib/Target/SPIRV/SPIRVISelLowering.h
+++ b/llvm/lib/Target/SPIRV/SPIRVISelLowering.h
@@ -31,6 +31,9 @@ public:
     return true;
   }
 
+  // prevent creation of jump tables
+  bool areJTsAllowed(const Function *) const override { return false; }
+
   // This is to prevent sexts of non-i64 vector indices which are generated
   // within general IRTranslator hence type generation for it is omitted.
   MVT getVectorIdxTy(const DataLayout &DL) const override {
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td
index 0f11bc3..7c5252e 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td
+++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td
@@ -430,6 +430,10 @@ def OpGenericCastToPtrExplicit : Op<123, (outs ID:$r), (ins TYPE:$t, ID:$p, Stor
                               "$r = OpGenericCastToPtrExplicit $t $p $s">;
 def OpBitcast : UnOp<"OpBitcast", 124>;
 
+// SPV_INTEL_usm_storage_classes
+def OpPtrCastToCrossWorkgroupINTEL : UnOp<"OpPtrCastToCrossWorkgroupINTEL", 5934>;
+def OpCrossWorkgroupCastToPtrINTEL : UnOp<"OpCrossWorkgroupCastToPtrINTEL", 5938>;
+
 // 3.42.12 Composite Instructions
 
 def OpVectorExtractDynamic: Op<77, (outs ID:$res), (ins TYPE:$type, vID:$vec, ID:$idx),
@@ -765,6 +769,11 @@ def OpGroupNonUniformLogicalAnd: OpGroupNUGroup<"LogicalAnd", 362>;
 def OpGroupNonUniformLogicalOr: OpGroupNUGroup<"LogicalOr", 363>;
 def OpGroupNonUniformLogicalXor: OpGroupNUGroup<"LogicalXor", 364>;
 
+// SPV_KHR_subgroup_rotate
+def OpGroupNonUniformRotateKHR: Op<4431, (outs ID:$res),
+                  (ins TYPE:$type, ID:$scope, ID:$value, ID:$delta, variable_ops),
+                  "$res = OpGroupNonUniformRotateKHR $type $scope $value $delta">;
+
 // 3.49.7, Constant-Creation Instructions
 
 //  - SPV_INTEL_function_pointers
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index 53d19a1..7258d3b 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -828,8 +828,18 @@ static bool isGenericCastablePtr(SPIRV::StorageClass::StorageClass SC) {
   }
 }
 
+static bool isUSMStorageClass(SPIRV::StorageClass::StorageClass SC) {
+  switch (SC) {
+  case SPIRV::StorageClass::DeviceOnlyINTEL:
+  case SPIRV::StorageClass::HostOnlyINTEL:
+    return true;
+  default:
+    return false;
+  }
+}
+
 // In SPIR-V address space casting can only happen to and from the Generic
-// storage class. We can also only case Workgroup, CrossWorkgroup, or Function
+// storage class. We can also only cast Workgroup, CrossWorkgroup, or Function
 // pointers to and from Generic pointers. As such, we can convert e.g. from
 // Workgroup to Function by going via a Generic pointer as an intermediary. All
 // other combinations can only be done by a bitcast, and are probably not safe.
@@ -862,13 +872,17 @@ bool SPIRVInstructionSelector::selectAddrSpaceCast(Register ResVReg,
   SPIRV::StorageClass::StorageClass SrcSC = GR.getPointerStorageClass(SrcPtr);
   SPIRV::StorageClass::StorageClass DstSC = GR.getPointerStorageClass(ResVReg);
 
-  // Casting from an eligable pointer to Generic.
+  // don't generate a cast between identical storage classes
+  if (SrcSC == DstSC)
+    return true;
+
+  // Casting from an eligible pointer to Generic.
   if (DstSC == SPIRV::StorageClass::Generic && isGenericCastablePtr(SrcSC))
     return selectUnOp(ResVReg, ResType, I, SPIRV::OpPtrCastToGeneric);
-  // Casting from Generic to an eligable pointer.
+  // Casting from Generic to an eligible pointer.
   if (SrcSC == SPIRV::StorageClass::Generic && isGenericCastablePtr(DstSC))
     return selectUnOp(ResVReg, ResType, I, SPIRV::OpGenericCastToPtr);
-  // Casting between 2 eligable pointers using Generic as an intermediary.
+  // Casting between 2 eligible pointers using Generic as an intermediary.
   if (isGenericCastablePtr(SrcSC) && isGenericCastablePtr(DstSC)) {
     Register Tmp = MRI->createVirtualRegister(&SPIRV::IDRegClass);
     SPIRVType *GenericPtrTy = GR.getOrCreateSPIRVPointerType(
@@ -886,6 +900,16 @@ bool SPIRVInstructionSelector::selectAddrSpaceCast(Register ResVReg,
                           .addUse(Tmp)
                           .constrainAllUses(TII, TRI, RBI);
   }
+
+  // Check if instructions from the SPV_INTEL_usm_storage_classes extension may
+  // be applied
+  if (isUSMStorageClass(SrcSC) && DstSC == SPIRV::StorageClass::CrossWorkgroup)
+    return selectUnOp(ResVReg, ResType, I,
+                      SPIRV::OpPtrCastToCrossWorkgroupINTEL);
+  if (SrcSC == SPIRV::StorageClass::CrossWorkgroup && isUSMStorageClass(DstSC))
+    return selectUnOp(ResVReg, ResType, I,
+                      SPIRV::OpCrossWorkgroupCastToPtrINTEL);
+
   // TODO Should this case just be disallowed completely?
   // We're casting 2 other arbitrary address spaces, so have to bitcast.
   return selectUnOp(ResVReg, ResType, I, SPIRV::OpBitcast);
@@ -1545,7 +1569,7 @@ bool SPIRVInstructionSelector::selectGlobalValue(
   }
   SPIRVType *ResType = GR.getOrCreateSPIRVPointerType(
       PointerBaseType, I, TII,
-      addressSpaceToStorageClass(GV->getAddressSpace()));
+      addressSpaceToStorageClass(GV->getAddressSpace(), STI));
 
   std::string GlobalIdent;
   if (!GV->hasName()) {
@@ -1618,7 +1642,7 @@ bool SPIRVInstructionSelector::selectGlobalValue(
 
   unsigned AddrSpace = GV->getAddressSpace();
   SPIRV::StorageClass::StorageClass Storage =
-      addressSpaceToStorageClass(AddrSpace);
+      addressSpaceToStorageClass(AddrSpace, STI);
   bool HasLnkTy = GV->getLinkage() != GlobalValue::InternalLinkage &&
                   Storage != SPIRV::StorageClass::Function;
   SPIRV::LinkageType::LinkageType LnkType =
diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp
index 011a550..4f2e7a2 100644
--- a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp
@@ -102,14 +102,16 @@ SPIRVLegalizerInfo::SPIRVLegalizerInfo(const SPIRVSubtarget &ST) {
   const LLT p2 = LLT::pointer(2, PSize); // UniformConstant
   const LLT p3 = LLT::pointer(3, PSize); // Workgroup
   const LLT p4 = LLT::pointer(4, PSize); // Generic
-  const LLT p5 = LLT::pointer(5, PSize); // Input
+  const LLT p5 =
+      LLT::pointer(5, PSize); // Input, SPV_INTEL_usm_storage_classes (Device)
+  const LLT p6 = LLT::pointer(6, PSize); // SPV_INTEL_usm_storage_classes (Host)
 
   // TODO: remove copy-pasting here by using concatenation in some way.
   auto allPtrsScalarsAndVectors = {
-      p0,    p1,    p2,    p3,    p4,    p5,    s1,     s8,     s16,
-      s32,   s64,   v2s1,  v2s8,  v2s16, v2s32, v2s64,  v3s1,   v3s8,
-      v3s16, v3s32, v3s64, v4s1,  v4s8,  v4s16, v4s32,  v4s64,  v8s1,
-      v8s8,  v8s16, v8s32, v8s64, v16s1, v16s8, v16s16, v16s32, v16s64};
+      p0,    p1,    p2,    p3,    p4,     p5,     p6,    s1,   s8,   s16,
+      s32,   s64,   v2s1,  v2s8,  v2s16,  v2s32,  v2s64, v3s1, v3s8, v3s16,
+      v3s32, v3s64, v4s1,  v4s8,  v4s16,  v4s32,  v4s64, v8s1, v8s8, v8s16,
+      v8s32, v8s64, v16s1, v16s8, v16s16, v16s32, v16s64};
 
   auto allScalarsAndVectors = {
       s1,   s8,   s16,   s32,   s64,   v2s1,  v2s8,  v2s16,  v2s32,  v2s64,
@@ -133,8 +135,8 @@ SPIRVLegalizerInfo::SPIRVLegalizerInfo(const SPIRVSubtarget &ST) {
 
   auto allFloatAndIntScalars = allIntScalars;
 
-  auto allPtrs = {p0, p1, p2, p3, p4, p5};
-  auto allWritablePtrs = {p0, p1, p3, p4};
+  auto allPtrs = {p0, p1, p2, p3, p4, p5, p6};
+  auto allWritablePtrs = {p0, p1, p3, p4, p5, p6};
 
   for (auto Opc : TypeFoldingSupportingOpcs)
     getActionDefinitionsBuilder(Opc).custom();
diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
index dbda287..3be28c9 100644
--- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
@@ -1063,12 +1063,28 @@ void addInstrRequirements(const MachineInstr &MI,
       Reqs.addCapability(SPIRV::Capability::ExpectAssumeKHR);
     }
     break;
+  case SPIRV::OpPtrCastToCrossWorkgroupINTEL:
+  case SPIRV::OpCrossWorkgroupCastToPtrINTEL:
+    if (ST.canUseExtension(SPIRV::Extension::SPV_INTEL_usm_storage_classes)) {
+      Reqs.addExtension(SPIRV::Extension::SPV_INTEL_usm_storage_classes);
+      Reqs.addCapability(SPIRV::Capability::USMStorageClassesINTEL);
+    }
+    break;
   case SPIRV::OpConstantFunctionPointerINTEL:
     if (ST.canUseExtension(SPIRV::Extension::SPV_INTEL_function_pointers)) {
       Reqs.addExtension(SPIRV::Extension::SPV_INTEL_function_pointers);
       Reqs.addCapability(SPIRV::Capability::FunctionPointersINTEL);
     }
     break;
+  case SPIRV::OpGroupNonUniformRotateKHR:
+    if (!ST.canUseExtension(SPIRV::Extension::SPV_KHR_subgroup_rotate))
+      report_fatal_error("OpGroupNonUniformRotateKHR instruction requires the "
+                         "following SPIR-V extension: SPV_KHR_subgroup_rotate",
+                         false);
+    Reqs.addExtension(SPIRV::Extension::SPV_KHR_subgroup_rotate);
+    Reqs.addCapability(SPIRV::Capability::GroupNonUniformRotateKHR);
+    Reqs.addCapability(SPIRV::Capability::GroupNonUniform);
+    break;
   case SPIRV::OpGroupIMulKHR:
   case SPIRV::OpGroupFMulKHR:
   case SPIRV::OpGroupBitwiseAndKHR:
diff --git a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
index cbc16fa..1442168 100644
--- a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
@@ -122,6 +122,9 @@ static void foldConstantsIntoIntrinsics(MachineFunction &MF) {
 
 static void insertBitcasts(MachineFunction &MF, SPIRVGlobalRegistry *GR,
                            MachineIRBuilder MIB) {
+  // Get access to information about available extensions
+  const SPIRVSubtarget *ST =
+      static_cast<const SPIRVSubtarget *>(&MIB.getMF().getSubtarget());
   SmallVector<MachineInstr *, 10> ToErase;
   for (MachineBasicBlock &MBB : MF) {
     for (MachineInstr &MI : MBB) {
@@ -141,7 +144,7 @@ static void insertBitcasts(MachineFunction &MF, SPIRVGlobalRegistry *GR,
           getMDOperandAsType(MI.getOperand(3).getMetadata(), 0), MIB);
       SPIRVType *AssignedPtrType = GR->getOrCreateSPIRVPointerType(
           BaseTy, MI, *MF.getSubtarget<SPIRVSubtarget>().getInstrInfo(),
-          addressSpaceToStorageClass(MI.getOperand(4).getImm()));
+          addressSpaceToStorageClass(MI.getOperand(4).getImm(), *ST));
 
       // If the bitcast would be redundant, replace all uses with the source
       // register.
@@ -250,6 +253,10 @@ Register insertAssignInstr(Register Reg, Type *Ty, SPIRVType *SpirvTy,
 
 static void generateAssignInstrs(MachineFunction &MF, SPIRVGlobalRegistry *GR,
                                  MachineIRBuilder MIB) {
+  // Get access to information about available extensions
+  const SPIRVSubtarget *ST =
+      static_cast<const SPIRVSubtarget *>(&MIB.getMF().getSubtarget());
+
   MachineRegisterInfo &MRI = MF.getRegInfo();
   SmallVector<MachineInstr *, 10> ToErase;
 
@@ -269,7 +276,7 @@ static void generateAssignInstrs(MachineFunction &MF, SPIRVGlobalRegistry *GR,
             getMDOperandAsType(MI.getOperand(2).getMetadata(), 0), MIB);
         SPIRVType *AssignedPtrType = GR->getOrCreateSPIRVPointerType(
             BaseTy, MI, *MF.getSubtarget<SPIRVSubtarget>().getInstrInfo(),
-            addressSpaceToStorageClass(MI.getOperand(3).getImm()));
+            addressSpaceToStorageClass(MI.getOperand(3).getImm(), *ST));
         MachineInstr *Def = MRI.getVRegDef(Reg);
         assert(Def && "Expecting an instruction that defines the register");
         insertAssignInstr(Reg, nullptr, AssignedPtrType, GR, MIB,
diff --git a/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp b/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp
index e186154..79f1614 100644
--- a/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp
@@ -49,6 +49,12 @@ cl::list<SPIRV::Extension::Extension> Extensions(
         clEnumValN(SPIRV::Extension::SPV_INTEL_optnone, "SPV_INTEL_optnone",
                    "Adds OptNoneINTEL value for Function Control mask that "
                    "indicates a request to not optimize the function."),
+        clEnumValN(SPIRV::Extension::SPV_INTEL_usm_storage_classes,
+                   "SPV_INTEL_usm_storage_classes",
+                   "Introduces two new storage classes that are sub classes of "
+                   "the CrossWorkgroup storage class "
+                   "that provides additional information that can enable "
+                   "optimization."),
         clEnumValN(SPIRV::Extension::SPV_INTEL_subgroups, "SPV_INTEL_subgroups",
                    "Allows work items in a subgroup to share data without the "
                    "use of local memory and work group barriers, and to "
@@ -75,6 +81,10 @@ cl::list<SPIRV::Extension::Extension> Extensions(
             "Allows to use the LinkOnceODR linkage type that is to let "
             "a function or global variable to be merged with other functions "
             "or global variables of the same name when linkage occurs."),
+        clEnumValN(SPIRV::Extension::SPV_KHR_subgroup_rotate,
+                   "SPV_KHR_subgroup_rotate",
+                   "Adds a new instruction that enables rotating values across "
+                   "invocations within a subgroup."),
         clEnumValN(SPIRV::Extension::SPV_INTEL_function_pointers,
                    "SPV_INTEL_function_pointers",
                    "Allows translation of function pointers.")));
diff --git a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td
index 4e5ac0d..b022b97 100644
--- a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td
+++ b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td
@@ -455,6 +455,7 @@ defm BitInstructions : CapabilityOperand<6025, 0, 0, [SPV_KHR_bit_instructions],
 defm ExpectAssumeKHR : CapabilityOperand<5629, 0, 0, [SPV_KHR_expect_assume], []>;
 defm FunctionPointersINTEL : CapabilityOperand<5603, 0, 0, [SPV_INTEL_function_pointers], []>;
 defm IndirectReferencesINTEL : CapabilityOperand<5604, 0, 0, [SPV_INTEL_function_pointers], []>;
+defm GroupNonUniformRotateKHR : CapabilityOperand<6026, 0, 0, [SPV_KHR_subgroup_rotate], [GroupNonUniform]>;
 defm AtomicFloat32AddEXT : CapabilityOperand<6033, 0, 0, [SPV_EXT_shader_atomic_float_add], []>;
 defm AtomicFloat64AddEXT : CapabilityOperand<6034, 0, 0, [SPV_EXT_shader_atomic_float_add], []>;
 defm AtomicFloat16AddEXT : CapabilityOperand<6095, 0, 0, [SPV_EXT_shader_atomic_float16_add], []>;
@@ -462,6 +463,7 @@ defm AtomicFloat16MinMaxEXT : CapabilityOperand<5616, 0, 0, [SPV_EXT_shader_atom
 defm AtomicFloat32MinMaxEXT : CapabilityOperand<5612, 0, 0, [SPV_EXT_shader_atomic_float_min_max], []>;
 defm AtomicFloat64MinMaxEXT : CapabilityOperand<5613, 0, 0, [SPV_EXT_shader_atomic_float_min_max], []>;
 defm GroupUniformArithmeticKHR : CapabilityOperand<6400, 0, 0, [SPV_KHR_uniform_group_instructions], []>;
+defm USMStorageClassesINTEL : CapabilityOperand<5935, 0, 0, [SPV_INTEL_usm_storage_classes], [Kernel]>;
 
 //===----------------------------------------------------------------------===//
 // Multiclass used to define SourceLanguage enum values and at the same time
@@ -699,6 +701,8 @@ defm IncomingRayPayloadNV : StorageClassOperand<5342, [RayTracingNV]>;
 defm ShaderRecordBufferNV : StorageClassOperand<5343, [RayTracingNV]>;
 defm PhysicalStorageBufferEXT : StorageClassOperand<5349, [PhysicalStorageBufferAddressesEXT]>;
 defm CodeSectionINTEL : StorageClassOperand<5605, [FunctionPointersINTEL]>;
+defm DeviceOnlyINTEL : StorageClassOperand<5936, [USMStorageClassesINTEL]>;
+defm HostOnlyINTEL : StorageClassOperand<5937, [USMStorageClassesINTEL]>;
 
 //===----------------------------------------------------------------------===//
 // Multiclass used to define Dim enum values and at the same time
diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp
index 05f766d..169d7cc 100644
--- a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp
@@ -14,6 +14,7 @@
 #include "MCTargetDesc/SPIRVBaseInfo.h"
 #include "SPIRV.h"
 #include "SPIRVInstrInfo.h"
+#include "SPIRVSubtarget.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
@@ -146,15 +147,19 @@ unsigned storageClassToAddressSpace(SPIRV::StorageClass::StorageClass SC) {
     return 3;
   case SPIRV::StorageClass::Generic:
     return 4;
+  case SPIRV::StorageClass::DeviceOnlyINTEL:
+    return 5;
+  case SPIRV::StorageClass::HostOnlyINTEL:
+    return 6;
   case SPIRV::StorageClass::Input:
     return 7;
   default:
-    llvm_unreachable("Unable to get address space id");
+    report_fatal_error("Unable to get address space id");
   }
 }
 
 SPIRV::StorageClass::StorageClass
-addressSpaceToStorageClass(unsigned AddrSpace) {
+addressSpaceToStorageClass(unsigned AddrSpace, const SPIRVSubtarget &STI) {
   switch (AddrSpace) {
   case 0:
     return SPIRV::StorageClass::Function;
@@ -166,10 +171,18 @@ addressSpaceToStorageClass(unsigned AddrSpace) {
     return SPIRV::StorageClass::Workgroup;
   case 4:
     return SPIRV::StorageClass::Generic;
+  case 5:
+    return STI.canUseExtension(SPIRV::Extension::SPV_INTEL_usm_storage_classes)
+               ? SPIRV::StorageClass::DeviceOnlyINTEL
+               : SPIRV::StorageClass::CrossWorkgroup;
+  case 6:
+    return STI.canUseExtension(SPIRV::Extension::SPV_INTEL_usm_storage_classes)
+               ? SPIRV::StorageClass::HostOnlyINTEL
+               : SPIRV::StorageClass::CrossWorkgroup;
   case 7:
     return SPIRV::StorageClass::Input;
   default:
-    llvm_unreachable("Unknown address space");
+    report_fatal_error("Unknown address space");
   }
 }
 
diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.h b/llvm/lib/Target/SPIRV/SPIRVUtils.h
index a33dc02..1af53dc 100644
--- a/llvm/lib/Target/SPIRV/SPIRVUtils.h
+++ b/llvm/lib/Target/SPIRV/SPIRVUtils.h
@@ -27,6 +27,7 @@ class MachineRegisterInfo;
 class Register;
 class StringRef;
 class SPIRVInstrInfo;
+class SPIRVSubtarget;
 
 // Add the given string as a series of integer operand, inserting null
 // terminators and padding to make sure the operands all have 32-bit
@@ -62,7 +63,7 @@ unsigned storageClassToAddressSpace(SPIRV::StorageClass::StorageClass SC);
 
 // Convert an LLVM IR address space to a SPIR-V storage class.
 SPIRV::StorageClass::StorageClass
-addressSpaceToStorageClass(unsigned AddrSpace);
+addressSpaceToStorageClass(unsigned AddrSpace, const SPIRVSubtarget &STI);
 
 SPIRV::MemorySemantics::MemorySemantics
 getMemSemanticsForStorageClass(SPIRV::StorageClass::StorageClass SC);
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 7c47790..36f0679 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -43,6 +43,8 @@ using namespace llvm;
 
 #define DEBUG_TYPE "wasm-lower"
 
+extern cl::opt<bool> WasmEmitMultiValue;
+
 WebAssemblyTargetLowering::WebAssemblyTargetLowering(
     const TargetMachine &TM, const WebAssemblySubtarget &STI)
     : TargetLowering(TM), Subtarget(&STI) {
@@ -1288,7 +1290,7 @@ bool WebAssemblyTargetLowering::CanLowerReturn(
     const SmallVectorImpl<ISD::OutputArg> &Outs,
     LLVMContext & /*Context*/) const {
   // WebAssembly can only handle returning tuples with multivalue enabled
-  return Subtarget->hasMultivalue() || Outs.size() <= 1;
+  return (Subtarget->hasMultivalue() && WasmEmitMultiValue) || Outs.size() <= 1;
 }
 
 SDValue WebAssemblyTargetLowering::LowerReturn(
@@ -1296,7 +1298,8 @@ SDValue WebAssemblyTargetLowering::LowerReturn(
     const SmallVectorImpl<ISD::OutputArg> &Outs,
     const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
     SelectionDAG &DAG) const {
-  assert((Subtarget->hasMultivalue() || Outs.size() <= 1) &&
+  assert(((Subtarget->hasMultivalue() && WasmEmitMultiValue) ||
+          Outs.size() <= 1) &&
          "MVP WebAssembly can only return up to one value");
   if (!callingConvSupported(CallConv))
     fail(DL, DAG, "WebAssembly doesn't support non-C calling conventions");
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp
index 1e95911..b969b83 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp
@@ -22,6 +22,8 @@
 #include "llvm/Target/TargetMachine.h"
 using namespace llvm;
 
+extern cl::opt<bool> WasmEmitMultiValue;
+
 WebAssemblyFunctionInfo::~WebAssemblyFunctionInfo() = default; // anchor.
 
 MachineFunctionInfo *WebAssemblyFunctionInfo::clone(
@@ -71,7 +73,8 @@ void llvm::computeSignatureVTs(const FunctionType *Ty,
 
   MVT PtrVT = MVT::getIntegerVT(TM.createDataLayout().getPointerSizeInBits());
   if (Results.size() > 1 &&
-      !TM.getSubtarget<WebAssemblySubtarget>(ContextFunc).hasMultivalue()) {
+      (!TM.getSubtarget<WebAssemblySubtarget>(ContextFunc).hasMultivalue() ||
+       !WasmEmitMultiValue)) {
     // WebAssembly can't lower returns of multiple values without demoting to
     // sret unless multivalue is enabled (see
     // WebAssemblyTargetLowering::CanLowerReturn). So replace multiple return
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
index 3e2e029..2a84c90 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
@@ -24,6 +24,8 @@
 
 using namespace llvm;
 
+extern cl::opt<bool> WasmEmitMultiValue;
+
 namespace {
 
 enum RuntimeLibcallSignature {
@@ -694,7 +696,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
     Params.push_back(PtrTy);
     break;
   case i64_i64_func_f32:
-    if (Subtarget.hasMultivalue()) {
+    if (Subtarget.hasMultivalue() && WasmEmitMultiValue) {
       Rets.push_back(wasm::ValType::I64);
       Rets.push_back(wasm::ValType::I64);
     } else {
@@ -703,7 +705,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
     Params.push_back(wasm::ValType::F32);
     break;
   case i64_i64_func_f64:
-    if (Subtarget.hasMultivalue()) {
+    if (Subtarget.hasMultivalue() && WasmEmitMultiValue) {
       Rets.push_back(wasm::ValType::I64);
       Rets.push_back(wasm::ValType::I64);
     } else {
@@ -712,7 +714,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
     Params.push_back(wasm::ValType::F64);
     break;
   case i16_i16_func_i16_i16:
-    if (Subtarget.hasMultivalue()) {
+    if (Subtarget.hasMultivalue() && WasmEmitMultiValue) {
       Rets.push_back(wasm::ValType::I32);
       Rets.push_back(wasm::ValType::I32);
     } else {
@@ -722,7 +724,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
     Params.push_back(wasm::ValType::I32);
     break;
   case i32_i32_func_i32_i32:
-    if (Subtarget.hasMultivalue()) {
+    if (Subtarget.hasMultivalue() && WasmEmitMultiValue) {
       Rets.push_back(wasm::ValType::I32);
       Rets.push_back(wasm::ValType::I32);
     } else {
@@ -732,7 +734,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
     Params.push_back(wasm::ValType::I32);
     break;
   case i64_i64_func_i64_i64:
-    if (Subtarget.hasMultivalue()) {
+    if (Subtarget.hasMultivalue() && WasmEmitMultiValue) {
       Rets.push_back(wasm::ValType::I64);
       Rets.push_back(wasm::ValType::I64);
     } else {
@@ -742,7 +744,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
     Params.push_back(wasm::ValType::I64);
     break;
   case i64_i64_func_i64_i64_i64_i64:
-    if (Subtarget.hasMultivalue()) {
+    if (Subtarget.hasMultivalue() && WasmEmitMultiValue) {
       Rets.push_back(wasm::ValType::I64);
       Rets.push_back(wasm::ValType::I64);
     } else {
@@ -754,7 +756,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
     Params.push_back(wasm::ValType::I64);
     break;
   case i64_i64_func_i64_i64_i64_i64_iPTR:
-    if (Subtarget.hasMultivalue()) {
+    if (Subtarget.hasMultivalue() && WasmEmitMultiValue) {
       Rets.push_back(wasm::ValType::I64);
       Rets.push_back(wasm::ValType::I64);
     } else {
@@ -767,7 +769,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
     Params.push_back(PtrTy);
     break;
   case i64_i64_i64_i64_func_i64_i64_i64_i64:
-    if (Subtarget.hasMultivalue()) {
+    if (Subtarget.hasMultivalue() && WasmEmitMultiValue) {
       Rets.push_back(wasm::ValType::I64);
       Rets.push_back(wasm::ValType::I64);
       Rets.push_back(wasm::ValType::I64);
@@ -781,7 +783,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
     Params.push_back(wasm::ValType::I64);
     break;
   case i64_i64_func_i64_i64_i32:
-    if (Subtarget.hasMultivalue()) {
+    if (Subtarget.hasMultivalue() && WasmEmitMultiValue) {
       Rets.push_back(wasm::ValType::I64);
       Rets.push_back(wasm::ValType::I64);
     } else {
@@ -851,7 +853,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
     Params.push_back(wasm::ValType::I64);
     break;
   case i64_i64_func_i64_i64_i64_i64_i64_i64:
-    if (Subtarget.hasMultivalue()) {
+    if (Subtarget.hasMultivalue() && WasmEmitMultiValue) {
       Rets.push_back(wasm::ValType::I64);
       Rets.push_back(wasm::ValType::I64);
     } else {
@@ -865,7 +867,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
     Params.push_back(wasm::ValType::I64);
     break;
   case i64_i64_func_i32:
-    if (Subtarget.hasMultivalue()) {
+    if (Subtarget.hasMultivalue() && WasmEmitMultiValue) {
       Rets.push_back(wasm::ValType::I64);
       Rets.push_back(wasm::ValType::I64);
     } else {
@@ -874,7 +876,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
     Params.push_back(wasm::ValType::I32);
     break;
   case i64_i64_func_i64:
-    if (Subtarget.hasMultivalue()) {
+    if (Subtarget.hasMultivalue() && WasmEmitMultiValue) {
       Rets.push_back(wasm::ValType::I64);
       Rets.push_back(wasm::ValType::I64);
     } else {
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
index 42043a7..3120b6b 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
@@ -54,6 +54,15 @@ static cl::opt<bool> WasmDisableFixIrreducibleControlFlowPass(
              " irreducible control flow optimization pass"),
     cl::init(false));
 
+// A temporary option to control emission of multivalue until multivalue
+// implementation is stable enough. We currently don't emit multivalue by
+// default even if the feature section allows it.
+// TODO Stabilize multivalue and delete this option
+cl::opt<bool>
+    WasmEmitMultiValue("wasm-emit-multivalue", cl::Hidden,
+                       cl::desc("WebAssembly: Emit multivalue in the backend"),
+                       cl::init(false));
+
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeWebAssemblyTarget() {
   // Register the target.
   RegisterTargetMachine<WebAssemblyTargetMachine> X(
diff --git a/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp b/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp
index 4a11dd2..a620ba9 100644
--- a/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp
+++ b/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp
@@ -47,10 +47,9 @@ Error X86CodeGenPassBuilder::addInstSelector(AddMachinePass &) const {
 } // namespace
 
 Error X86TargetMachine::buildCodeGenPipeline(
-    ModulePassManager &MPM, MachineFunctionPassManager &MFPM,
-    MachineFunctionAnalysisManager &, raw_pwrite_stream &Out,
-    raw_pwrite_stream *DwoOut, CodeGenFileType FileType,
-    CGPassBuilderOption Opt, PassInstrumentationCallbacks *PIC) {
+    ModulePassManager &MPM, raw_pwrite_stream &Out, raw_pwrite_stream *DwoOut,
+    CodeGenFileType FileType, CGPassBuilderOption Opt,
+    PassInstrumentationCallbacks *PIC) {
   auto CGPB = X86CodeGenPassBuilder(*this, Opt, PIC);
-  return CGPB.buildPipeline(MPM, MFPM, Out, DwoOut, FileType);
+  return CGPB.buildPipeline(MPM, Out, DwoOut, FileType);
 }
diff --git a/llvm/lib/Target/X86/X86TargetMachine.h b/llvm/lib/Target/X86/X86TargetMachine.h
index f31c971..0fd3e47 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.h
+++ b/llvm/lib/Target/X86/X86TargetMachine.h
@@ -58,10 +58,9 @@ public:
   createMachineFunctionInfo(BumpPtrAllocator &Allocator, const Function &F,
                             const TargetSubtargetInfo *STI) const override;
 
-  Error buildCodeGenPipeline(ModulePassManager &, MachineFunctionPassManager &,
-                             MachineFunctionAnalysisManager &,
-                             raw_pwrite_stream &, raw_pwrite_stream *,
-                             CodeGenFileType, CGPassBuilderOption,
+  Error buildCodeGenPipeline(ModulePassManager &, raw_pwrite_stream &,
+                             raw_pwrite_stream *, CodeGenFileType,
+                             CGPassBuilderOption,
                              PassInstrumentationCallbacks *) override;
 
   bool isJIT() const { return IsJIT; }
diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp
index 4466d50..a4cc757 100644
--- a/llvm/lib/TargetParser/Host.cpp
+++ b/llvm/lib/TargetParser/Host.cpp
@@ -1846,6 +1846,13 @@ bool sys::getHostCPUFeatures(StringMap<bool> &Features) {
   Features["prefetchi"]  = HasLeaf7Subleaf1 && ((EDX >> 14) & 1);
   Features["usermsr"]  = HasLeaf7Subleaf1 && ((EDX >> 15) & 1);
   Features["avx10.1-256"] = HasLeaf7Subleaf1 && ((EDX >> 19) & 1);
+  bool HasAPXF = HasLeaf7Subleaf1 && ((EDX >> 21) & 1);
+  Features["egpr"] = HasAPXF;
+  Features["push2pop2"] = HasAPXF;
+  Features["ppx"] = HasAPXF;
+  Features["ndd"] = HasAPXF;
+  Features["ccmp"] = HasAPXF;
+  Features["cf"] = HasAPXF;
 
   bool HasLeafD = MaxLevel >= 0xd &&
                   !getX86CpuIDAndInfoEx(0xd, 0x1, &EAX, &EBX, &ECX, &EDX);
diff --git a/llvm/lib/Transforms/IPO/HotColdSplitting.cpp b/llvm/lib/Transforms/IPO/HotColdSplitting.cpp
index fabb3c5f..5aefcbf 100644
--- a/llvm/lib/Transforms/IPO/HotColdSplitting.cpp
+++ b/llvm/lib/Transforms/IPO/HotColdSplitting.cpp
@@ -215,15 +215,10 @@ bool HotColdSplitting::isFunctionCold(const Function &F) const {
   return false;
 }
 
-bool HotColdSplitting::isBasicBlockCold(BasicBlock *BB,
-                          BranchProbability ColdProbThresh,
-                          SmallPtrSetImpl<BasicBlock *> &ColdBlocks,
-                          SmallPtrSetImpl<BasicBlock *> &AnnotatedColdBlocks,
-                          BlockFrequencyInfo *BFI) const {
-  // This block is already part of some outlining region.
-  if (ColdBlocks.count(BB))
-    return true;
-
+bool HotColdSplitting::isBasicBlockCold(
+    BasicBlock *BB, BranchProbability ColdProbThresh,
+    SmallPtrSetImpl<BasicBlock *> &AnnotatedColdBlocks,
+    BlockFrequencyInfo *BFI) const {
   if (BFI) {
     if (PSI->isColdBlock(BB, BFI))
       return true;
@@ -372,18 +367,12 @@ static int getOutliningPenalty(ArrayRef<BasicBlock *> Region,
   return Penalty;
 }
 
-Function *HotColdSplitting::extractColdRegion(
-    const BlockSequence &Region, const CodeExtractorAnalysisCache &CEAC,
-    DominatorTree &DT, BlockFrequencyInfo *BFI, TargetTransformInfo &TTI,
-    OptimizationRemarkEmitter &ORE, AssumptionCache *AC, unsigned Count) {
+// Determine if it is beneficial to split the \p Region.
+bool HotColdSplitting::isSplittingBeneficial(CodeExtractor &CE,
+                                             const BlockSequence &Region,
+                                             TargetTransformInfo &TTI) {
   assert(!Region.empty());
 
-  // TODO: Pass BFI and BPI to update profile information.
-  CodeExtractor CE(Region, &DT, /* AggregateArgs */ false, /* BFI */ nullptr,
-                   /* BPI */ nullptr, AC, /* AllowVarArgs */ false,
-                   /* AllowAlloca */ false, /* AllocaBlock */ nullptr,
-                   /* Suffix */ "cold." + std::to_string(Count));
-
   // Perform a simple cost/benefit analysis to decide whether or not to permit
   // splitting.
   SetVector<Value *> Inputs, Outputs, Sinks;
@@ -394,9 +383,18 @@ Function *HotColdSplitting::extractColdRegion(
   LLVM_DEBUG(dbgs() << "Split profitability: benefit = " << OutliningBenefit
                     << ", penalty = " << OutliningPenalty << "\n");
   if (!OutliningBenefit.isValid() || OutliningBenefit <= OutliningPenalty)
-    return nullptr;
+    return false;
 
-  Function *OrigF = Region[0]->getParent();
+  return true;
+}
+
+// Split the single \p EntryPoint cold region. \p CE is the region code
+// extractor.
+Function *HotColdSplitting::extractColdRegion(
+    BasicBlock &EntryPoint, CodeExtractor &CE,
+    const CodeExtractorAnalysisCache &CEAC, BlockFrequencyInfo *BFI,
+    TargetTransformInfo &TTI, OptimizationRemarkEmitter &ORE) {
+  Function *OrigF = EntryPoint.getParent();
   if (Function *OutF = CE.extractCodeRegion(CEAC)) {
     User *U = *OutF->user_begin();
     CallInst *CI = cast<CallInst>(U);
@@ -419,7 +417,7 @@ Function *HotColdSplitting::extractColdRegion(
     LLVM_DEBUG(llvm::dbgs() << "Outlined Region: " << *OutF);
     ORE.emit([&]() {
       return OptimizationRemark(DEBUG_TYPE, "HotColdSplit",
-                                &*Region[0]->begin())
+                                &*EntryPoint.begin())
              << ore::NV("Original", OrigF) << " split cold code into "
              << ore::NV("Split", OutF);
     });
@@ -428,9 +426,9 @@ Function *HotColdSplitting::extractColdRegion(
 
   ORE.emit([&]() {
     return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
-                                    &*Region[0]->begin())
+                                    &*EntryPoint.begin())
            << "Failed to extract region at block "
-           << ore::NV("Block", Region.front());
+           << ore::NV("Block", &EntryPoint);
   });
   return nullptr;
 }
@@ -620,16 +618,18 @@ public:
 } // namespace
 
 bool HotColdSplitting::outlineColdRegions(Function &F, bool HasProfileSummary) {
-  bool Changed = false;
-
-  // The set of cold blocks.
+  // The set of cold blocks outlined.
   SmallPtrSet<BasicBlock *, 4> ColdBlocks;
 
+  // The set of cold blocks cannot be outlined.
+  SmallPtrSet<BasicBlock *, 4> CannotBeOutlinedColdBlocks;
+
   // Set of cold blocks obtained with RPOT.
   SmallPtrSet<BasicBlock *, 4> AnnotatedColdBlocks;
 
-  // The worklist of non-intersecting regions left to outline.
-  SmallVector<OutliningRegion, 2> OutliningWorklist;
+  // The worklist of non-intersecting regions left to outline. The first member
+  // of the pair is the entry point into the region to be outlined.
+  SmallVector<std::pair<BasicBlock *, CodeExtractor>, 2> OutliningWorklist;
 
   // Set up an RPO traversal. Experimentally, this performs better (outlines
   // more) than a PO traversal, because we prevent region overlap by keeping
@@ -655,10 +655,18 @@ bool HotColdSplitting::outlineColdRegions(Function &F, bool HasProfileSummary) {
   if (ColdBranchProbDenom.getNumOccurrences())
     ColdProbThresh = BranchProbability(1, ColdBranchProbDenom.getValue());
 
+  unsigned OutlinedFunctionID = 1;
   // Find all cold regions.
   for (BasicBlock *BB : RPOT) {
-    if (!isBasicBlockCold(BB, ColdProbThresh, ColdBlocks, AnnotatedColdBlocks,
-                          BFI))
+    // This block is already part of some outlining region.
+    if (ColdBlocks.count(BB))
+      continue;
+
+    // This block is already part of some region cannot be outlined.
+    if (CannotBeOutlinedColdBlocks.count(BB))
+      continue;
+
+    if (!isBasicBlockCold(BB, ColdProbThresh, AnnotatedColdBlocks, BFI))
       continue;
 
     LLVM_DEBUG({
@@ -681,50 +689,69 @@ bool HotColdSplitting::outlineColdRegions(Function &F, bool HasProfileSummary) {
         return markFunctionCold(F);
       }
 
-      // If this outlining region intersects with another, drop the new region.
-      //
-      // TODO: It's theoretically possible to outline more by only keeping the
-      // largest region which contains a block, but the extra bookkeeping to do
-      // this is tricky/expensive.
-      bool RegionsOverlap = any_of(Region.blocks(), [&](const BlockTy &Block) {
-        return !ColdBlocks.insert(Block.first).second;
-      });
-      if (RegionsOverlap)
-        continue;
+      do {
+        BlockSequence SubRegion = Region.takeSingleEntrySubRegion(*DT);
+        LLVM_DEBUG({
+          dbgs() << "Hot/cold splitting attempting to outline these blocks:\n";
+          for (BasicBlock *BB : SubRegion)
+            BB->dump();
+        });
+
+        // TODO: Pass BFI and BPI to update profile information.
+        CodeExtractor CE(
+            SubRegion, &*DT, /* AggregateArgs */ false, /* BFI */ nullptr,
+            /* BPI */ nullptr, AC, /* AllowVarArgs */ false,
+            /* AllowAlloca */ false, /* AllocaBlock */ nullptr,
+            /* Suffix */ "cold." + std::to_string(OutlinedFunctionID));
+
+        if (CE.isEligible() && isSplittingBeneficial(CE, SubRegion, TTI) &&
+            // If this outlining region intersects with another, drop the new
+            // region.
+            //
+            // TODO: It's theoretically possible to outline more by only keeping
+            // the largest region which contains a block, but the extra
+            // bookkeeping to do this is tricky/expensive.
+            none_of(SubRegion, [&](BasicBlock *Block) {
+              return ColdBlocks.contains(Block);
+            })) {
+          ColdBlocks.insert(SubRegion.begin(), SubRegion.end());
+
+          LLVM_DEBUG({
+            for (auto *Block : SubRegion)
+              dbgs() << "  contains cold block:" << Block->getName() << "\n";
+          });
+
+          OutliningWorklist.emplace_back(
+              std::make_pair(SubRegion[0], std::move(CE)));
+          ++OutlinedFunctionID;
+        } else {
+          // The cold block region cannot be outlined.
+          for (auto *Block : SubRegion)
+            if ((DT->dominates(BB, Block) && PDT->dominates(Block, BB)) ||
+                (PDT->dominates(BB, Block) && DT->dominates(Block, BB)))
+              // Will skip this cold block in the loop to save the compile time
+              CannotBeOutlinedColdBlocks.insert(Block);
+        }
+      } while (!Region.empty());
 
-      OutliningWorklist.emplace_back(std::move(Region));
       ++NumColdRegionsFound;
     }
   }
 
   if (OutliningWorklist.empty())
-    return Changed;
+    return false;
 
   // Outline single-entry cold regions, splitting up larger regions as needed.
-  unsigned OutlinedFunctionID = 1;
   // Cache and recycle the CodeExtractor analysis to avoid O(n^2) compile-time.
   CodeExtractorAnalysisCache CEAC(F);
-  do {
-    OutliningRegion Region = OutliningWorklist.pop_back_val();
-    assert(!Region.empty() && "Empty outlining region in worklist");
-    do {
-      BlockSequence SubRegion = Region.takeSingleEntrySubRegion(*DT);
-      LLVM_DEBUG({
-        dbgs() << "Hot/cold splitting attempting to outline these blocks:\n";
-        for (BasicBlock *BB : SubRegion)
-          BB->dump();
-      });
-
-      Function *Outlined = extractColdRegion(SubRegion, CEAC, *DT, BFI, TTI,
-                                             ORE, AC, OutlinedFunctionID);
-      if (Outlined) {
-        ++OutlinedFunctionID;
-        Changed = true;
-      }
-    } while (!Region.empty());
-  } while (!OutliningWorklist.empty());
+  for (auto &BCE : OutliningWorklist) {
+    Function *Outlined =
+        extractColdRegion(*BCE.first, BCE.second, CEAC, BFI, TTI, ORE);
+    assert(Outlined && "Should be outlined");
+    (void)Outlined;
+  }
 
-  return Changed;
+  return true;
 }
 
 bool HotColdSplitting::run(Module &M) {
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
index 4176d56..77ca36d 100644
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -1471,7 +1471,6 @@ private:
         OMPRTL_omp_get_num_threads,
         OMPRTL_omp_in_parallel,
         OMPRTL_omp_get_cancellation,
-        OMPRTL_omp_get_thread_limit,
         OMPRTL_omp_get_supported_active_levels,
         OMPRTL_omp_get_level,
         OMPRTL_omp_get_ancestor_thread_num,
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index ed47de2..33ed1d5 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -1543,11 +1543,14 @@ static bool fitsInFPType(ConstantFP *CFP, const fltSemantics &Sem) {
   return !losesInfo;
 }
 
-static Type *shrinkFPConstant(ConstantFP *CFP) {
+static Type *shrinkFPConstant(ConstantFP *CFP, bool PreferBFloat) {
   if (CFP->getType() == Type::getPPC_FP128Ty(CFP->getContext()))
     return nullptr;  // No constant folding of this.
+  // See if the value can be truncated to bfloat and then reextended.
+  if (PreferBFloat && fitsInFPType(CFP, APFloat::BFloat()))
+    return Type::getBFloatTy(CFP->getContext());
   // See if the value can be truncated to half and then reextended.
-  if (fitsInFPType(CFP, APFloat::IEEEhalf()))
+  if (!PreferBFloat && fitsInFPType(CFP, APFloat::IEEEhalf()))
     return Type::getHalfTy(CFP->getContext());
   // See if the value can be truncated to float and then reextended.
   if (fitsInFPType(CFP, APFloat::IEEEsingle()))
@@ -1562,7 +1565,7 @@ static Type *shrinkFPConstant(ConstantFP *CFP) {
 
 // Determine if this is a vector of ConstantFPs and if so, return the minimal
 // type we can safely truncate all elements to.
-static Type *shrinkFPConstantVector(Value *V) {
+static Type *shrinkFPConstantVector(Value *V, bool PreferBFloat) {
   auto *CV = dyn_cast<Constant>(V);
   auto *CVVTy = dyn_cast<FixedVectorType>(V->getType());
   if (!CV || !CVVTy)
@@ -1582,7 +1585,7 @@ static Type *shrinkFPConstantVector(Value *V) {
     if (!CFP)
       return nullptr;
 
-    Type *T = shrinkFPConstant(CFP);
+    Type *T = shrinkFPConstant(CFP, PreferBFloat);
     if (!T)
       return nullptr;
 
@@ -1597,7 +1600,7 @@ static Type *shrinkFPConstantVector(Value *V) {
 }
 
 /// Find the minimum FP type we can safely truncate to.
-static Type *getMinimumFPType(Value *V) {
+static Type *getMinimumFPType(Value *V, bool PreferBFloat) {
   if (auto *FPExt = dyn_cast<FPExtInst>(V))
     return FPExt->getOperand(0)->getType();
 
@@ -1605,7 +1608,7 @@ static Type *getMinimumFPType(Value *V) {
   // that can accurately represent it.  This allows us to turn
   // (float)((double)X+2.0) into x+2.0f.
   if (auto *CFP = dyn_cast<ConstantFP>(V))
-    if (Type *T = shrinkFPConstant(CFP))
+    if (Type *T = shrinkFPConstant(CFP, PreferBFloat))
       return T;
 
   // We can only correctly find a minimum type for a scalable vector when it is
@@ -1617,7 +1620,7 @@ static Type *getMinimumFPType(Value *V) {
 
   // Try to shrink a vector of FP constants. This returns nullptr on scalable
   // vectors
-  if (Type *T = shrinkFPConstantVector(V))
+  if (Type *T = shrinkFPConstantVector(V, PreferBFloat))
     return T;
 
   return V->getType();
@@ -1686,8 +1689,10 @@ Instruction *InstCombinerImpl::visitFPTrunc(FPTruncInst &FPT) {
   Type *Ty = FPT.getType();
   auto *BO = dyn_cast<BinaryOperator>(FPT.getOperand(0));
   if (BO && BO->hasOneUse()) {
-    Type *LHSMinType = getMinimumFPType(BO->getOperand(0));
-    Type *RHSMinType = getMinimumFPType(BO->getOperand(1));
+    Type *LHSMinType =
+        getMinimumFPType(BO->getOperand(0), /*PreferBFloat=*/Ty->isBFloatTy());
+    Type *RHSMinType =
+        getMinimumFPType(BO->getOperand(1), /*PreferBFloat=*/Ty->isBFloatTy());
     unsigned OpWidth = BO->getType()->getFPMantissaWidth();
     unsigned LHSWidth = LHSMinType->getFPMantissaWidth();
     unsigned RHSWidth = RHSMinType->getFPMantissaWidth();
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 4af455c..87c8dca 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -2387,6 +2387,20 @@ Value *InstCombiner::getFreelyInvertedImpl(Value *V, bool WillInvertAllUses,
     return NonNull;
   }
 
+  if (match(V, m_SExtLike(m_Value(A)))) {
+    if (auto *AV = getFreelyInvertedImpl(A, A->hasOneUse(), Builder,
+                                         DoesConsume, Depth))
+      return Builder ? Builder->CreateSExt(AV, V->getType()) : NonNull;
+    return nullptr;
+  }
+
+  if (match(V, m_Trunc(m_Value(A)))) {
+    if (auto *AV = getFreelyInvertedImpl(A, A->hasOneUse(), Builder,
+                                         DoesConsume, Depth))
+      return Builder ? Builder->CreateTrunc(AV, V->getType()) : NonNull;
+    return nullptr;
+  }
+
   return nullptr;
 }
 
diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
index 393afc9..33add6d 100644
--- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
@@ -348,7 +348,7 @@ private:
   void instrumentGlobals();
 
   Value *getPC(IRBuilder<> &IRB);
-  Value *getSP(IRBuilder<> &IRB);
+  Value *getFP(IRBuilder<> &IRB);
   Value *getFrameRecordInfo(IRBuilder<> &IRB);
 
   void instrumentPersonalityFunctions();
@@ -1148,7 +1148,7 @@ Value *HWAddressSanitizer::getStackBaseTag(IRBuilder<> &IRB) {
   // Extract some entropy from the stack pointer for the tags.
   // Take bits 20..28 (ASLR entropy) and xor with bits 0..8 (these differ
   // between functions).
-  Value *StackPointerLong = getSP(IRB);
+  Value *StackPointerLong = getFP(IRB);
   Value *StackTag =
       applyTagMask(IRB, IRB.CreateXor(StackPointerLong,
                                       IRB.CreateLShr(StackPointerLong, 20)));
@@ -1165,7 +1165,7 @@ Value *HWAddressSanitizer::getAllocaTag(IRBuilder<> &IRB, Value *StackTag,
 }
 
 Value *HWAddressSanitizer::getUARTag(IRBuilder<> &IRB) {
-  Value *StackPointerLong = getSP(IRB);
+  Value *StackPointerLong = getFP(IRB);
   Value *UARTag =
       applyTagMask(IRB, IRB.CreateLShr(StackPointerLong, PointerTagShift));
 
@@ -1232,7 +1232,7 @@ Value *HWAddressSanitizer::getPC(IRBuilder<> &IRB) {
   return IRB.CreatePtrToInt(IRB.GetInsertBlock()->getParent(), IntptrTy);
 }
 
-Value *HWAddressSanitizer::getSP(IRBuilder<> &IRB) {
+Value *HWAddressSanitizer::getFP(IRBuilder<> &IRB) {
   if (!CachedSP) {
     // FIXME: use addressofreturnaddress (but implement it in aarch64 backend
     // first).
@@ -1251,7 +1251,7 @@ Value *HWAddressSanitizer::getSP(IRBuilder<> &IRB) {
 Value *HWAddressSanitizer::getFrameRecordInfo(IRBuilder<> &IRB) {
   // Prepare ring buffer data.
   Value *PC = getPC(IRB);
-  Value *SP = getSP(IRB);
+  Value *SP = getFP(IRB);
 
   // Mix SP and PC.
   // Assumptions:
diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
index db05c63..9b6a39e 100644
--- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
@@ -499,6 +499,8 @@ static Decomposition decompose(Value *V,
   if (!Ty->isIntegerTy() || Ty->getIntegerBitWidth() > 64)
     return V;
 
+  bool IsKnownNonNegative = false;
+
   // Decompose \p V used with a signed predicate.
   if (IsSigned) {
     if (auto *CI = dyn_cast<ConstantInt>(V)) {
@@ -507,6 +509,14 @@ static Decomposition decompose(Value *V,
     }
     Value *Op0;
     Value *Op1;
+
+    if (match(V, m_SExt(m_Value(Op0))))
+      V = Op0;
+    else if (match(V, m_NNegZExt(m_Value(Op0)))) {
+      V = Op0;
+      IsKnownNonNegative = true;
+    }
+
     if (match(V, m_NSWAdd(m_Value(Op0), m_Value(Op1))))
       return MergeResults(Op0, Op1, IsSigned);
 
@@ -529,7 +539,7 @@ static Decomposition decompose(Value *V,
       }
     }
 
-    return V;
+    return {V, IsKnownNonNegative};
   }
 
   if (auto *CI = dyn_cast<ConstantInt>(V)) {
@@ -539,7 +549,6 @@ static Decomposition decompose(Value *V,
   }
 
   Value *Op0;
-  bool IsKnownNonNegative = false;
   if (match(V, m_ZExt(m_Value(Op0)))) {
     IsKnownNonNegative = true;
     V = Op0;
diff --git a/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
index 9235850..6ce9eb3 100644
--- a/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
+++ b/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
@@ -47,11 +47,6 @@ using namespace llvm;
 
 #define DEBUG_TYPE "correlated-value-propagation"
 
-static cl::opt<bool> CanonicalizeICmpPredicatesToUnsigned(
-    "canonicalize-icmp-predicates-to-unsigned", cl::init(true), cl::Hidden,
-    cl::desc("Enables canonicalization of signed relational predicates to "
-             "unsigned (e.g. sgt => ugt)"));
-
 STATISTIC(NumPhis,      "Number of phis propagated");
 STATISTIC(NumPhiCommon, "Number of phis deleted via common incoming value");
 STATISTIC(NumSelects,   "Number of selects propagated");
@@ -90,6 +85,8 @@ STATISTIC(NumSaturating,
     "Number of saturating arithmetics converted to normal arithmetics");
 STATISTIC(NumNonNull, "Number of function pointer arguments marked non-null");
 STATISTIC(NumMinMax, "Number of llvm.[us]{min,max} intrinsics removed");
+STATISTIC(NumSMinMax,
+          "Number of llvm.s{min,max} intrinsics simplified to unsigned");
 STATISTIC(NumUDivURemsNarrowedExpanded,
           "Number of bound udiv's/urem's expanded");
 STATISTIC(NumZExt, "Number of non-negative deductions");
@@ -289,9 +286,6 @@ static bool processPHI(PHINode *P, LazyValueInfo *LVI, DominatorTree *DT,
 }
 
 static bool processICmp(ICmpInst *Cmp, LazyValueInfo *LVI) {
-  if (!CanonicalizeICmpPredicatesToUnsigned)
-    return false;
-
   // Only for signed relational comparisons of scalar integers.
   if (Cmp->getType()->isVectorTy() ||
       !Cmp->getOperand(0)->getType()->isIntegerTy())
@@ -528,17 +522,40 @@ static bool processAbsIntrinsic(IntrinsicInst *II, LazyValueInfo *LVI) {
 }
 
 // See if this min/max intrinsic always picks it's one specific operand.
+// If not, check whether we can canonicalize signed minmax into unsigned version
 static bool processMinMaxIntrinsic(MinMaxIntrinsic *MM, LazyValueInfo *LVI) {
   CmpInst::Predicate Pred = CmpInst::getNonStrictPredicate(MM->getPredicate());
-  LazyValueInfo::Tristate Result = LVI->getPredicateAt(
-      Pred, MM->getLHS(), MM->getRHS(), MM, /*UseBlockValue=*/true);
-  if (Result == LazyValueInfo::Unknown)
-    return false;
+  ConstantRange LHS_CR = LVI->getConstantRangeAtUse(MM->getOperandUse(0),
+                                                    /*UndefAllowed*/ false);
+  ConstantRange RHS_CR = LVI->getConstantRangeAtUse(MM->getOperandUse(1),
+                                                    /*UndefAllowed*/ false);
+  if (LHS_CR.icmp(Pred, RHS_CR)) {
+    ++NumMinMax;
+    MM->replaceAllUsesWith(MM->getLHS());
+    MM->eraseFromParent();
+    return true;
+  }
+  if (RHS_CR.icmp(Pred, LHS_CR)) {
+    ++NumMinMax;
+    MM->replaceAllUsesWith(MM->getRHS());
+    MM->eraseFromParent();
+    return true;
+  }
 
-  ++NumMinMax;
-  MM->replaceAllUsesWith(MM->getOperand(!Result));
-  MM->eraseFromParent();
-  return true;
+  if (MM->isSigned() &&
+      ConstantRange::areInsensitiveToSignednessOfICmpPredicate(LHS_CR,
+                                                               RHS_CR)) {
+    ++NumSMinMax;
+    IRBuilder<> B(MM);
+    MM->replaceAllUsesWith(B.CreateBinaryIntrinsic(
+        MM->getIntrinsicID() == Intrinsic::smin ? Intrinsic::umin
+                                                : Intrinsic::umax,
+        MM->getLHS(), MM->getRHS()));
+    MM->eraseFromParent();
+    return true;
+  }
+
+  return false;
 }
 
 // Rewrite this with.overflow intrinsic as non-overflowing.
diff --git a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
index 1bf50d7..851eab0 100644
--- a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
+++ b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
@@ -1221,6 +1221,7 @@ bool InferAddressSpacesImpl::rewriteWithNewAddressSpaces(
     Value::use_iterator I, E, Next;
     for (I = V->use_begin(), E = V->use_end(); I != E;) {
       Use &U = *I;
+      User *CurUser = U.getUser();
 
       // Some users may see the same pointer operand in multiple operands. Skip
       // to the next instruction.
@@ -1231,11 +1232,10 @@ bool InferAddressSpacesImpl::rewriteWithNewAddressSpaces(
         // If V is used as the pointer operand of a compatible memory operation,
         // sets the pointer operand to NewV. This replacement does not change
         // the element type, so the resultant load/store is still valid.
-        U.set(NewV);
+        CurUser->replaceUsesOfWith(V, NewV);
         continue;
       }
 
-      User *CurUser = U.getUser();
       // Skip if the current user is the new value itself.
       if (CurUser == NewV)
         continue;
@@ -1311,10 +1311,13 @@ bool InferAddressSpacesImpl::rewriteWithNewAddressSpaces(
 
           while (isa<PHINode>(InsertPos))
             ++InsertPos;
-          U.set(new AddrSpaceCastInst(NewV, V->getType(), "", &*InsertPos));
+          // This instruction may contain multiple uses of V, update them all.
+          CurUser->replaceUsesOfWith(
+              V, new AddrSpaceCastInst(NewV, V->getType(), "", &*InsertPos));
         } else {
-          U.set(ConstantExpr::getAddrSpaceCast(cast<Constant>(NewV),
-                                               V->getType()));
+          CurUser->replaceUsesOfWith(
+              V, ConstantExpr::getAddrSpaceCast(cast<Constant>(NewV),
+                                                V->getType()));
         }
       }
     }
diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 627c863..08021f3b 100644
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -7033,7 +7033,6 @@ static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE,
       // SCEVExpander for both use in preheader and latch
       const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
       SCEVExpander Expander(SE, DL, "lsr_fold_term_cond");
-      SCEVExpanderCleaner ExpCleaner(Expander);
 
       assert(Expander.isSafeToExpand(TermValueS) &&
              "Terminating value was checked safe in canFoldTerminatingCondition");
@@ -7064,10 +7063,9 @@ static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE,
 
       BI->setCondition(NewTermCond);
 
+      Expander.clear();
       OldTermCond->eraseFromParent();
       DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
-
-      ExpCleaner.markResultUsed();
     }
   }
 
diff --git a/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp b/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
index f4f3070..260f31b 100644
--- a/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
+++ b/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
@@ -291,9 +291,9 @@ bool SpeculativeExecutionPass::considerHoistingFromTo(
   InstructionCost TotalSpeculationCost = 0;
   unsigned NotHoistedInstCount = 0;
   for (const auto &I : FromBlock) {
-    // Make note of any DPValues that need hoisting.
-    for (DbgRecord &DR : I.getDbgValueRange()) {
-      DPValue &DPV = cast<DPValue>(DR);
+    // Make note of any DPValues that need hoisting. DPLabels
+    // get left behind just like llvm.dbg.labels.
+    for (DPValue &DPV : DPValue::filter(I.getDbgValueRange())) {
       if (HasNoUnhoistedInstr(DPV.location_ops()))
         DPValuesToHoist[DPV.getInstruction()].push_back(&DPV);
     }
diff --git a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
index 7fd6759..5bb109a 100644
--- a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
+++ b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
@@ -386,7 +386,15 @@ static bool DPValuesRemoveRedundantDbgInstrsUsingBackwardScan(BasicBlock *BB) {
   SmallVector<DPValue *, 8> ToBeRemoved;
   SmallDenseSet<DebugVariable> VariableSet;
   for (auto &I : reverse(*BB)) {
-    for (DPValue &DPV : reverse(DPValue::filter(I.getDbgValueRange()))) {
+    for (DbgRecord &DR : reverse(I.getDbgValueRange())) {
+      if (isa<DPLabel>(DR)) {
+        // Emulate existing behaviour (see comment below for dbg.declares).
+        // FIXME: Don't do this.
+        VariableSet.clear();
+        continue;
+      }
+
+      DPValue &DPV = cast<DPValue>(DR);
       // Skip declare-type records, as the debug intrinsic method only works
       // on dbg.value intrinsics.
       if (DPV.getType() == DPValue::LocationType::Declare) {
diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
index 8ebcf0c..bab0651 100644
--- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
@@ -1585,8 +1585,30 @@ static void fixupDebugInfoPostExtraction(Function &OldFunc, Function &NewFunc,
     return cast<DILocalVariable>(NewVar);
   };
 
-  auto UpdateDPValuesOnInst = [&](Instruction &I) -> void {
-    for (DPValue &DPV : DPValue::filter(I.getDbgValueRange())) {
+  auto UpdateDbgLabel = [&](auto *LabelRecord) {
+    // Point the label record to a fresh label within the new function if
+    // the record was not inlined from some other function.
+    if (LabelRecord->getDebugLoc().getInlinedAt())
+      return;
+    DILabel *OldLabel = LabelRecord->getLabel();
+    DINode *&NewLabel = RemappedMetadata[OldLabel];
+    if (!NewLabel) {
+      DILocalScope *NewScope = DILocalScope::cloneScopeForSubprogram(
+          *OldLabel->getScope(), *NewSP, Ctx, Cache);
+      NewLabel = DILabel::get(Ctx, NewScope, OldLabel->getName(),
+                              OldLabel->getFile(), OldLabel->getLine());
+    }
+    LabelRecord->setLabel(cast<DILabel>(NewLabel));
+  };
+
+  auto UpdateDbgRecordsOnInst = [&](Instruction &I) -> void {
+    for (DbgRecord &DR : I.getDbgValueRange()) {
+      if (DPLabel *DPL = dyn_cast<DPLabel>(&DR)) {
+        UpdateDbgLabel(DPL);
+        continue;
+      }
+
+      DPValue &DPV = cast<DPValue>(DR);
       // Apply the two updates that dbg.values get: invalid operands, and
       // variable metadata fixup.
       if (any_of(DPV.location_ops(), IsInvalidLocation)) {
@@ -1599,13 +1621,11 @@ static void fixupDebugInfoPostExtraction(Function &OldFunc, Function &NewFunc,
       }
       if (!DPV.getDebugLoc().getInlinedAt())
         DPV.setVariable(GetUpdatedDIVariable(DPV.getVariable()));
-      DPV.setDebugLoc(DebugLoc::replaceInlinedAtSubprogram(DPV.getDebugLoc(),
-                                                           *NewSP, Ctx, Cache));
     }
   };
 
   for (Instruction &I : instructions(NewFunc)) {
-    UpdateDPValuesOnInst(I);
+    UpdateDbgRecordsOnInst(I);
 
     auto *DII = dyn_cast<DbgInfoIntrinsic>(&I);
     if (!DII)
@@ -1614,17 +1634,7 @@ static void fixupDebugInfoPostExtraction(Function &OldFunc, Function &NewFunc,
     // Point the intrinsic to a fresh label within the new function if the
     // intrinsic was not inlined from some other function.
     if (auto *DLI = dyn_cast<DbgLabelInst>(&I)) {
-      if (DLI->getDebugLoc().getInlinedAt())
-        continue;
-      DILabel *OldLabel = DLI->getLabel();
-      DINode *&NewLabel = RemappedMetadata[OldLabel];
-      if (!NewLabel) {
-        DILocalScope *NewScope = DILocalScope::cloneScopeForSubprogram(
-            *OldLabel->getScope(), *NewSP, Ctx, Cache);
-        NewLabel = DILabel::get(Ctx, NewScope, OldLabel->getName(),
-                                OldLabel->getFile(), OldLabel->getLine());
-      }
-      DLI->setArgOperand(0, MetadataAsValue::get(Ctx, NewLabel));
+      UpdateDbgLabel(DLI);
       continue;
     }
 
@@ -1658,6 +1668,9 @@ static void fixupDebugInfoPostExtraction(Function &OldFunc, Function &NewFunc,
     if (const DebugLoc &DL = I.getDebugLoc())
       I.setDebugLoc(
           DebugLoc::replaceInlinedAtSubprogram(DL, *NewSP, Ctx, Cache));
+    for (DbgRecord &DR : I.getDbgValueRange())
+      DR.setDebugLoc(DebugLoc::replaceInlinedAtSubprogram(DR.getDebugLoc(),
+                                                          *NewSP, Ctx, Cache));
 
     // Loop info metadata may contain line locations. Fix them up.
     auto updateLoopInfoLoc = [&Ctx, &Cache, NewSP](Metadata *MD) -> Metadata * {
diff --git a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp
index 08fdd3b..2ff7c01 100644
--- a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp
+++ b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp
@@ -111,8 +111,7 @@ Instruction *getUntagLocationIfFunctionExit(Instruction &Inst) {
 
 void StackInfoBuilder::visit(Instruction &Inst) {
   // Visit non-intrinsic debug-info records attached to Inst.
-  for (DbgRecord &DR : Inst.getDbgValueRange()) {
-    DPValue &DPV = cast<DPValue>(DR);
+  for (DPValue &DPV : DPValue::filter(Inst.getDbgValueRange())) {
     auto AddIfInteresting = [&](Value *V) {
       if (auto *AI = dyn_cast_or_null<AllocaInst>(V)) {
         if (!isInterestingAlloca(*AI))
diff --git a/llvm/lib/Transforms/Utils/ValueMapper.cpp b/llvm/lib/Transforms/Utils/ValueMapper.cpp
index 6e46469..91ab279 100644
--- a/llvm/lib/Transforms/Utils/ValueMapper.cpp
+++ b/llvm/lib/Transforms/Utils/ValueMapper.cpp
@@ -538,6 +538,11 @@ Value *Mapper::mapValue(const Value *V) {
 }
 
 void Mapper::remapDPValue(DbgRecord &DR) {
+  if (DPLabel *DPL = dyn_cast<DPLabel>(&DR)) {
+    DPL->setLabel(cast<DILabel>(mapMetadata(DPL->getLabel())));
+    return;
+  }
+
   DPValue &V = cast<DPValue>(DR);
   // Remap variables and DILocations.
   auto *MappedVar = mapMetadata(V.getVariable());
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 4e33474..de4e56f 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2422,18 +2422,25 @@ private:
   /// \param TE Tree entry checked for permutation.
   /// \param VL List of scalars (a subset of the TE scalar), checked for
   /// permutations. Must form single-register vector.
+  /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
+  /// commands to build the mask using the original vector value, without
+  /// relying on the potential reordering.
   /// \returns ShuffleKind, if gathered values can be represented as shuffles of
   /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask.
   std::optional<TargetTransformInfo::ShuffleKind>
   isGatherShuffledSingleRegisterEntry(
       const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
-      SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part);
+      SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
+      bool ForOrder);
 
   /// Checks if the gathered \p VL can be represented as multi-register
   /// shuffle(s) of previous tree entries.
   /// \param TE Tree entry checked for permutation.
   /// \param VL List of scalars (a subset of the TE scalar), checked for
   /// permutations.
+  /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
+  /// commands to build the mask using the original vector value, without
+  /// relying on the potential reordering.
   /// \returns per-register series of ShuffleKind, if gathered values can be
   /// represented as shuffles of previous tree entries. \p Mask is filled with
   /// the shuffle mask (also on per-register base).
@@ -2441,7 +2448,7 @@ private:
   isGatherShuffledEntry(
       const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
       SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
-      unsigned NumParts);
+      unsigned NumParts, bool ForOrder = false);
 
   /// \returns the scalarization cost for this list of values. Assuming that
   /// this subtree gets vectorized, we may need to extract the values from the
@@ -3788,65 +3795,163 @@ static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
 std::optional<BoUpSLP::OrdersType>
 BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
   assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only.");
-  unsigned NumScalars = TE.Scalars.size();
+  // Try to find subvector extract/insert patterns and reorder only such
+  // patterns.
+  SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
+  Type *ScalarTy = GatheredScalars.front()->getType();
+  int NumScalars = GatheredScalars.size();
+  if (!isValidElementType(ScalarTy))
+    return std::nullopt;
+  auto *VecTy = FixedVectorType::get(ScalarTy, NumScalars);
+  int NumParts = TTI->getNumberOfParts(VecTy);
+  if (NumParts == 0 || NumParts >= NumScalars)
+    NumParts = 1;
+  SmallVector<int> ExtractMask;
+  SmallVector<int> Mask;
+  SmallVector<SmallVector<const TreeEntry *>> Entries;
+  SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> ExtractShuffles =
+      tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
+  SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles =
+      isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
+                            /*ForOrder=*/true);
+  // No shuffled operands - ignore.
+  if (GatherShuffles.empty() && ExtractShuffles.empty())
+    return std::nullopt;
   OrdersType CurrentOrder(NumScalars, NumScalars);
-  SmallVector<int> Positions;
-  SmallBitVector UsedPositions(NumScalars);
-  const TreeEntry *STE = nullptr;
-  // Try to find all gathered scalars that are gets vectorized in other
-  // vectorize node. Here we can have only one single tree vector node to
-  // correctly identify order of the gathered scalars.
-  for (unsigned I = 0; I < NumScalars; ++I) {
-    Value *V = TE.Scalars[I];
-    if (!isa<LoadInst, ExtractElementInst, ExtractValueInst>(V))
-      continue;
-    if (const auto *LocalSTE = getTreeEntry(V)) {
-      if (!STE)
-        STE = LocalSTE;
-      else if (STE != LocalSTE)
-        // Take the order only from the single vector node.
-        return std::nullopt;
-      unsigned Lane =
-          std::distance(STE->Scalars.begin(), find(STE->Scalars, V));
-      if (Lane >= NumScalars)
-        return std::nullopt;
-      if (CurrentOrder[Lane] != NumScalars) {
-        if (Lane != I)
+  if (GatherShuffles.size() == 1 &&
+      *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
+      Entries.front().front()->isSame(TE.Scalars)) {
+    // Perfect match in the graph, will reuse the previously vectorized
+    // node. Cost is 0.
+    std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
+    return CurrentOrder;
+  }
+  auto IsSplatMask = [](ArrayRef<int> Mask) {
+    int SingleElt = PoisonMaskElem;
+    return all_of(Mask, [&](int I) {
+      if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
+        SingleElt = I;
+      return I == PoisonMaskElem || I == SingleElt;
+    });
+  };
+  // Exclusive broadcast mask - ignore.
+  if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
+       (Entries.size() != 1 ||
+        Entries.front().front()->ReorderIndices.empty())) ||
+      (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
+    return std::nullopt;
+  SmallBitVector ShuffledSubMasks(NumParts);
+  auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
+                                  ArrayRef<int> Mask, int PartSz, int NumParts,
+                                  function_ref<unsigned(unsigned)> GetVF) {
+    for (int I : seq<int>(0, NumParts)) {
+      if (ShuffledSubMasks.test(I))
+        continue;
+      const int VF = GetVF(I);
+      if (VF == 0)
+        continue;
+      MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, PartSz);
+      // Shuffle of at least 2 vectors - ignore.
+      if (any_of(Slice, [&](int I) { return I != NumScalars; })) {
+        std::fill(Slice.begin(), Slice.end(), NumScalars);
+        ShuffledSubMasks.set(I);
+        continue;
+      }
+      // Try to include as much elements from the mask as possible.
+      int FirstMin = INT_MAX;
+      int SecondVecFound = false;
+      for (int K : seq<int>(0, PartSz)) {
+        int Idx = Mask[I * PartSz + K];
+        if (Idx == PoisonMaskElem) {
+          Value *V = GatheredScalars[I * PartSz + K];
+          if (isConstant(V) && !isa<PoisonValue>(V)) {
+            SecondVecFound = true;
+            break;
+          }
           continue;
-        UsedPositions.reset(CurrentOrder[Lane]);
+        }
+        if (Idx < VF) {
+          if (FirstMin > Idx)
+            FirstMin = Idx;
+        } else {
+          SecondVecFound = true;
+          break;
+        }
       }
-      // The partial identity (where only some elements of the gather node are
-      // in the identity order) is good.
-      CurrentOrder[Lane] = I;
-      UsedPositions.set(I);
-    }
-  }
-  // Need to keep the order if we have a vector entry and at least 2 scalars or
-  // the vectorized entry has just 2 scalars.
-  if (STE && (UsedPositions.count() > 1 || STE->Scalars.size() == 2)) {
-    auto &&IsIdentityOrder = [NumScalars](ArrayRef<unsigned> CurrentOrder) {
-      for (unsigned I = 0; I < NumScalars; ++I)
-        if (CurrentOrder[I] != I && CurrentOrder[I] != NumScalars)
-          return false;
-      return true;
-    };
-    if (IsIdentityOrder(CurrentOrder))
-      return OrdersType();
-    auto *It = CurrentOrder.begin();
-    for (unsigned I = 0; I < NumScalars;) {
-      if (UsedPositions.test(I)) {
-        ++I;
+      FirstMin = (FirstMin / PartSz) * PartSz;
+      // Shuffle of at least 2 vectors - ignore.
+      if (SecondVecFound) {
+        std::fill(Slice.begin(), Slice.end(), NumScalars);
+        ShuffledSubMasks.set(I);
         continue;
       }
-      if (*It == NumScalars) {
-        *It = I;
-        ++I;
+      for (int K : seq<int>(0, PartSz)) {
+        int Idx = Mask[I * PartSz + K];
+        if (Idx == PoisonMaskElem)
+          continue;
+        Idx -= FirstMin;
+        if (Idx >= PartSz) {
+          SecondVecFound = true;
+          break;
+        }
+        if (CurrentOrder[I * PartSz + Idx] >
+                static_cast<unsigned>(I * PartSz + K) &&
+            CurrentOrder[I * PartSz + Idx] !=
+                static_cast<unsigned>(I * PartSz + Idx))
+          CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
+      }
+      // Shuffle of at least 2 vectors - ignore.
+      if (SecondVecFound) {
+        std::fill(Slice.begin(), Slice.end(), NumScalars);
+        ShuffledSubMasks.set(I);
+        continue;
       }
-      ++It;
     }
-    return std::move(CurrentOrder);
+  };
+  int PartSz = NumScalars / NumParts;
+  if (!ExtractShuffles.empty())
+    TransformMaskToOrder(
+        CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
+          if (!ExtractShuffles[I])
+            return 0U;
+          unsigned VF = 0;
+          for (unsigned Idx : seq<unsigned>(0, PartSz)) {
+            int K = I * PartSz + Idx;
+            if (ExtractMask[K] == PoisonMaskElem)
+              continue;
+            if (!TE.ReuseShuffleIndices.empty())
+              K = TE.ReuseShuffleIndices[K];
+            if (!TE.ReorderIndices.empty())
+              K = std::distance(TE.ReorderIndices.begin(),
+                                find(TE.ReorderIndices, K));
+            auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
+            if (!EI)
+              continue;
+            VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
+                                  ->getElementCount()
+                                  .getKnownMinValue());
+          }
+          return VF;
+        });
+  // Check special corner case - single shuffle of the same entry.
+  if (GatherShuffles.size() == 1 && NumParts != 1) {
+    if (ShuffledSubMasks.any())
+      return std::nullopt;
+    PartSz = NumScalars;
+    NumParts = 1;
   }
-  return std::nullopt;
+  if (!Entries.empty())
+    TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
+      if (!GatherShuffles[I])
+        return 0U;
+      return std::max(Entries[I].front()->getVectorFactor(),
+                      Entries[I].back()->getVectorFactor());
+    });
+  int NumUndefs =
+      count_if(CurrentOrder, [&](int Idx) { return Idx == NumScalars; });
+  if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
+    return std::nullopt;
+  return std::move(CurrentOrder);
 }
 
 namespace {
@@ -4168,9 +4273,59 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
     //                           0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because
     //                           element 3 is used twice in the second submask.
     unsigned Sz = TE.Scalars.size();
-    if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
-                                                     Sz))
+    if (TE.State == TreeEntry::NeedToGather) {
+      if (std::optional<OrdersType> CurrentOrder =
+              findReusedOrderedScalars(TE)) {
+        SmallVector<int> Mask;
+        fixupOrderingIndices(*CurrentOrder);
+        inversePermutation(*CurrentOrder, Mask);
+        ::addMask(Mask, TE.ReuseShuffleIndices);
+        OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
+        unsigned Sz = TE.Scalars.size();
+        for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
+          for (auto [I, Idx] : enumerate(ArrayRef(Mask).slice(K * Sz, Sz)))
+            if (Idx != PoisonMaskElem)
+              Res[Idx + K * Sz] = I + K * Sz;
+        }
+        return std::move(Res);
+      }
+    }
+    if (Sz == 2 && TE.getVectorFactor() == 4 &&
+        TTI->getNumberOfParts(FixedVectorType::get(
+            TE.Scalars.front()->getType(), 2 * TE.getVectorFactor())) == 1)
       return std::nullopt;
+    if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
+                                                     Sz)) {
+      SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
+      if (TE.ReorderIndices.empty())
+        std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
+      else
+        inversePermutation(TE.ReorderIndices, ReorderMask);
+      ::addMask(ReorderMask, TE.ReuseShuffleIndices);
+      unsigned VF = ReorderMask.size();
+      OrdersType ResOrder(VF, VF);
+      unsigned NumParts = VF / Sz;
+      SmallBitVector UsedVals(NumParts);
+      for (unsigned I = 0; I < VF; I += Sz) {
+        int Val = PoisonMaskElem;
+        unsigned UndefCnt = 0;
+        if (any_of(ArrayRef(ReorderMask).slice(I, Sz),
+                   [&](int Idx) {
+                     if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
+                       Val = Idx;
+                     if (Idx == PoisonMaskElem)
+                       ++UndefCnt;
+                     return Idx != PoisonMaskElem && Idx != Val;
+                   }) ||
+            Val >= static_cast<int>(NumParts) || UsedVals.test(Val) ||
+            UndefCnt > Sz / 2)
+          return std::nullopt;
+        UsedVals.set(Val);
+        for (unsigned K = 0; K < NumParts; ++K)
+          ResOrder[Val + Sz * K] = I + K;
+      }
+      return std::move(ResOrder);
+    }
     unsigned VF = TE.getVectorFactor();
     // Try build correct order for extractelement instructions.
     SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
@@ -4208,7 +4363,8 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
       transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
       std::advance(It, Sz);
     }
-    if (all_of(enumerate(ResOrder),
+    if (TE.State == TreeEntry::NeedToGather &&
+        all_of(enumerate(ResOrder),
                [](const auto &Data) { return Data.index() == Data.value(); }))
       return std::nullopt; // No need to reorder.
     return std::move(ResOrder);
@@ -4298,11 +4454,8 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
       OrdersType CurrentOrder;
       bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder,
                                    /*ResizeAllowed=*/true);
-      if (Reuse || !CurrentOrder.empty()) {
-        if (!CurrentOrder.empty())
-          fixupOrderingIndices(CurrentOrder);
+      if (Reuse || !CurrentOrder.empty())
         return std::move(CurrentOrder);
-      }
     }
     // If the gather node is <undef, v, .., poison> and
     // insertelement poison, v, 0 [+ permute]
@@ -4335,8 +4488,11 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
         InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
             Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx,
             PoisonValue::get(Ty), *It);
-        if (InsertFirstCost + PermuteCost < InsertIdxCost)
+        if (InsertFirstCost + PermuteCost < InsertIdxCost) {
+          OrdersType Order(Sz, Sz);
+          Order[Idx] = 0;
           return std::move(Order);
+        }
       }
     }
     if (isSplat(TE.Scalars))
@@ -4392,6 +4548,28 @@ void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
     std::iota(It, std::next(It, Sz), 0);
 }
 
+static void combineOrders(MutableArrayRef<unsigned> Order,
+                          ArrayRef<unsigned> SecondaryOrder) {
+  assert((SecondaryOrder.empty() || Order.size() == SecondaryOrder.size()) &&
+         "Expected same size of orders");
+  unsigned Sz = Order.size();
+  SmallBitVector UsedIndices(Sz);
+  for (unsigned Idx : seq<unsigned>(0, Sz)) {
+    if (Order[Idx] != Sz)
+      UsedIndices.set(Order[Idx]);
+  }
+  if (SecondaryOrder.empty()) {
+    for (unsigned Idx : seq<unsigned>(0, Sz))
+      if (Order[Idx] == Sz && !UsedIndices.test(Idx))
+        Order[Idx] = Idx;
+  } else {
+    for (unsigned Idx : seq<unsigned>(0, Sz))
+      if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
+          !UsedIndices.test(SecondaryOrder[Idx]))
+        Order[Idx] = SecondaryOrder[Idx];
+  }
+}
+
 void BoUpSLP::reorderTopToBottom() {
   // Maps VF to the graph nodes.
   DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
@@ -4560,18 +4738,46 @@ void BoUpSLP::reorderTopToBottom() {
     }
     if (OrdersUses.empty())
       continue;
+    auto IsIdentityOrder = [](ArrayRef<unsigned> Order) {
+      const unsigned Sz = Order.size();
+      for (unsigned Idx : seq<unsigned>(0, Sz))
+        if (Idx != Order[Idx] && Order[Idx] != Sz)
+          return false;
+      return true;
+    };
     // Choose the most used order.
-    ArrayRef<unsigned> BestOrder = OrdersUses.front().first;
-    unsigned Cnt = OrdersUses.front().second;
-    for (const auto &Pair : drop_begin(OrdersUses)) {
-      if (Cnt < Pair.second || (Cnt == Pair.second && Pair.first.empty())) {
+    unsigned IdentityCnt = 0;
+    unsigned FilledIdentityCnt = 0;
+    OrdersType IdentityOrder(VF, VF);
+    for (auto &Pair : OrdersUses) {
+      if (Pair.first.empty() || IsIdentityOrder(Pair.first)) {
+        if (!Pair.first.empty())
+          FilledIdentityCnt += Pair.second;
+        IdentityCnt += Pair.second;
+        combineOrders(IdentityOrder, Pair.first);
+      }
+    }
+    MutableArrayRef<unsigned> BestOrder = IdentityOrder;
+    unsigned Cnt = IdentityCnt;
+    for (auto &Pair : OrdersUses) {
+      // Prefer identity order. But, if filled identity found (non-empty order)
+      // with same number of uses, as the new candidate order, we can choose
+      // this candidate order.
+      if (Cnt < Pair.second ||
+          (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
+           Cnt == Pair.second && !BestOrder.empty() &&
+           IsIdentityOrder(BestOrder))) {
+        combineOrders(Pair.first, BestOrder);
         BestOrder = Pair.first;
         Cnt = Pair.second;
+      } else {
+        combineOrders(BestOrder, Pair.first);
       }
     }
     // Set order of the user node.
-    if (BestOrder.empty())
+    if (IsIdentityOrder(BestOrder))
       continue;
+    fixupOrderingIndices(BestOrder);
     SmallVector<int> Mask;
     inversePermutation(BestOrder, Mask);
     SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
@@ -4685,7 +4891,7 @@ bool BoUpSLP::canReorderOperands(
 
 void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
   SetVector<TreeEntry *> OrderedEntries;
-  DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
+  DenseSet<const TreeEntry *> GathersToOrders;
   // Find all reorderable leaf nodes with the given VF.
   // Currently the are vectorized loads,extracts without alternate operands +
   // some gathering of extracts.
@@ -4700,7 +4906,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
       if (!(TE->State == TreeEntry::Vectorize ||
             TE->State == TreeEntry::StridedVectorize) ||
           !TE->ReuseShuffleIndices.empty())
-        GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
+        GathersToOrders.insert(TE.get());
     }
   }
 
@@ -4718,7 +4924,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
       if (!(TE->State == TreeEntry::Vectorize ||
             TE->State == TreeEntry::StridedVectorize ||
             (TE->State == TreeEntry::NeedToGather &&
-             GathersToOrders.count(TE))) ||
+             GathersToOrders.contains(TE))) ||
           TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
           !all_of(drop_begin(TE->UserTreeIndices),
                   [TE](const EdgeInfo &EI) {
@@ -4775,9 +4981,14 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
         const auto Order = [&]() -> const OrdersType {
           if (OpTE->State == TreeEntry::NeedToGather ||
               !OpTE->ReuseShuffleIndices.empty())
-            return GathersToOrders.find(OpTE)->second;
+            return getReorderingData(*OpTE, /*TopToBottom=*/false)
+                .value_or(OrdersType(1));
           return OpTE->ReorderIndices;
         }();
+        // The order is partially ordered, skip it in favor of fully non-ordered
+        // orders.
+        if (Order.size() == 1)
+          continue;
         unsigned NumOps = count_if(
             Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
               return P.second == OpTE;
@@ -4805,9 +5016,10 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
               (IgnoreReorder && TE->Idx == 0))
             return true;
           if (TE->State == TreeEntry::NeedToGather) {
-            auto It = GathersToOrders.find(TE);
-            if (It != GathersToOrders.end())
-              return !It->second.empty();
+            if (GathersToOrders.contains(TE))
+              return !getReorderingData(*TE, /*TopToBottom=*/false)
+                          .value_or(OrdersType(1))
+                          .empty();
             return true;
           }
           return false;
@@ -4839,21 +5051,49 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
             ++Res.first->second;
         }
       }
-      // Choose the best order.
-      ArrayRef<unsigned> BestOrder = OrdersUses.front().first;
-      unsigned Cnt = OrdersUses.front().second;
-      for (const auto &Pair : drop_begin(OrdersUses)) {
-        if (Cnt < Pair.second || (Cnt == Pair.second && Pair.first.empty())) {
+      if (OrdersUses.empty()) {
+        for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
+          OrderedEntries.remove(Op.second);
+        continue;
+      }
+      auto IsIdentityOrder = [](ArrayRef<unsigned> Order) {
+        const unsigned Sz = Order.size();
+        for (unsigned Idx : seq<unsigned>(0, Sz))
+          if (Idx != Order[Idx] && Order[Idx] != Sz)
+            return false;
+        return true;
+      };
+      // Choose the most used order.
+      unsigned IdentityCnt = 0;
+      unsigned VF = Data.second.front().second->getVectorFactor();
+      OrdersType IdentityOrder(VF, VF);
+      for (auto &Pair : OrdersUses) {
+        if (Pair.first.empty() || IsIdentityOrder(Pair.first)) {
+          IdentityCnt += Pair.second;
+          combineOrders(IdentityOrder, Pair.first);
+        }
+      }
+      MutableArrayRef<unsigned> BestOrder = IdentityOrder;
+      unsigned Cnt = IdentityCnt;
+      for (auto &Pair : OrdersUses) {
+        // Prefer identity order. But, if filled identity found (non-empty
+        // order) with same number of uses, as the new candidate order, we can
+        // choose this candidate order.
+        if (Cnt < Pair.second) {
+          combineOrders(Pair.first, BestOrder);
           BestOrder = Pair.first;
           Cnt = Pair.second;
+        } else {
+          combineOrders(BestOrder, Pair.first);
         }
       }
-      // Set order of the user node (reordering of operands and user nodes).
-      if (BestOrder.empty()) {
+      // Set order of the user node.
+      if (IsIdentityOrder(BestOrder)) {
         for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
           OrderedEntries.remove(Op.second);
         continue;
       }
+      fixupOrderingIndices(BestOrder);
       // Erase operands from OrderedEntries list and adjust their orders.
       VisitedOps.clear();
       SmallVector<int> Mask;
@@ -7472,6 +7712,20 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
       }
       V1 = Constant::getNullValue(
           FixedVectorType::get(E->Scalars.front()->getType(), CommonVF));
+      // Not identity/broadcast? Try to see if the original vector is better.
+      if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
+          CommonVF == CommonMask.size() &&
+          any_of(enumerate(CommonMask),
+                 [](const auto &&P) {
+                   return P.value() != PoisonMaskElem &&
+                          static_cast<unsigned>(P.value()) != P.index();
+                 }) &&
+          any_of(CommonMask,
+                 [](int Idx) { return Idx != PoisonMaskElem && Idx != 0; })) {
+        SmallVector<int> ReorderMask;
+        inversePermutation(E->ReorderIndices, ReorderMask);
+        ::addMask(CommonMask, ReorderMask);
+      }
     } else if (V1 && P2.isNull()) {
       // Shuffle single vector.
       CommonVF = cast<FixedVectorType>(V1->getType())->getNumElements();
@@ -9433,7 +9687,7 @@ BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
 std::optional<TargetTransformInfo::ShuffleKind>
 BoUpSLP::isGatherShuffledSingleRegisterEntry(
     const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
-    SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part) {
+    SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder) {
   Entries.clear();
   // TODO: currently checking only for Scalars in the tree entry, need to count
   // reused elements too for better cost estimation.
@@ -9532,6 +9786,21 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
       VToTEs.insert(TEPtr);
     }
     if (const TreeEntry *VTE = getTreeEntry(V)) {
+      if (ForOrder) {
+        if (VTE->State != TreeEntry::Vectorize) {
+          auto It = MultiNodeScalars.find(V);
+          if (It == MultiNodeScalars.end())
+            continue;
+          VTE = *It->getSecond().begin();
+          // Iterate through all vectorized nodes.
+          auto *MIt = find_if(It->getSecond(), [](const TreeEntry *MTE) {
+            return MTE->State == TreeEntry::Vectorize;
+          });
+          if (MIt == It->getSecond().end())
+            continue;
+          VTE = *MIt;
+        }
+      }
       Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
       if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
         continue;
@@ -9765,8 +10034,12 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
   // scalar in the list.
   for (const std::pair<unsigned, int> &Pair : EntryLanes) {
     unsigned Idx = Part * VL.size() + Pair.second;
-    Mask[Idx] = Pair.first * VF +
-                Entries[Pair.first]->findLaneForValue(VL[Pair.second]);
+    Mask[Idx] =
+        Pair.first * VF +
+        (ForOrder ? std::distance(
+                        Entries[Pair.first]->Scalars.begin(),
+                        find(Entries[Pair.first]->Scalars, VL[Pair.second]))
+                  : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
     IsIdentity &= Mask[Idx] == Pair.second;
   }
   switch (Entries.size()) {
@@ -9791,8 +10064,8 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
 BoUpSLP::isGatherShuffledEntry(
     const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
-    SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
-    unsigned NumParts) {
+    SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, unsigned NumParts,
+    bool ForOrder) {
   assert(NumParts > 0 && NumParts < VL.size() &&
          "Expected positive number of registers.");
   Entries.clear();
@@ -9810,7 +10083,8 @@ BoUpSLP::isGatherShuffledEntry(
     ArrayRef<Value *> SubVL = VL.slice(Part * SliceSize, SliceSize);
     SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
     std::optional<TTI::ShuffleKind> SubRes =
-        isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part);
+        isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
+                                            ForOrder);
     if (!SubRes)
       SubEntries.clear();
     Res.push_back(SubRes);
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 240d4bd..a2a203c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -385,9 +385,6 @@ struct VPTransformState {
 
   VPValue2ValueTy VPValue2Value;
 
-  /// Hold the canonical scalar IV of the vector loop (start=0, step=VF*UF).
-  Value *CanonicalIV = nullptr;
-
   /// Hold a pointer to InnerLoopVectorizer to reuse its IR generation methods.
   InnerLoopVectorizer *ILV;
 
diff --git a/llvm/runtimes/CMakeLists.txt b/llvm/runtimes/CMakeLists.txt
index 8c48d85..9b5e758 100644
--- a/llvm/runtimes/CMakeLists.txt
+++ b/llvm/runtimes/CMakeLists.txt
@@ -199,7 +199,7 @@ foreach(entry ${runtimes})
     list(APPEND prefixes "LLVM_LIBC")
     list(APPEND prefixes "LIBC_")
     # The `libc` project may require '-DCUDAToolkit_ROOT' in GPU mode.
-    if(LIBC_GPU_BUILD OR LIBC_GPU_ARCHITECTURES)
+    if(LLVM_LIBC_GPU_BUILD)
       list(APPEND prefixes "CUDA")
     endif()
   endif()
@@ -424,7 +424,7 @@ if(runtimes)
     endforeach()
   endif()
   if("libc" IN_LIST LLVM_ENABLE_PROJECTS AND
-      (LLVM_LIBC_FULL_BUILD OR LIBC_GPU_BUILD OR LIBC_GPU_ARCHITECTURES))
+      (LLVM_LIBC_FULL_BUILD OR LLVM_LIBC_GPU_BUILD))
     if(LIBC_HDRGEN_EXE)
       set(hdrgen_exe ${LIBC_HDRGEN_EXE})
     else()
@@ -441,7 +441,12 @@ if(runtimes)
     set(libc_cmake_args "-DLIBC_HDRGEN_EXE=${hdrgen_exe}"
                         "-DLLVM_LIBC_FULL_BUILD=ON")
     list(APPEND extra_deps ${hdrgen_deps})
-    if(LIBC_GPU_BUILD OR LIBC_GPU_ARCHITECTURES)
+    if(LLVM_LIBC_GPU_BUILD)
+      list(APPEND libc_cmake_args "-DLLVM_LIBC_GPU_BUILD=ON")
+      # The `libc` project may require '-DCUDAToolkit_ROOT' in GPU mode.
+      if(CUDAToolkit_ROOT)
+        list(APPEND libc_cmake_args "-DCUDAToolkit_ROOT=${CUDAToolkit_ROOT}")
+      endif()
       foreach(dep clang-offload-packager nvptx-arch amdgpu-arch)
         if(TARGET ${dep})
           list(APPEND extra_deps ${dep})
diff --git a/llvm/test/Analysis/CostModel/AArch64/arith-fp-frem.ll b/llvm/test/Analysis/CostModel/AArch64/arith-fp-frem.ll
index 20e0ef7..63149ad 100644
--- a/llvm/test/Analysis/CostModel/AArch64/arith-fp-frem.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/arith-fp-frem.ll
@@ -22,44 +22,44 @@ target triple = "aarch64-unknown-linux-gnu"
 
 define void @frem_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 ; NEON-NO-VECLIB-LABEL: 'frem_f64'
-; NEON-NO-VECLIB:  LV: Found an estimated cost of 2 for VF 1 For instruction: %res = frem double %in, %in
-; NEON-NO-VECLIB:  LV: Found an estimated cost of 8 for VF 2 For instruction: %res = frem double %in, %in
+; NEON-NO-VECLIB:  LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem double %in, %in
+; NEON-NO-VECLIB:  LV: Found an estimated cost of 24 for VF 2 For instruction: %res = frem double %in, %in
 ;
 ; SVE-NO-VECLIB-LABEL: 'frem_f64'
-; SVE-NO-VECLIB:  LV: Found an estimated cost of 2 for VF 1 For instruction: %res = frem double %in, %in
-; SVE-NO-VECLIB:  LV: Found an estimated cost of 8 for VF 2 For instruction: %res = frem double %in, %in
+; SVE-NO-VECLIB:  LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem double %in, %in
+; SVE-NO-VECLIB:  LV: Found an estimated cost of 24 for VF 2 For instruction: %res = frem double %in, %in
 ; SVE-NO-VECLIB:  LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem double %in, %in
 ; SVE-NO-VECLIB:  LV: Found an estimated cost of Invalid for VF vscale x 2 For instruction: %res = frem double %in, %in
 ;
 ; NEON-ARMPL-LABEL: 'frem_f64'
-; NEON-ARMPL:  LV: Found an estimated cost of 2 for VF 1 For instruction: %res = frem double %in, %in
-; NEON-ARMPL:  LV: Found an estimated cost of 8 for VF 2 For instruction: %res = frem double %in, %in
+; NEON-ARMPL:  LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem double %in, %in
+; NEON-ARMPL:  LV: Found an estimated cost of 10 for VF 2 For instruction: %res = frem double %in, %in
 ;
 ; NEON-SLEEF-LABEL: 'frem_f64'
-; NEON-SLEEF:  LV: Found an estimated cost of 2 for VF 1 For instruction: %res = frem double %in, %in
-; NEON-SLEEF:  LV: Found an estimated cost of 8 for VF 2 For instruction: %res = frem double %in, %in
+; NEON-SLEEF:  LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem double %in, %in
+; NEON-SLEEF:  LV: Found an estimated cost of 10 for VF 2 For instruction: %res = frem double %in, %in
 ;
 ; SVE-ARMPL-LABEL: 'frem_f64'
-; SVE-ARMPL:  LV: Found an estimated cost of 2 for VF 1 For instruction: %res = frem double %in, %in
-; SVE-ARMPL:  LV: Found an estimated cost of 8 for VF 2 For instruction: %res = frem double %in, %in
+; SVE-ARMPL:  LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem double %in, %in
+; SVE-ARMPL:  LV: Found an estimated cost of 10 for VF 2 For instruction: %res = frem double %in, %in
 ; SVE-ARMPL:  LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem double %in, %in
 ; SVE-ARMPL:  LV: Found an estimated cost of 10 for VF vscale x 2 For instruction: %res = frem double %in, %in
 ;
 ; SVE-SLEEF-LABEL: 'frem_f64'
-; SVE-SLEEF:  LV: Found an estimated cost of 2 for VF 1 For instruction: %res = frem double %in, %in
-; SVE-SLEEF:  LV: Found an estimated cost of 8 for VF 2 For instruction: %res = frem double %in, %in
+; SVE-SLEEF:  LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem double %in, %in
+; SVE-SLEEF:  LV: Found an estimated cost of 10 for VF 2 For instruction: %res = frem double %in, %in
 ; SVE-SLEEF:  LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem double %in, %in
 ; SVE-SLEEF:  LV: Found an estimated cost of 10 for VF vscale x 2 For instruction: %res = frem double %in, %in
 ;
 ; SVE-ARMPL-TAILFOLD-LABEL: 'frem_f64'
-; SVE-ARMPL-TAILFOLD:  LV: Found an estimated cost of 2 for VF 1 For instruction: %res = frem double %in, %in
-; SVE-ARMPL-TAILFOLD:  LV: Found an estimated cost of 8 for VF 2 For instruction: %res = frem double %in, %in
+; SVE-ARMPL-TAILFOLD:  LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem double %in, %in
+; SVE-ARMPL-TAILFOLD:  LV: Found an estimated cost of 10 for VF 2 For instruction: %res = frem double %in, %in
 ; SVE-ARMPL-TAILFOLD:  LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem double %in, %in
 ; SVE-ARMPL-TAILFOLD:  LV: Found an estimated cost of 10 for VF vscale x 2 For instruction: %res = frem double %in, %in
 ;
 ; SVE-SLEEF-TAILFOLD-LABEL: 'frem_f64'
-; SVE-SLEEF-TAILFOLD:  LV: Found an estimated cost of 2 for VF 1 For instruction: %res = frem double %in, %in
-; SVE-SLEEF-TAILFOLD:  LV: Found an estimated cost of 8 for VF 2 For instruction: %res = frem double %in, %in
+; SVE-SLEEF-TAILFOLD:  LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem double %in, %in
+; SVE-SLEEF-TAILFOLD:  LV: Found an estimated cost of 10 for VF 2 For instruction: %res = frem double %in, %in
 ; SVE-SLEEF-TAILFOLD:  LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem double %in, %in
 ; SVE-SLEEF-TAILFOLD:  LV: Found an estimated cost of 10 for VF vscale x 2 For instruction: %res = frem double %in, %in
 ;
@@ -83,55 +83,55 @@ define void @frem_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 
 define void @frem_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 ; NEON-NO-VECLIB-LABEL: 'frem_f32'
-; NEON-NO-VECLIB:  LV: Found an estimated cost of 2 for VF 1 For instruction: %res = frem float %in, %in
-; NEON-NO-VECLIB:  LV: Found an estimated cost of 8 for VF 2 For instruction: %res = frem float %in, %in
-; NEON-NO-VECLIB:  LV: Found an estimated cost of 20 for VF 4 For instruction: %res = frem float %in, %in
+; NEON-NO-VECLIB:  LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem float %in, %in
+; NEON-NO-VECLIB:  LV: Found an estimated cost of 24 for VF 2 For instruction: %res = frem float %in, %in
+; NEON-NO-VECLIB:  LV: Found an estimated cost of 52 for VF 4 For instruction: %res = frem float %in, %in
 ;
 ; SVE-NO-VECLIB-LABEL: 'frem_f32'
-; SVE-NO-VECLIB:  LV: Found an estimated cost of 2 for VF 1 For instruction: %res = frem float %in, %in
-; SVE-NO-VECLIB:  LV: Found an estimated cost of 8 for VF 2 For instruction: %res = frem float %in, %in
-; SVE-NO-VECLIB:  LV: Found an estimated cost of 20 for VF 4 For instruction: %res = frem float %in, %in
+; SVE-NO-VECLIB:  LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem float %in, %in
+; SVE-NO-VECLIB:  LV: Found an estimated cost of 24 for VF 2 For instruction: %res = frem float %in, %in
+; SVE-NO-VECLIB:  LV: Found an estimated cost of 52 for VF 4 For instruction: %res = frem float %in, %in
 ; SVE-NO-VECLIB:  LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem float %in, %in
 ; SVE-NO-VECLIB:  LV: Found an estimated cost of Invalid for VF vscale x 2 For instruction: %res = frem float %in, %in
 ; SVE-NO-VECLIB:  LV: Found an estimated cost of Invalid for VF vscale x 4 For instruction: %res = frem float %in, %in
 ;
 ; NEON-ARMPL-LABEL: 'frem_f32'
-; NEON-ARMPL:  LV: Found an estimated cost of 2 for VF 1 For instruction: %res = frem float %in, %in
-; NEON-ARMPL:  LV: Found an estimated cost of 8 for VF 2 For instruction: %res = frem float %in, %in
+; NEON-ARMPL:  LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem float %in, %in
+; NEON-ARMPL:  LV: Found an estimated cost of 24 for VF 2 For instruction: %res = frem float %in, %in
 ; NEON-ARMPL:  LV: Found an estimated cost of 10 for VF 4 For instruction: %res = frem float %in, %in
 ;
 ; NEON-SLEEF-LABEL: 'frem_f32'
-; NEON-SLEEF:  LV: Found an estimated cost of 2 for VF 1 For instruction: %res = frem float %in, %in
-; NEON-SLEEF:  LV: Found an estimated cost of 8 for VF 2 For instruction: %res = frem float %in, %in
+; NEON-SLEEF:  LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem float %in, %in
+; NEON-SLEEF:  LV: Found an estimated cost of 24 for VF 2 For instruction: %res = frem float %in, %in
 ; NEON-SLEEF:  LV: Found an estimated cost of 10 for VF 4 For instruction: %res = frem float %in, %in
 ;
 ; SVE-ARMPL-LABEL: 'frem_f32'
-; SVE-ARMPL:  LV: Found an estimated cost of 2 for VF 1 For instruction: %res = frem float %in, %in
-; SVE-ARMPL:  LV: Found an estimated cost of 8 for VF 2 For instruction: %res = frem float %in, %in
+; SVE-ARMPL:  LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem float %in, %in
+; SVE-ARMPL:  LV: Found an estimated cost of 24 for VF 2 For instruction: %res = frem float %in, %in
 ; SVE-ARMPL:  LV: Found an estimated cost of 10 for VF 4 For instruction: %res = frem float %in, %in
 ; SVE-ARMPL:  LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem float %in, %in
 ; SVE-ARMPL:  LV: Found an estimated cost of Invalid for VF vscale x 2 For instruction: %res = frem float %in, %in
 ; SVE-ARMPL:  LV: Found an estimated cost of 10 for VF vscale x 4 For instruction: %res = frem float %in, %in
 ;
 ; SVE-SLEEF-LABEL: 'frem_f32'
-; SVE-SLEEF:  LV: Found an estimated cost of 2 for VF 1 For instruction: %res = frem float %in, %in
-; SVE-SLEEF:  LV: Found an estimated cost of 8 for VF 2 For instruction: %res = frem float %in, %in
+; SVE-SLEEF:  LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem float %in, %in
+; SVE-SLEEF:  LV: Found an estimated cost of 24 for VF 2 For instruction: %res = frem float %in, %in
 ; SVE-SLEEF:  LV: Found an estimated cost of 10 for VF 4 For instruction: %res = frem float %in, %in
 ; SVE-SLEEF:  LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem float %in, %in
 ; SVE-SLEEF:  LV: Found an estimated cost of Invalid for VF vscale x 2 For instruction: %res = frem float %in, %in
 ; SVE-SLEEF:  LV: Found an estimated cost of 10 for VF vscale x 4 For instruction: %res = frem float %in, %in
 ;
 ; SVE-ARMPL-TAILFOLD-LABEL: 'frem_f32'
-; SVE-ARMPL-TAILFOLD:  LV: Found an estimated cost of 2 for VF 1 For instruction: %res = frem float %in, %in
-; SVE-ARMPL-TAILFOLD:  LV: Found an estimated cost of 8 for VF 2 For instruction: %res = frem float %in, %in
+; SVE-ARMPL-TAILFOLD:  LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem float %in, %in
+; SVE-ARMPL-TAILFOLD:  LV: Found an estimated cost of 24 for VF 2 For instruction: %res = frem float %in, %in
 ; SVE-ARMPL-TAILFOLD:  LV: Found an estimated cost of 10 for VF 4 For instruction: %res = frem float %in, %in
 ; SVE-ARMPL-TAILFOLD:  LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem float %in, %in
 ; SVE-ARMPL-TAILFOLD:  LV: Found an estimated cost of Invalid for VF vscale x 2 For instruction: %res = frem float %in, %in
 ; SVE-ARMPL-TAILFOLD:  LV: Found an estimated cost of 10 for VF vscale x 4 For instruction: %res = frem float %in, %in
 ;
 ; SVE-SLEEF-TAILFOLD-LABEL: 'frem_f32'
-; SVE-SLEEF-TAILFOLD:  LV: Found an estimated cost of 2 for VF 1 For instruction: %res = frem float %in, %in
-; SVE-SLEEF-TAILFOLD:  LV: Found an estimated cost of 8 for VF 2 For instruction: %res = frem float %in, %in
+; SVE-SLEEF-TAILFOLD:  LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem float %in, %in
+; SVE-SLEEF-TAILFOLD:  LV: Found an estimated cost of 24 for VF 2 For instruction: %res = frem float %in, %in
 ; SVE-SLEEF-TAILFOLD:  LV: Found an estimated cost of 10 for VF 4 For instruction: %res = frem float %in, %in
 ; SVE-SLEEF-TAILFOLD:  LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem float %in, %in
 ; SVE-SLEEF-TAILFOLD:  LV: Found an estimated cost of Invalid for VF vscale x 2 For instruction: %res = frem float %in, %in
diff --git a/llvm/test/Analysis/CostModel/AArch64/arith-fp.ll b/llvm/test/Analysis/CostModel/AArch64/arith-fp.ll
index c352892..497ade4 100644
--- a/llvm/test/Analysis/CostModel/AArch64/arith-fp.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/arith-fp.ll
@@ -197,17 +197,17 @@ define i32 @fdiv(i32 %arg) {
 
 define i32 @frem(i32 %arg) {
 ; CHECK-LABEL: 'frem'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F16 = frem half undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4F16 = frem <4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V8F16 = frem <8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V16F16 = frem <16 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F32 = frem float undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F32 = frem <2 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4F32 = frem <4 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8F32 = frem <8 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F64 = frem double undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F64 = frem <2 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4F64 = frem <4 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %F16 = frem half undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V4F16 = frem <4 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V8F16 = frem <8 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 216 for instruction: %V16F16 = frem <16 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %F32 = frem float undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V2F32 = frem <2 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V4F32 = frem <4 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V8F32 = frem <8 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %F64 = frem double undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V2F64 = frem <2 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4F64 = frem <4 x double> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %F16 = frem half undef, undef
diff --git a/llvm/test/Analysis/ScalarEvolution/exit-count-select-safe.ll b/llvm/test/Analysis/ScalarEvolution/exit-count-select-safe.ll
index 2af1309..d3cec77 100644
--- a/llvm/test/Analysis/ScalarEvolution/exit-count-select-safe.ll
+++ b/llvm/test/Analysis/ScalarEvolution/exit-count-select-safe.ll
@@ -177,9 +177,9 @@ define i32 @logical_or_3ops_redundant_uminseq_operand(i32 %n, i32 %m, i32 %k) {
 ; CHECK-NEXT:    %umin = call i32 @llvm.umin.i32(i32 %n, i32 %m)
 ; CHECK-NEXT:    --> (%n umin %m) U: full-set S: full-set Exits: (%n umin %m) LoopDispositions: { %loop: Invariant }
 ; CHECK-NEXT:    %cond_p3 = select i1 %cond_p0, i1 true, i1 %cond_p1
-; CHECK-NEXT:    --> (true + ((true + %cond_p0) umin_seq (true + %cond_p1))) U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Variant }
+; CHECK-NEXT:    --> (true + ((true + %cond_p0) umin (true + %cond_p1))) U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Variant }
 ; CHECK-NEXT:    %cond = select i1 %cond_p3, i1 true, i1 %cond_p2
-; CHECK-NEXT:    --> (true + ((true + %cond_p0) umin_seq (true + %cond_p1) umin_seq (true + %cond_p2))) U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Variant }
+; CHECK-NEXT:    --> (true + (((true + %cond_p0) umin (true + %cond_p1)) umin_seq (true + %cond_p2))) U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Variant }
 ; CHECK-NEXT:  Determining loop execution counts for: @logical_or_3ops_redundant_uminseq_operand
 ; CHECK-NEXT:  Loop %loop: backedge-taken count is ((%n umin %m) umin_seq %k)
 ; CHECK-NEXT:  Loop %loop: constant max backedge-taken count is -1
diff --git a/llvm/test/Bitcode/constant-splat.ll b/llvm/test/Bitcode/constant-splat.ll
new file mode 100644
index 0000000..2bcc3dd
--- /dev/null
+++ b/llvm/test/Bitcode/constant-splat.ll
@@ -0,0 +1,76 @@
+; RUN: llvm-as -use-constant-int-for-fixed-length-splat \
+; RUN:         -use-constant-fp-for-fixed-length-splat \
+; RUN:         -use-constant-int-for-scalable-splat \
+; RUN:         -use-constant-fp-for-scalable-splat \
+; RUN:   < %s | llvm-dis -use-constant-int-for-fixed-length-splat \
+; RUN:                   -use-constant-fp-for-fixed-length-splat \
+; RUN:                   -use-constant-int-for-scalable-splat \
+; RUN:                   -use-constant-fp-for-scalable-splat \
+; RUN:   | FileCheck %s
+
+; CHECK: @constant.splat.i1 = constant <1 x i1> splat (i1 true)
+@constant.splat.i1 = constant <1 x i1> splat (i1 true)
+
+; CHECK: @constant.splat.i32 = constant <5 x i32> splat (i32 7)
+@constant.splat.i32 = constant <5 x i32> splat (i32 7)
+
+; CHECK: @constant.splat.i128 = constant <7 x i128> splat (i128 85070591730234615870450834276742070272)
+@constant.splat.i128 = constant <7 x i128> splat (i128 85070591730234615870450834276742070272)
+
+; CHECK: @constant.splat.f16 = constant <2 x half> splat (half 0xHBC00)
+@constant.splat.f16 = constant <2 x half> splat (half 0xHBC00)
+
+; CHECK: @constant.splat.f32 = constant <4 x float> splat (float -2.000000e+00)
+@constant.splat.f32 = constant <4 x float> splat (float -2.000000e+00)
+
+; CHECK: @constant.splat.f64 = constant <6 x double> splat (double -3.000000e+00)
+@constant.splat.f64 = constant <6 x double> splat (double -3.000000e+00)
+
+; CHECK: @constant.splat.128 = constant <8 x fp128> splat (fp128 0xL00000000000000018000000000000000)
+@constant.splat.128 = constant <8 x fp128> splat (fp128 0xL00000000000000018000000000000000)
+
+; CHECK: @constant.splat.bf16 = constant <1 x bfloat> splat (bfloat 0xRC0A0)
+@constant.splat.bf16 = constant <1 x bfloat> splat (bfloat 0xRC0A0)
+
+; CHECK: @constant.splat.x86_fp80 = constant <3 x x86_fp80> splat (x86_fp80 0xK4000C8F5C28F5C28F800)
+@constant.splat.x86_fp80 = constant <3 x x86_fp80> splat (x86_fp80 0xK4000C8F5C28F5C28F800)
+
+; CHECK: @constant.splat.ppc_fp128 = constant <7 x ppc_fp128> splat (ppc_fp128 0xM80000000000000000000000000000000)
+@constant.splat.ppc_fp128 = constant <7 x ppc_fp128> splat (ppc_fp128 0xM80000000000000000000000000000000)
+
+define void @add_fixed_lenth_vector_splat_i32(<4 x i32> %a) {
+; CHECK: %add = add <4 x i32> %a, splat (i32 137)
+  %add = add <4 x i32> %a, splat (i32 137)
+  ret void
+}
+
+define <4 x i32> @ret_fixed_lenth_vector_splat_i32() {
+; CHECK: ret <4 x i32> splat (i32 56)
+  ret <4 x i32> splat (i32 56)
+}
+
+define void @add_fixed_lenth_vector_splat_double(<vscale x 2 x double> %a) {
+; CHECK: %add = fadd <vscale x 2 x double> %a, splat (double 5.700000e+00)
+  %add = fadd <vscale x 2 x double> %a, splat (double 5.700000e+00)
+  ret void
+}
+
+define <vscale x 4 x i32> @ret_scalable_vector_splat_i32() {
+; CHECK: ret <vscale x 4 x i32> splat (i32 78)
+  ret <vscale x 4 x i32> splat (i32 78)
+}
+
+define <4 x i32> @canonical_constant_vector() {
+; CHECK: ret <4 x i32> splat (i32 7)
+  ret <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+}
+
+define <4 x i32> @canonical_fixed_lnegth_vector_zero() {
+; CHECK: ret <4 x i32> zeroinitializer
+  ret <4 x i32> zeroinitializer
+}
+
+define <vscale x 4 x i32> @canonical_scalable_lnegth_vector_zero() {
+; CHECK: ret <vscale x 4 x i32> zeroinitializer
+  ret <vscale x 4 x i32> zeroinitializer
+}
diff --git a/llvm/test/CodeGen/AArch64/16bit-float-promotion-with-nofp.ll b/llvm/test/CodeGen/AArch64/16bit-float-promotion-with-nofp.ll
new file mode 100644
index 0000000..bfe9ab8
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/16bit-float-promotion-with-nofp.ll
@@ -0,0 +1,31 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64 -mattr=-fp-armv8 -o - %s | FileCheck %s
+
+define half @f2h(float %a) {
+; CHECK-LABEL: f2h:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    bl __gnu_f2h_ieee
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %0 = fptrunc float %a to half
+  ret half %0
+}
+
+define bfloat @f2bfloat(float %a) {
+; CHECK-LABEL: f2bfloat:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    bl __truncsfbf2
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %0 = fptrunc float %a to bfloat
+  ret bfloat %0
+}
+
diff --git a/llvm/test/CodeGen/AArch64/addsub.ll b/llvm/test/CodeGen/AArch64/addsub.ll
index 1b86fe6..20215fe9 100644
--- a/llvm/test/CodeGen/AArch64/addsub.ll
+++ b/llvm/test/CodeGen/AArch64/addsub.ll
@@ -662,17 +662,13 @@ define dso_local i32 @_extract_crng_crng() {
 ; CHECK-NEXT:    cmn x8, #1272
 ; CHECK-NEXT:    b.pl .LBB36_3
 ; CHECK-NEXT:  .LBB36_2: // %if.then
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    .cfi_offset w30, -16
 ; CHECK-NEXT:    adrp x8, primary_crng
 ; CHECK-NEXT:    ldr w8, [x8, :lo12:primary_crng]
 ; CHECK-NEXT:    cmp w8, #0
 ; CHECK-NEXT:    adrp x8, input_pool
 ; CHECK-NEXT:    add x8, x8, :lo12:input_pool
 ; CHECK-NEXT:    csel x0, xzr, x8, eq
-; CHECK-NEXT:    bl crng_reseed
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    b crng_reseed
 ; CHECK-NEXT:  .LBB36_3: // %if.end
 ; CHECK-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks-local-linkage.ll b/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks-local-linkage.ll
index 00ae34b..217f08b 100644
--- a/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks-local-linkage.ll
+++ b/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks-local-linkage.ll
@@ -2,7 +2,8 @@
 
 ; Validates when local linkage functions get a thunk generated.
 
-; Being called does not cause a thunk to be generated.
+; Being called does not cause a thunk to be generated or the symbol name to be mangled.
+; CHECK-NOT: "#does_not_have_addr_taken":
 ; CHECK-NOT:  $ientry_thunk$cdecl$v$f;
 define internal void @does_not_have_addr_taken(float) nounwind {
   ret void
@@ -12,7 +13,8 @@ define void @calls_does_not_have_addr_taken() nounwind {
   ret void
 }
 
-; Having an address taken does cause a thunk to be generated.
+; Having an address taken does cause a thunk to be generated and the symbol name to be mangled.
+; CHECK: "#has_addr_taken":
 ; CHECK: $ientry_thunk$cdecl$v$i8;
 define internal void @has_addr_taken(i64) nounwind {
   ret void
diff --git a/llvm/test/CodeGen/AArch64/callbr-asm-obj-file.ll b/llvm/test/CodeGen/AArch64/callbr-asm-obj-file.ll
index 94041bf..e601f03 100644
--- a/llvm/test/CodeGen/AArch64/callbr-asm-obj-file.ll
+++ b/llvm/test/CodeGen/AArch64/callbr-asm-obj-file.ll
@@ -40,7 +40,7 @@ declare dso_local i32 @g(...) local_unnamed_addr
 declare dso_local i32 @i(...) local_unnamed_addr
 
 ; CHECK-LABEL: <test2>:
-; CHECK:         bl {{.*}} <test2+0x18>
+; CHECK:         b {{.*}} <test2+0x1c>
 ; CHECK-LABEL: <$d.5>:
 ; CHECK-LABEL: <$x.6>:
 ; CHECK-NEXT:    b {{.*}} <test2+0x18>
diff --git a/llvm/test/CodeGen/AArch64/framelayout-sve-calleesaves-fix.mir b/llvm/test/CodeGen/AArch64/framelayout-sve-calleesaves-fix.mir
index 3dba21d..aed3145 100644
--- a/llvm/test/CodeGen/AArch64/framelayout-sve-calleesaves-fix.mir
+++ b/llvm/test/CodeGen/AArch64/framelayout-sve-calleesaves-fix.mir
@@ -19,8 +19,8 @@
   ; CHECK-NEXT:    // implicit-def: $p4
   ; CHECK-NEXT:    addvl sp, sp, #1
   ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
-  ; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
   ; CHECK-NEXT:    ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
+  ; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
   ; CHECK-NEXT:    addvl sp, sp, #2
   ; CHECK-NEXT:    .cfi_def_cfa wsp, 16
   ; CHECK-NEXT:    .cfi_restore z8
diff --git a/llvm/test/CodeGen/AArch64/framelayout-sve.mir b/llvm/test/CodeGen/AArch64/framelayout-sve.mir
index 213d791..f7920e5 100644
--- a/llvm/test/CodeGen/AArch64/framelayout-sve.mir
+++ b/llvm/test/CodeGen/AArch64/framelayout-sve.mir
@@ -772,9 +772,9 @@ body:             |
 
 # CHECK:      $sp  = frame-destroy ADDXri $sp, 32, 0
 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22
-# CHECK-NEXT: $z10 = frame-destroy LDR_ZXI $sp, 0
+# CHECK-NEXT: $z10  = frame-destroy LDR_ZXI $sp, 0
 # CHECK-NEXT: $z9  = frame-destroy LDR_ZXI $sp, 1
-# CHECK-NEXT: $z8  = frame-destroy LDR_ZXI $sp, 2
+# CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 2
 # CHECK-NEXT: $sp  = frame-destroy ADDVL_XXI $sp, 3
 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 16
 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $z8
@@ -873,14 +873,14 @@ body:             |
 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x98, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22
 # CHECK:      $sp = frame-destroy ADDVL_XXI $sp, 1
 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22
-# CHECK:      $p15 = frame-destroy LDR_PXI $sp, 4
-# CHECK:      $p14 = frame-destroy LDR_PXI $sp, 5
-# CHECK:      $p5 = frame-destroy LDR_PXI $sp, 14
-# CHECK:      $p4 = frame-destroy LDR_PXI $sp, 15
 # CHECK:      $z23 = frame-destroy LDR_ZXI $sp, 2
 # CHECK:      $z22 = frame-destroy LDR_ZXI $sp, 3
 # CHECK:      $z9 = frame-destroy LDR_ZXI $sp, 16
 # CHECK:      $z8 = frame-destroy LDR_ZXI $sp, 17
+# CHECK:      $p15 = frame-destroy LDR_PXI $sp, 4
+# CHECK:      $p14 = frame-destroy LDR_PXI $sp, 5
+# CHECK:      $p5 = frame-destroy LDR_PXI $sp, 14
+# CHECK:      $p4 = frame-destroy LDR_PXI $sp, 15
 # CHECK:      $sp = frame-destroy ADDVL_XXI $sp, 18
 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 32
 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $z8
@@ -1037,14 +1037,14 @@ body:             |
 # CHECK-NEXT: $sp = frame-setup ANDXri killed $[[TMP]]
 
 # CHECK:      $sp = frame-destroy ADDVL_XXI $fp, -18
+# CHECK:      $z23 = frame-destroy LDR_ZXI $sp, 2
+# CHECK-NEXT: $z22 = frame-destroy LDR_ZXI $sp, 3
+# CHECK:      $z9 = frame-destroy LDR_ZXI $sp, 16
+# CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 17
 # CHECK-NEXT: $p15 = frame-destroy LDR_PXI $sp, 4
 # CHECK-NEXT: $p14 = frame-destroy LDR_PXI $sp, 5
 # CHECK:      $p5 = frame-destroy LDR_PXI $sp, 14
 # CHECK-NEXT: $p4 = frame-destroy LDR_PXI $sp, 15
-# CHECK-NEXT: $z23 = frame-destroy LDR_ZXI $sp, 2
-# CHECK-NEXT: $z22 = frame-destroy LDR_ZXI $sp, 3
-# CHECK:      $z9 = frame-destroy LDR_ZXI $sp, 16
-# CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 17
 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $z8
 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $z9
 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $z10
@@ -1198,10 +1198,10 @@ body:             |
 
 # CHECK:      $sp = frame-destroy ADDVL_XXI $sp, 7
 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22
-# CHECK-NEXT: $p15 = frame-destroy LDR_PXI $sp, 6
-# CHECK-NEXT: $p4 = frame-destroy LDR_PXI $sp, 7
 # CHECK-NEXT: $z23 = frame-destroy LDR_ZXI $sp, 1
 # CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 2
+# CHECK-NEXT: $p15 = frame-destroy LDR_PXI $sp, 6
+# CHECK-NEXT: $p4 = frame-destroy LDR_PXI $sp, 7
 # CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 3
 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 16
 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $z8
diff --git a/llvm/test/CodeGen/AArch64/reverse-csr-restore-seq.mir b/llvm/test/CodeGen/AArch64/reverse-csr-restore-seq.mir
deleted file mode 100644
index de4baec..0000000
--- a/llvm/test/CodeGen/AArch64/reverse-csr-restore-seq.mir
+++ /dev/null
@@ -1,101 +0,0 @@
-# RUN: llc -run-pass=prologepilog -reverse-csr-restore-seq -o - -mtriple=aarch64-- %s | FileCheck %s --check-prefixes=CHECK
-# RUN: llc -start-before=prologepilog -stop-after=aarch64-ldst-opt -reverse-csr-restore-seq -o - -mtriple=aarch64-- %s | FileCheck %s --check-prefixes=CHECK
-#
---- |
-
-  define void @foo() nounwind { entry: unreachable }
-
-  define void @bar() nounwind { entry: unreachable }
-
-  define void @baz() nounwind { entry: unreachable }
-
-...
----
-name:            foo
-# CHECK-LABEL: name: foo
-tracksRegLiveness: true
-body:             |
-  bb.0:
-    $x19 = IMPLICIT_DEF
-    $x20 = IMPLICIT_DEF
-    $x21 = IMPLICIT_DEF
-    $x22 = IMPLICIT_DEF
-    $x23 = IMPLICIT_DEF
-    $x24 = IMPLICIT_DEF
-    $x25 = IMPLICIT_DEF
-    $x26 = IMPLICIT_DEF
-
-  ; The local stack size is 0, so the last ldp in the sequence will also
-  ; restore the stack.
-  ; CHECK: $x24, $x23 = frame-destroy LDPXi $sp, 2
-  ; CHECK-NEXT: $x22, $x21 = frame-destroy LDPXi $sp, 4
-  ; CHECK-NEXT: $x20, $x19 = frame-destroy LDPXi $sp, 6
-
-  ; The ldp and the stack increment get merged even before
-  ; the load-store optimizer.
-  ; CHECK-NEXT: early-clobber $sp, $x26, $x25 = frame-destroy LDPXpost $sp, 8
-
-    RET_ReallyLR
-...
----
-name:            bar
-# CHECK-LABEL: name: bar
-tracksRegLiveness: true
-stack:
-  - { id : 0, size: 8, alignment: 4,
-  stack-id: default, callee-saved-register: '', callee-saved-restored: true,
-  local-offset: -4, debug-info-variable: '', debug-info-expression: '',
-  debug-info-location: '' }
-
-body:             |
-  bb.0:
-    $x19 = IMPLICIT_DEF
-    $x20 = IMPLICIT_DEF
-    $x21 = IMPLICIT_DEF
-    $x22 = IMPLICIT_DEF
-    $x23 = IMPLICIT_DEF
-    $x24 = IMPLICIT_DEF
-    $x25 = IMPLICIT_DEF
-    $x26 = IMPLICIT_DEF
-
-  ; The local stack size is not 0, and we can combine the CSR stack size with
-  ; the local stack size. This results in rewriting the offsets for all the
-  ; save/restores and forbids us to merge the stack adjustment and the last pop.
-  ; In this case, there is no point of moving the first CSR pair at the end.
-  ; We do it anyway, as it's a small price to pay for the resulting
-  ; simplification in the epilogue emission code.
-  ; CHECK:      $x24, $x23 = frame-destroy LDPXi $sp, 4
-  ; CHECK-NEXT: $x22, $x21 = frame-destroy LDPXi $sp, 6
-  ; CHECK-NEXT: $x20, $x19 = frame-destroy LDPXi $sp, 8
-  ; CHECK-NEXT: $x26, $x25 = frame-destroy LDPXi $sp, 2
-  ; CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 80, 0
-    RET_ReallyLR
-...
----
-# Check that the load from the offset 0 is moved at the end even when hasFP is
-# false.
-name:            baz
-# CHECK-LABEL: name: baz
-alignment:       4
-tracksRegLiveness: true
-frameInfo:
-  adjustsStack:    true
-  hasCalls:        true
-body:             |
-  bb.0:
-    successors: %bb.1
-
-    $x0 = IMPLICIT_DEF
-    $x20 = IMPLICIT_DEF
-    $x21 = IMPLICIT_DEF
-
-    ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
-    BL @foo, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp, implicit-def $x0
-    ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
-    B %bb.1
-
-  bb.1:
-   ; CHECK: $x21, $x20 = frame-destroy LDPXi $sp, 2
-   ; CHECK-NEXT: early-clobber $sp, $lr = frame-destroy LDRXpost $sp, 32
-    RET_ReallyLR
-...
diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
index 296f2be..6d2abf7 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
@@ -226,30 +226,30 @@ define <vscale x 2 x double> @streaming_compatible_with_scalable_vectors(<vscale
 ; CHECK-NEXT:    ldr z1, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    fadd z0.d, z1.d, z0.d
 ; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
@@ -318,30 +318,30 @@ define <vscale x 2 x i1> @streaming_compatible_with_predicate_vectors(<vscale x
 ; CHECK-NEXT:    ldr p1, [sp, #6, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    and p0.b, p1/z, p1.b, p0.b
 ; CHECK-NEXT:    addvl sp, sp, #1
-; CHECK-NEXT:    ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll
index 86918a59..de676ac 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll
@@ -187,30 +187,30 @@ define <vscale x 4 x i32> @smstart_clobber_sve(<vscale x 4 x i32> %x) nounwind {
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    ldr z0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    addvl sp, sp, #1
-; CHECK-NEXT:    ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
@@ -267,30 +267,30 @@ define <vscale x 4 x i32> @smstart_clobber_sve_duplicate(<vscale x 4 x i32> %x)
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    ldr z0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    addvl sp, sp, #1
-; CHECK-NEXT:    ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll
index b7119fc..ea7808d 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll
@@ -129,7 +129,6 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale x
 ; CONTIGUOUS-NEXT:    ldr z0, [sp]
 ; CONTIGUOUS-NEXT:    ldr z1, [sp, #1, mul vl]
 ; CONTIGUOUS-NEXT:    addvl sp, sp, #2
-; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -145,6 +144,7 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale x
 ; CONTIGUOUS-NEXT:    ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CONTIGUOUS-NEXT:    addvl sp, sp, #16
 ; CONTIGUOUS-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ret
@@ -284,7 +284,6 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8_scalar(<vscale x 16 x i8> %unused, <v
 ; CONTIGUOUS-NEXT:    ldr z0, [sp]
 ; CONTIGUOUS-NEXT:    ldr z1, [sp, #1, mul vl]
 ; CONTIGUOUS-NEXT:    addvl sp, sp, #2
-; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -300,6 +299,7 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8_scalar(<vscale x 16 x i8> %unused, <v
 ; CONTIGUOUS-NEXT:    ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CONTIGUOUS-NEXT:    addvl sp, sp, #16
 ; CONTIGUOUS-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ret
@@ -440,7 +440,6 @@ define <vscale x 16 x i16> @ld1_x2_i16_z0_z8(<vscale x 8 x i16> %unused, <vscale
 ; CONTIGUOUS-NEXT:    ldr z0, [sp]
 ; CONTIGUOUS-NEXT:    ldr z1, [sp, #1, mul vl]
 ; CONTIGUOUS-NEXT:    addvl sp, sp, #2
-; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -456,6 +455,7 @@ define <vscale x 16 x i16> @ld1_x2_i16_z0_z8(<vscale x 8 x i16> %unused, <vscale
 ; CONTIGUOUS-NEXT:    ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CONTIGUOUS-NEXT:    addvl sp, sp, #16
 ; CONTIGUOUS-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ret
@@ -595,7 +595,6 @@ define <vscale x 16 x i16> @ld1_x2_i16_z0_z8_scalar(<vscale x 8 x i16> %unused,
 ; CONTIGUOUS-NEXT:    ldr z0, [sp]
 ; CONTIGUOUS-NEXT:    ldr z1, [sp, #1, mul vl]
 ; CONTIGUOUS-NEXT:    addvl sp, sp, #2
-; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -611,6 +610,7 @@ define <vscale x 16 x i16> @ld1_x2_i16_z0_z8_scalar(<vscale x 8 x i16> %unused,
 ; CONTIGUOUS-NEXT:    ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CONTIGUOUS-NEXT:    addvl sp, sp, #16
 ; CONTIGUOUS-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ret
@@ -751,7 +751,6 @@ define <vscale x 8 x i32> @ld1_x2_i32_z0_z8(<vscale x 4 x i32> %unused, <vscale
 ; CONTIGUOUS-NEXT:    ldr z0, [sp]
 ; CONTIGUOUS-NEXT:    ldr z1, [sp, #1, mul vl]
 ; CONTIGUOUS-NEXT:    addvl sp, sp, #2
-; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -767,6 +766,7 @@ define <vscale x 8 x i32> @ld1_x2_i32_z0_z8(<vscale x 4 x i32> %unused, <vscale
 ; CONTIGUOUS-NEXT:    ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CONTIGUOUS-NEXT:    addvl sp, sp, #16
 ; CONTIGUOUS-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ret
@@ -906,7 +906,6 @@ define <vscale x 8 x i32> @ld1_x2_i32_z0_z8_scalar(<vscale x 4 x i32> %unused, <
 ; CONTIGUOUS-NEXT:    ldr z0, [sp]
 ; CONTIGUOUS-NEXT:    ldr z1, [sp, #1, mul vl]
 ; CONTIGUOUS-NEXT:    addvl sp, sp, #2
-; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -922,6 +921,7 @@ define <vscale x 8 x i32> @ld1_x2_i32_z0_z8_scalar(<vscale x 4 x i32> %unused, <
 ; CONTIGUOUS-NEXT:    ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CONTIGUOUS-NEXT:    addvl sp, sp, #16
 ; CONTIGUOUS-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ret
@@ -1062,7 +1062,6 @@ define <vscale x 4 x i64> @ld1_x2_i64_z0_z8(<vscale x 2 x i64> %unused, <vscale
 ; CONTIGUOUS-NEXT:    ldr z0, [sp]
 ; CONTIGUOUS-NEXT:    ldr z1, [sp, #1, mul vl]
 ; CONTIGUOUS-NEXT:    addvl sp, sp, #2
-; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1078,6 +1077,7 @@ define <vscale x 4 x i64> @ld1_x2_i64_z0_z8(<vscale x 2 x i64> %unused, <vscale
 ; CONTIGUOUS-NEXT:    ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CONTIGUOUS-NEXT:    addvl sp, sp, #16
 ; CONTIGUOUS-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ret
@@ -1217,7 +1217,6 @@ define <vscale x 4 x i64> @ld1_x2_i64_z0_z8_scalar(<vscale x 2 x i64> %unused, <
 ; CONTIGUOUS-NEXT:    ldr z0, [sp]
 ; CONTIGUOUS-NEXT:    ldr z1, [sp, #1, mul vl]
 ; CONTIGUOUS-NEXT:    addvl sp, sp, #2
-; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1233,6 +1232,7 @@ define <vscale x 4 x i64> @ld1_x2_i64_z0_z8_scalar(<vscale x 2 x i64> %unused, <
 ; CONTIGUOUS-NEXT:    ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CONTIGUOUS-NEXT:    addvl sp, sp, #16
 ; CONTIGUOUS-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ret
@@ -1380,7 +1380,6 @@ define <vscale x 64 x i8> @ld1_x4_i8_z0_z4_z8_z12(<vscale x 16 x i8> %unused, <v
 ; CONTIGUOUS-NEXT:    ldr z2, [sp, #2, mul vl]
 ; CONTIGUOUS-NEXT:    ldr z3, [sp, #3, mul vl]
 ; CONTIGUOUS-NEXT:    addvl sp, sp, #4
-; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1395,6 +1394,7 @@ define <vscale x 64 x i8> @ld1_x4_i8_z0_z4_z8_z12(<vscale x 16 x i8> %unused, <v
 ; CONTIGUOUS-NEXT:    ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CONTIGUOUS-NEXT:    addvl sp, sp, #15
 ; CONTIGUOUS-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ret
@@ -1545,7 +1545,6 @@ define <vscale x 64 x i8> @ld1_x4_i8_z0_z4_z8_z12_scalar(<vscale x 16 x i8> %unu
 ; CONTIGUOUS-NEXT:    ldr z2, [sp, #2, mul vl]
 ; CONTIGUOUS-NEXT:    ldr z3, [sp, #3, mul vl]
 ; CONTIGUOUS-NEXT:    addvl sp, sp, #4
-; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1560,6 +1559,7 @@ define <vscale x 64 x i8> @ld1_x4_i8_z0_z4_z8_z12_scalar(<vscale x 16 x i8> %unu
 ; CONTIGUOUS-NEXT:    ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CONTIGUOUS-NEXT:    addvl sp, sp, #15
 ; CONTIGUOUS-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ret
@@ -1711,7 +1711,6 @@ define <vscale x 32 x i16> @ld1_x4_i16_z0_z4_z8_z12(<vscale x 8 x i16> %unused,
 ; CONTIGUOUS-NEXT:    ldr z2, [sp, #2, mul vl]
 ; CONTIGUOUS-NEXT:    ldr z3, [sp, #3, mul vl]
 ; CONTIGUOUS-NEXT:    addvl sp, sp, #4
-; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1726,6 +1725,7 @@ define <vscale x 32 x i16> @ld1_x4_i16_z0_z4_z8_z12(<vscale x 8 x i16> %unused,
 ; CONTIGUOUS-NEXT:    ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CONTIGUOUS-NEXT:    addvl sp, sp, #15
 ; CONTIGUOUS-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ret
@@ -1877,7 +1877,6 @@ define <vscale x 32 x i16> @ld1_x4_i16_z0_z4_z8_z12_scalar(<vscale x 8 x i16> %u
 ; CONTIGUOUS-NEXT:    ldr z2, [sp, #2, mul vl]
 ; CONTIGUOUS-NEXT:    ldr z3, [sp, #3, mul vl]
 ; CONTIGUOUS-NEXT:    addvl sp, sp, #4
-; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1892,6 +1891,7 @@ define <vscale x 32 x i16> @ld1_x4_i16_z0_z4_z8_z12_scalar(<vscale x 8 x i16> %u
 ; CONTIGUOUS-NEXT:    ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CONTIGUOUS-NEXT:    addvl sp, sp, #15
 ; CONTIGUOUS-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ret
@@ -2043,7 +2043,6 @@ define <vscale x 16 x i32> @ld1_x4_i32_z0_z4_z8_z12(<vscale x 4 x i32> %unused,
 ; CONTIGUOUS-NEXT:    ldr z2, [sp, #2, mul vl]
 ; CONTIGUOUS-NEXT:    ldr z3, [sp, #3, mul vl]
 ; CONTIGUOUS-NEXT:    addvl sp, sp, #4
-; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -2058,6 +2057,7 @@ define <vscale x 16 x i32> @ld1_x4_i32_z0_z4_z8_z12(<vscale x 4 x i32> %unused,
 ; CONTIGUOUS-NEXT:    ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CONTIGUOUS-NEXT:    addvl sp, sp, #15
 ; CONTIGUOUS-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ret
@@ -2209,7 +2209,6 @@ define <vscale x 16 x i32> @ld1_x4_i32_z0_z4_z8_z12_scalar(<vscale x 4 x i32> %u
 ; CONTIGUOUS-NEXT:    ldr z2, [sp, #2, mul vl]
 ; CONTIGUOUS-NEXT:    ldr z3, [sp, #3, mul vl]
 ; CONTIGUOUS-NEXT:    addvl sp, sp, #4
-; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -2224,6 +2223,7 @@ define <vscale x 16 x i32> @ld1_x4_i32_z0_z4_z8_z12_scalar(<vscale x 4 x i32> %u
 ; CONTIGUOUS-NEXT:    ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CONTIGUOUS-NEXT:    addvl sp, sp, #15
 ; CONTIGUOUS-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ret
@@ -2375,7 +2375,6 @@ define <vscale x 8 x i64> @ld1_x4_i64_z0_z4_z8_z12(<vscale x 2 x i64> %unused, <
 ; CONTIGUOUS-NEXT:    ldr z2, [sp, #2, mul vl]
 ; CONTIGUOUS-NEXT:    ldr z3, [sp, #3, mul vl]
 ; CONTIGUOUS-NEXT:    addvl sp, sp, #4
-; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -2390,6 +2389,7 @@ define <vscale x 8 x i64> @ld1_x4_i64_z0_z4_z8_z12(<vscale x 2 x i64> %unused, <
 ; CONTIGUOUS-NEXT:    ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CONTIGUOUS-NEXT:    addvl sp, sp, #15
 ; CONTIGUOUS-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ret
@@ -2541,7 +2541,6 @@ define <vscale x 8 x i64> @ld1_x4_i64_z0_z4_z8_z12_scalar(<vscale x 2 x i64> %un
 ; CONTIGUOUS-NEXT:    ldr z2, [sp, #2, mul vl]
 ; CONTIGUOUS-NEXT:    ldr z3, [sp, #3, mul vl]
 ; CONTIGUOUS-NEXT:    addvl sp, sp, #4
-; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -2556,6 +2555,7 @@ define <vscale x 8 x i64> @ld1_x4_i64_z0_z4_z8_z12_scalar(<vscale x 2 x i64> %un
 ; CONTIGUOUS-NEXT:    ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CONTIGUOUS-NEXT:    addvl sp, sp, #15
 ; CONTIGUOUS-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll
index 1fb251a..7e2d28f 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll
@@ -82,7 +82,6 @@ define <vscale x 32 x i8> @ldnt1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale
 ; CONTIGUOUS-NEXT:    ldr z0, [sp]
 ; CONTIGUOUS-NEXT:    ldr z1, [sp, #1, mul vl]
 ; CONTIGUOUS-NEXT:    addvl sp, sp, #2
-; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -98,6 +97,7 @@ define <vscale x 32 x i8> @ldnt1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale
 ; CONTIGUOUS-NEXT:    ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CONTIGUOUS-NEXT:    addvl sp, sp, #16
 ; CONTIGUOUS-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ret
@@ -190,7 +190,6 @@ define <vscale x 32 x i8> @ldnt1_x2_i8_z0_z8_scalar(<vscale x 16 x i8> %unused,
 ; CONTIGUOUS-NEXT:    ldr z0, [sp]
 ; CONTIGUOUS-NEXT:    ldr z1, [sp, #1, mul vl]
 ; CONTIGUOUS-NEXT:    addvl sp, sp, #2
-; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -206,6 +205,7 @@ define <vscale x 32 x i8> @ldnt1_x2_i8_z0_z8_scalar(<vscale x 16 x i8> %unused,
 ; CONTIGUOUS-NEXT:    ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CONTIGUOUS-NEXT:    addvl sp, sp, #16
 ; CONTIGUOUS-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ret
@@ -299,7 +299,6 @@ define <vscale x 16 x i16> @ldnt1_x2_i16_z0_z8(<vscale x 8 x i16> %unused, <vsca
 ; CONTIGUOUS-NEXT:    ldr z0, [sp]
 ; CONTIGUOUS-NEXT:    ldr z1, [sp, #1, mul vl]
 ; CONTIGUOUS-NEXT:    addvl sp, sp, #2
-; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -315,6 +314,7 @@ define <vscale x 16 x i16> @ldnt1_x2_i16_z0_z8(<vscale x 8 x i16> %unused, <vsca
 ; CONTIGUOUS-NEXT:    ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CONTIGUOUS-NEXT:    addvl sp, sp, #16
 ; CONTIGUOUS-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ret
@@ -407,7 +407,6 @@ define <vscale x 16 x i16> @ldnt1_x2_i16_z0_z8_scalar(<vscale x 8 x i16> %unused
 ; CONTIGUOUS-NEXT:    ldr z0, [sp]
 ; CONTIGUOUS-NEXT:    ldr z1, [sp, #1, mul vl]
 ; CONTIGUOUS-NEXT:    addvl sp, sp, #2
-; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -423,6 +422,7 @@ define <vscale x 16 x i16> @ldnt1_x2_i16_z0_z8_scalar(<vscale x 8 x i16> %unused
 ; CONTIGUOUS-NEXT:    ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CONTIGUOUS-NEXT:    addvl sp, sp, #16
 ; CONTIGUOUS-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ret
@@ -516,7 +516,6 @@ define <vscale x 8 x i32> @ldnt1_x2_i32_z0_z8(<vscale x 4 x i32> %unused, <vscal
 ; CONTIGUOUS-NEXT:    ldr z0, [sp]
 ; CONTIGUOUS-NEXT:    ldr z1, [sp, #1, mul vl]
 ; CONTIGUOUS-NEXT:    addvl sp, sp, #2
-; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -532,6 +531,7 @@ define <vscale x 8 x i32> @ldnt1_x2_i32_z0_z8(<vscale x 4 x i32> %unused, <vscal
 ; CONTIGUOUS-NEXT:    ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CONTIGUOUS-NEXT:    addvl sp, sp, #16
 ; CONTIGUOUS-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ret
@@ -624,7 +624,6 @@ define <vscale x 8 x i32> @ldnt1_x2_i32_z0_z8_scalar(<vscale x 4 x i32> %unused,
 ; CONTIGUOUS-NEXT:    ldr z0, [sp]
 ; CONTIGUOUS-NEXT:    ldr z1, [sp, #1, mul vl]
 ; CONTIGUOUS-NEXT:    addvl sp, sp, #2
-; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -640,6 +639,7 @@ define <vscale x 8 x i32> @ldnt1_x2_i32_z0_z8_scalar(<vscale x 4 x i32> %unused,
 ; CONTIGUOUS-NEXT:    ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CONTIGUOUS-NEXT:    addvl sp, sp, #16
 ; CONTIGUOUS-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ret
@@ -733,7 +733,6 @@ define <vscale x 4 x i64> @ldnt1_x2_i64_z0_z8(<vscale x 2 x i64> %unused, <vscal
 ; CONTIGUOUS-NEXT:    ldr z0, [sp]
 ; CONTIGUOUS-NEXT:    ldr z1, [sp, #1, mul vl]
 ; CONTIGUOUS-NEXT:    addvl sp, sp, #2
-; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -749,6 +748,7 @@ define <vscale x 4 x i64> @ldnt1_x2_i64_z0_z8(<vscale x 2 x i64> %unused, <vscal
 ; CONTIGUOUS-NEXT:    ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CONTIGUOUS-NEXT:    addvl sp, sp, #16
 ; CONTIGUOUS-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ret
@@ -841,7 +841,6 @@ define <vscale x 4 x i64> @ldnt1_x2_i64_z0_z8_scalar(<vscale x 2 x i64> %unused,
 ; CONTIGUOUS-NEXT:    ldr z0, [sp]
 ; CONTIGUOUS-NEXT:    ldr z1, [sp, #1, mul vl]
 ; CONTIGUOUS-NEXT:    addvl sp, sp, #2
-; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -857,6 +856,7 @@ define <vscale x 4 x i64> @ldnt1_x2_i64_z0_z8_scalar(<vscale x 2 x i64> %unused,
 ; CONTIGUOUS-NEXT:    ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CONTIGUOUS-NEXT:    addvl sp, sp, #16
 ; CONTIGUOUS-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ret
@@ -955,7 +955,6 @@ define <vscale x 64 x i8> @ldnt1_x4_i8_z0_z4_z8_z12(<vscale x 16 x i8> %unused,
 ; CONTIGUOUS-NEXT:    ldr z2, [sp, #2, mul vl]
 ; CONTIGUOUS-NEXT:    ldr z3, [sp, #3, mul vl]
 ; CONTIGUOUS-NEXT:    addvl sp, sp, #4
-; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -970,6 +969,7 @@ define <vscale x 64 x i8> @ldnt1_x4_i8_z0_z4_z8_z12(<vscale x 16 x i8> %unused,
 ; CONTIGUOUS-NEXT:    ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CONTIGUOUS-NEXT:    addvl sp, sp, #15
 ; CONTIGUOUS-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ret
@@ -1071,7 +1071,6 @@ define <vscale x 64 x i8> @ldnt1_x4_i8_z0_z4_z8_z12_scalar(<vscale x 16 x i8> %u
 ; CONTIGUOUS-NEXT:    ldr z2, [sp, #2, mul vl]
 ; CONTIGUOUS-NEXT:    ldr z3, [sp, #3, mul vl]
 ; CONTIGUOUS-NEXT:    addvl sp, sp, #4
-; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1086,6 +1085,7 @@ define <vscale x 64 x i8> @ldnt1_x4_i8_z0_z4_z8_z12_scalar(<vscale x 16 x i8> %u
 ; CONTIGUOUS-NEXT:    ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CONTIGUOUS-NEXT:    addvl sp, sp, #15
 ; CONTIGUOUS-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ret
@@ -1188,7 +1188,6 @@ define <vscale x 32 x i16> @ldnt1_x4_i16_z0_z4_z8_z12(<vscale x 8 x i16> %unused
 ; CONTIGUOUS-NEXT:    ldr z2, [sp, #2, mul vl]
 ; CONTIGUOUS-NEXT:    ldr z3, [sp, #3, mul vl]
 ; CONTIGUOUS-NEXT:    addvl sp, sp, #4
-; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1203,6 +1202,7 @@ define <vscale x 32 x i16> @ldnt1_x4_i16_z0_z4_z8_z12(<vscale x 8 x i16> %unused
 ; CONTIGUOUS-NEXT:    ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CONTIGUOUS-NEXT:    addvl sp, sp, #15
 ; CONTIGUOUS-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ret
@@ -1304,7 +1304,6 @@ define <vscale x 32 x i16> @ldnt1_x4_i16_z0_z4_z8_z12_scalar(<vscale x 8 x i16>
 ; CONTIGUOUS-NEXT:    ldr z2, [sp, #2, mul vl]
 ; CONTIGUOUS-NEXT:    ldr z3, [sp, #3, mul vl]
 ; CONTIGUOUS-NEXT:    addvl sp, sp, #4
-; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1319,6 +1318,7 @@ define <vscale x 32 x i16> @ldnt1_x4_i16_z0_z4_z8_z12_scalar(<vscale x 8 x i16>
 ; CONTIGUOUS-NEXT:    ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CONTIGUOUS-NEXT:    addvl sp, sp, #15
 ; CONTIGUOUS-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ret
@@ -1421,7 +1421,6 @@ define <vscale x 16 x i32> @ldnt1_x4_i32_z0_z4_z8_z12(<vscale x 4 x i32> %unused
 ; CONTIGUOUS-NEXT:    ldr z2, [sp, #2, mul vl]
 ; CONTIGUOUS-NEXT:    ldr z3, [sp, #3, mul vl]
 ; CONTIGUOUS-NEXT:    addvl sp, sp, #4
-; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1436,6 +1435,7 @@ define <vscale x 16 x i32> @ldnt1_x4_i32_z0_z4_z8_z12(<vscale x 4 x i32> %unused
 ; CONTIGUOUS-NEXT:    ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CONTIGUOUS-NEXT:    addvl sp, sp, #15
 ; CONTIGUOUS-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ret
@@ -1537,7 +1537,6 @@ define <vscale x 16 x i32> @ldnt1_x4_i32_z0_z4_z8_z12_scalar(<vscale x 4 x i32>
 ; CONTIGUOUS-NEXT:    ldr z2, [sp, #2, mul vl]
 ; CONTIGUOUS-NEXT:    ldr z3, [sp, #3, mul vl]
 ; CONTIGUOUS-NEXT:    addvl sp, sp, #4
-; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1552,6 +1551,7 @@ define <vscale x 16 x i32> @ldnt1_x4_i32_z0_z4_z8_z12_scalar(<vscale x 4 x i32>
 ; CONTIGUOUS-NEXT:    ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CONTIGUOUS-NEXT:    addvl sp, sp, #15
 ; CONTIGUOUS-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ret
@@ -1654,7 +1654,6 @@ define <vscale x 8 x i64> @ldnt1_x4_i64_z0_z4_z8_z12(<vscale x 2 x i64> %unused,
 ; CONTIGUOUS-NEXT:    ldr z2, [sp, #2, mul vl]
 ; CONTIGUOUS-NEXT:    ldr z3, [sp, #3, mul vl]
 ; CONTIGUOUS-NEXT:    addvl sp, sp, #4
-; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1669,6 +1668,7 @@ define <vscale x 8 x i64> @ldnt1_x4_i64_z0_z4_z8_z12(<vscale x 2 x i64> %unused,
 ; CONTIGUOUS-NEXT:    ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CONTIGUOUS-NEXT:    addvl sp, sp, #15
 ; CONTIGUOUS-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ret
@@ -1770,7 +1770,6 @@ define <vscale x 8 x i64> @ldnt1_x4_i64_z0_z4_z8_z12_scalar(<vscale x 2 x i64> %
 ; CONTIGUOUS-NEXT:    ldr z2, [sp, #2, mul vl]
 ; CONTIGUOUS-NEXT:    ldr z3, [sp, #3, mul vl]
 ; CONTIGUOUS-NEXT:    addvl sp, sp, #4
-; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1785,6 +1784,7 @@ define <vscale x 8 x i64> @ldnt1_x4_i64_z0_z4_z8_z12_scalar(<vscale x 2 x i64> %
 ; CONTIGUOUS-NEXT:    ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CONTIGUOUS-NEXT:    addvl sp, sp, #15
 ; CONTIGUOUS-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/stack-probing-sve.ll b/llvm/test/CodeGen/AArch64/stack-probing-sve.ll
index 1ad7870..56d865e 100644
--- a/llvm/test/CodeGen/AArch64/stack-probing-sve.ll
+++ b/llvm/test/CodeGen/AArch64/stack-probing-sve.ll
@@ -380,7 +380,6 @@ define void @sve_16v_1p_csr(<vscale x 4 x float> %a) #0 {
 ; CHECK-NEXT:    .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG
 ; CHECK-NEXT:    //APP
 ; CHECK-NEXT:    //NO_APP
-; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -397,6 +396,7 @@ define void @sve_16v_1p_csr(<vscale x 4 x float> %a) #0 {
 ; CHECK-NEXT:    ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    addvl sp, sp, #17
 ; CHECK-NEXT:    .cfi_def_cfa wsp, 16
 ; CHECK-NEXT:    .cfi_restore z8
@@ -697,10 +697,10 @@ define void @sve_unprobed_area(<vscale x 4 x float> %a, i32 %n) #0 {
 ; CHECK-NEXT:    //NO_APP
 ; CHECK-NEXT:    addvl sp, sp, #4
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
-; CHECK-NEXT:    ldr p9, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z8, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p9, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    addvl sp, sp, #4
 ; CHECK-NEXT:    .cfi_def_cfa wsp, 16
 ; CHECK-NEXT:    .cfi_restore z8
diff --git a/llvm/test/CodeGen/AArch64/strictfp_f16_abi_promote.ll b/llvm/test/CodeGen/AArch64/strictfp_f16_abi_promote.ll
index 37186cf..9fa5208 100644
--- a/llvm/test/CodeGen/AArch64/strictfp_f16_abi_promote.ll
+++ b/llvm/test/CodeGen/AArch64/strictfp_f16_abi_promote.ll
@@ -70,22 +70,20 @@ define void @v3f16_arg(<3 x half> %arg, ptr %ptr) #0 {
 ; NOFP16-NEXT:    .cfi_offset w22, -32
 ; NOFP16-NEXT:    .cfi_offset w30, -48
 ; NOFP16-NEXT:    mov w21, w0
-; NOFP16-NEXT:    and w0, w2, #0xffff
+; NOFP16-NEXT:    and w0, w1, #0xffff
 ; NOFP16-NEXT:    mov x19, x3
-; NOFP16-NEXT:    mov w20, w1
+; NOFP16-NEXT:    mov w20, w2
 ; NOFP16-NEXT:    bl __gnu_h2f_ieee
 ; NOFP16-NEXT:    mov w22, w0
 ; NOFP16-NEXT:    and w0, w21, #0xffff
 ; NOFP16-NEXT:    bl __gnu_h2f_ieee
-; NOFP16-NEXT:    mov w21, w0
+; NOFP16-NEXT:    mov w8, w0
 ; NOFP16-NEXT:    and w0, w20, #0xffff
+; NOFP16-NEXT:    orr x21, x8, x22, lsl #32
 ; NOFP16-NEXT:    bl __gnu_h2f_ieee
-; NOFP16-NEXT:    mov w8, w21
-; NOFP16-NEXT:    // kill: def $w0 killed $w0 def $x0
-; NOFP16-NEXT:    str w22, [x19, #8]
-; NOFP16-NEXT:    orr x8, x8, x0, lsl #32
+; NOFP16-NEXT:    str x21, [x19]
 ; NOFP16-NEXT:    ldp x22, x21, [sp, #16] // 16-byte Folded Reload
-; NOFP16-NEXT:    str x8, [x19]
+; NOFP16-NEXT:    str w0, [x19, #8]
 ; NOFP16-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
 ; NOFP16-NEXT:    ldr x30, [sp], #48 // 8-byte Folded Reload
 ; NOFP16-NEXT:    ret
@@ -133,26 +131,107 @@ define void @v4f16_arg(<4 x half> %arg, ptr %ptr) #0 {
   ret void
 }
 
-; FIXME:
-; define half @f16_return(float %arg) #0 {
-;   %fptrunc = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
-;   ret half %fptrunc
-; }
+ define half @f16_return(float %arg) #0 {
+; NOFP16-LABEL: f16_return:
+; NOFP16:       // %bb.0:
+; NOFP16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; NOFP16-NEXT:    .cfi_def_cfa_offset 16
+; NOFP16-NEXT:    .cfi_offset w30, -16
+; NOFP16-NEXT:    bl __gnu_f2h_ieee
+; NOFP16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; NOFP16-NEXT:    ret
+   %fptrunc = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
+   ret half %fptrunc
+ }
 
-; define <2 x half> @v2f16_return(<2 x float> %arg) #0 {
-;   %fptrunc = call <2 x half> @llvm.experimental.constrained.fptrunc.v2f16.v2f32(<2 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
-;   ret <2 x half> %fptrunc
-; }
+ define <2 x half> @v2f16_return(<2 x float> %arg) #0 {
+; NOFP16-LABEL: v2f16_return:
+; NOFP16:       // %bb.0:
+; NOFP16-NEXT:    str x30, [sp, #-32]! // 8-byte Folded Spill
+; NOFP16-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; NOFP16-NEXT:    .cfi_def_cfa_offset 32
+; NOFP16-NEXT:    .cfi_offset w19, -8
+; NOFP16-NEXT:    .cfi_offset w20, -16
+; NOFP16-NEXT:    .cfi_offset w30, -32
+; NOFP16-NEXT:    mov w19, w0
+; NOFP16-NEXT:    mov w0, w1
+; NOFP16-NEXT:    bl __gnu_f2h_ieee
+; NOFP16-NEXT:    mov w20, w0
+; NOFP16-NEXT:    mov w0, w19
+; NOFP16-NEXT:    bl __gnu_f2h_ieee
+; NOFP16-NEXT:    mov w1, w20
+; NOFP16-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; NOFP16-NEXT:    ldr x30, [sp], #32 // 8-byte Folded Reload
+; NOFP16-NEXT:    ret
+   %fptrunc = call <2 x half> @llvm.experimental.constrained.fptrunc.v2f16.v2f32(<2 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
+   ret <2 x half> %fptrunc
+ }
 
-; define <3 x half> @v3f16_return(<3 x float> %arg) #0 {
-;   %fptrunc = call <3 x half> @llvm.experimental.constrained.fptrunc.v3f16.v3f32(<3 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
-;   ret <3 x half> %fptrunc
-; }
+ define <3 x half> @v3f16_return(<3 x float> %arg) #0 {
+; NOFP16-LABEL: v3f16_return:
+; NOFP16:       // %bb.0:
+; NOFP16-NEXT:    stp x30, x21, [sp, #-32]! // 16-byte Folded Spill
+; NOFP16-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; NOFP16-NEXT:    .cfi_def_cfa_offset 32
+; NOFP16-NEXT:    .cfi_offset w19, -8
+; NOFP16-NEXT:    .cfi_offset w20, -16
+; NOFP16-NEXT:    .cfi_offset w21, -24
+; NOFP16-NEXT:    .cfi_offset w30, -32
+; NOFP16-NEXT:    mov w20, w0
+; NOFP16-NEXT:    mov w0, w2
+; NOFP16-NEXT:    mov w19, w1
+; NOFP16-NEXT:    bl __gnu_f2h_ieee
+; NOFP16-NEXT:    mov w21, w0
+; NOFP16-NEXT:    mov w0, w19
+; NOFP16-NEXT:    bl __gnu_f2h_ieee
+; NOFP16-NEXT:    mov w19, w0
+; NOFP16-NEXT:    mov w0, w20
+; NOFP16-NEXT:    bl __gnu_f2h_ieee
+; NOFP16-NEXT:    mov w1, w19
+; NOFP16-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; NOFP16-NEXT:    mov w2, w21
+; NOFP16-NEXT:    ldp x30, x21, [sp], #32 // 16-byte Folded Reload
+; NOFP16-NEXT:    ret
+   %fptrunc = call <3 x half> @llvm.experimental.constrained.fptrunc.v3f16.v3f32(<3 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
+   ret <3 x half> %fptrunc
+ }
 
-; define <4 x half> @v4f16_return(<4 x float> %arg) #0 {
-;   %fptrunc = call <4 x half> @llvm.experimental.constrained.fptrunc.v4f16.v4f32(<4 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
-;   ret <4 x half> %fptrunc
-; }
+ define <4 x half> @v4f16_return(<4 x float> %arg) #0 {
+; NOFP16-LABEL: v4f16_return:
+; NOFP16:       // %bb.0:
+; NOFP16-NEXT:    str x30, [sp, #-48]! // 8-byte Folded Spill
+; NOFP16-NEXT:    stp x22, x21, [sp, #16] // 16-byte Folded Spill
+; NOFP16-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; NOFP16-NEXT:    .cfi_def_cfa_offset 48
+; NOFP16-NEXT:    .cfi_offset w19, -8
+; NOFP16-NEXT:    .cfi_offset w20, -16
+; NOFP16-NEXT:    .cfi_offset w21, -24
+; NOFP16-NEXT:    .cfi_offset w22, -32
+; NOFP16-NEXT:    .cfi_offset w30, -48
+; NOFP16-NEXT:    mov w21, w0
+; NOFP16-NEXT:    mov w0, w3
+; NOFP16-NEXT:    mov w19, w2
+; NOFP16-NEXT:    mov w20, w1
+; NOFP16-NEXT:    bl __gnu_f2h_ieee
+; NOFP16-NEXT:    mov w22, w0
+; NOFP16-NEXT:    mov w0, w19
+; NOFP16-NEXT:    bl __gnu_f2h_ieee
+; NOFP16-NEXT:    mov w19, w0
+; NOFP16-NEXT:    mov w0, w20
+; NOFP16-NEXT:    bl __gnu_f2h_ieee
+; NOFP16-NEXT:    mov w20, w0
+; NOFP16-NEXT:    mov w0, w21
+; NOFP16-NEXT:    bl __gnu_f2h_ieee
+; NOFP16-NEXT:    mov w1, w20
+; NOFP16-NEXT:    mov w2, w19
+; NOFP16-NEXT:    mov w3, w22
+; NOFP16-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; NOFP16-NEXT:    ldp x22, x21, [sp, #16] // 16-byte Folded Reload
+; NOFP16-NEXT:    ldr x30, [sp], #48 // 8-byte Folded Reload
+; NOFP16-NEXT:    ret
+   %fptrunc = call <4 x half> @llvm.experimental.constrained.fptrunc.v4f16.v4f32(<4 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
+   ret <4 x half> %fptrunc
+ }
 
 ; FIXME:
 ; define void @outgoing_f16_arg(ptr %ptr) #0 {
@@ -182,46 +261,17 @@ define void @v4f16_arg(<4 x half> %arg, ptr %ptr) #0 {
 define void @outgoing_v4f16_return(ptr %ptr) #0 {
 ; NOFP16-LABEL: outgoing_v4f16_return:
 ; NOFP16:       // %bb.0:
-; NOFP16-NEXT:    stp x30, x23, [sp, #-48]! // 16-byte Folded Spill
-; NOFP16-NEXT:    stp x22, x21, [sp, #16] // 16-byte Folded Spill
-; NOFP16-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
-; NOFP16-NEXT:    .cfi_def_cfa_offset 48
+; NOFP16-NEXT:    stp x30, x19, [sp, #-16]! // 16-byte Folded Spill
+; NOFP16-NEXT:    .cfi_def_cfa_offset 16
 ; NOFP16-NEXT:    .cfi_offset w19, -8
-; NOFP16-NEXT:    .cfi_offset w20, -16
-; NOFP16-NEXT:    .cfi_offset w21, -24
-; NOFP16-NEXT:    .cfi_offset w22, -32
-; NOFP16-NEXT:    .cfi_offset w23, -40
-; NOFP16-NEXT:    .cfi_offset w30, -48
+; NOFP16-NEXT:    .cfi_offset w30, -16
 ; NOFP16-NEXT:    mov x19, x0
 ; NOFP16-NEXT:    bl v4f16_result
-; NOFP16-NEXT:    and w0, w0, #0xffff
-; NOFP16-NEXT:    mov w20, w1
-; NOFP16-NEXT:    mov w21, w2
-; NOFP16-NEXT:    mov w22, w3
-; NOFP16-NEXT:    bl __gnu_h2f_ieee
-; NOFP16-NEXT:    mov w23, w0
-; NOFP16-NEXT:    and w0, w20, #0xffff
-; NOFP16-NEXT:    bl __gnu_h2f_ieee
-; NOFP16-NEXT:    mov w20, w0
-; NOFP16-NEXT:    and w0, w21, #0xffff
-; NOFP16-NEXT:    bl __gnu_h2f_ieee
-; NOFP16-NEXT:    mov w21, w0
-; NOFP16-NEXT:    and w0, w22, #0xffff
-; NOFP16-NEXT:    bl __gnu_h2f_ieee
-; NOFP16-NEXT:    bl __gnu_f2h_ieee
-; NOFP16-NEXT:    strh w0, [x19, #6]
-; NOFP16-NEXT:    mov w0, w21
-; NOFP16-NEXT:    bl __gnu_f2h_ieee
-; NOFP16-NEXT:    strh w0, [x19, #4]
-; NOFP16-NEXT:    mov w0, w20
-; NOFP16-NEXT:    bl __gnu_f2h_ieee
-; NOFP16-NEXT:    strh w0, [x19, #2]
-; NOFP16-NEXT:    mov w0, w23
-; NOFP16-NEXT:    bl __gnu_f2h_ieee
+; NOFP16-NEXT:    strh w2, [x19, #4]
+; NOFP16-NEXT:    strh w3, [x19, #6]
+; NOFP16-NEXT:    strh w1, [x19, #2]
 ; NOFP16-NEXT:    strh w0, [x19]
-; NOFP16-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
-; NOFP16-NEXT:    ldp x22, x21, [sp, #16] // 16-byte Folded Reload
-; NOFP16-NEXT:    ldp x30, x23, [sp], #48 // 16-byte Folded Reload
+; NOFP16-NEXT:    ldp x30, x19, [sp], #16 // 16-byte Folded Reload
 ; NOFP16-NEXT:    ret
   %val = call <4 x half> @v4f16_result()
   store <4 x half> %val, ptr %ptr
@@ -231,82 +281,21 @@ define void @outgoing_v4f16_return(ptr %ptr) #0 {
 define void @outgoing_v8f16_return(ptr %ptr) #0 {
 ; NOFP16-LABEL: outgoing_v8f16_return:
 ; NOFP16:       // %bb.0:
-; NOFP16-NEXT:    stp x30, x27, [sp, #-80]! // 16-byte Folded Spill
-; NOFP16-NEXT:    stp x26, x25, [sp, #16] // 16-byte Folded Spill
-; NOFP16-NEXT:    stp x24, x23, [sp, #32] // 16-byte Folded Spill
-; NOFP16-NEXT:    stp x22, x21, [sp, #48] // 16-byte Folded Spill
-; NOFP16-NEXT:    stp x20, x19, [sp, #64] // 16-byte Folded Spill
-; NOFP16-NEXT:    .cfi_def_cfa_offset 80
+; NOFP16-NEXT:    stp x30, x19, [sp, #-16]! // 16-byte Folded Spill
+; NOFP16-NEXT:    .cfi_def_cfa_offset 16
 ; NOFP16-NEXT:    .cfi_offset w19, -8
-; NOFP16-NEXT:    .cfi_offset w20, -16
-; NOFP16-NEXT:    .cfi_offset w21, -24
-; NOFP16-NEXT:    .cfi_offset w22, -32
-; NOFP16-NEXT:    .cfi_offset w23, -40
-; NOFP16-NEXT:    .cfi_offset w24, -48
-; NOFP16-NEXT:    .cfi_offset w25, -56
-; NOFP16-NEXT:    .cfi_offset w26, -64
-; NOFP16-NEXT:    .cfi_offset w27, -72
-; NOFP16-NEXT:    .cfi_offset w30, -80
+; NOFP16-NEXT:    .cfi_offset w30, -16
 ; NOFP16-NEXT:    mov x19, x0
 ; NOFP16-NEXT:    bl v8f16_result
-; NOFP16-NEXT:    and w0, w0, #0xffff
-; NOFP16-NEXT:    mov w21, w1
-; NOFP16-NEXT:    mov w22, w2
-; NOFP16-NEXT:    mov w23, w3
-; NOFP16-NEXT:    mov w24, w4
-; NOFP16-NEXT:    mov w25, w5
-; NOFP16-NEXT:    mov w26, w6
-; NOFP16-NEXT:    mov w27, w7
-; NOFP16-NEXT:    bl __gnu_h2f_ieee
-; NOFP16-NEXT:    mov w20, w0
-; NOFP16-NEXT:    and w0, w21, #0xffff
-; NOFP16-NEXT:    bl __gnu_h2f_ieee
-; NOFP16-NEXT:    mov w21, w0
-; NOFP16-NEXT:    and w0, w22, #0xffff
-; NOFP16-NEXT:    bl __gnu_h2f_ieee
-; NOFP16-NEXT:    mov w22, w0
-; NOFP16-NEXT:    and w0, w23, #0xffff
-; NOFP16-NEXT:    bl __gnu_h2f_ieee
-; NOFP16-NEXT:    mov w23, w0
-; NOFP16-NEXT:    and w0, w24, #0xffff
-; NOFP16-NEXT:    bl __gnu_h2f_ieee
-; NOFP16-NEXT:    mov w24, w0
-; NOFP16-NEXT:    and w0, w25, #0xffff
-; NOFP16-NEXT:    bl __gnu_h2f_ieee
-; NOFP16-NEXT:    mov w25, w0
-; NOFP16-NEXT:    and w0, w26, #0xffff
-; NOFP16-NEXT:    bl __gnu_h2f_ieee
-; NOFP16-NEXT:    mov w26, w0
-; NOFP16-NEXT:    and w0, w27, #0xffff
-; NOFP16-NEXT:    bl __gnu_h2f_ieee
-; NOFP16-NEXT:    bl __gnu_f2h_ieee
-; NOFP16-NEXT:    strh w0, [x19, #14]
-; NOFP16-NEXT:    mov w0, w26
-; NOFP16-NEXT:    bl __gnu_f2h_ieee
-; NOFP16-NEXT:    strh w0, [x19, #12]
-; NOFP16-NEXT:    mov w0, w25
-; NOFP16-NEXT:    bl __gnu_f2h_ieee
-; NOFP16-NEXT:    strh w0, [x19, #10]
-; NOFP16-NEXT:    mov w0, w24
-; NOFP16-NEXT:    bl __gnu_f2h_ieee
-; NOFP16-NEXT:    strh w0, [x19, #8]
-; NOFP16-NEXT:    mov w0, w23
-; NOFP16-NEXT:    bl __gnu_f2h_ieee
-; NOFP16-NEXT:    strh w0, [x19, #6]
-; NOFP16-NEXT:    mov w0, w22
-; NOFP16-NEXT:    bl __gnu_f2h_ieee
-; NOFP16-NEXT:    strh w0, [x19, #4]
-; NOFP16-NEXT:    mov w0, w21
-; NOFP16-NEXT:    bl __gnu_f2h_ieee
-; NOFP16-NEXT:    strh w0, [x19, #2]
-; NOFP16-NEXT:    mov w0, w20
-; NOFP16-NEXT:    bl __gnu_f2h_ieee
+; NOFP16-NEXT:    strh w5, [x19, #10]
+; NOFP16-NEXT:    strh w7, [x19, #14]
+; NOFP16-NEXT:    strh w6, [x19, #12]
+; NOFP16-NEXT:    strh w4, [x19, #8]
+; NOFP16-NEXT:    strh w3, [x19, #6]
+; NOFP16-NEXT:    strh w2, [x19, #4]
+; NOFP16-NEXT:    strh w1, [x19, #2]
 ; NOFP16-NEXT:    strh w0, [x19]
-; NOFP16-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
-; NOFP16-NEXT:    ldp x22, x21, [sp, #48] // 16-byte Folded Reload
-; NOFP16-NEXT:    ldp x24, x23, [sp, #32] // 16-byte Folded Reload
-; NOFP16-NEXT:    ldp x26, x25, [sp, #16] // 16-byte Folded Reload
-; NOFP16-NEXT:    ldp x30, x27, [sp], #80 // 16-byte Folded Reload
+; NOFP16-NEXT:    ldp x30, x19, [sp], #16 // 16-byte Folded Reload
 ; NOFP16-NEXT:    ret
   %val = call <8 x half> @v8f16_result()
   store <8 x half> %val, ptr %ptr
diff --git a/llvm/test/CodeGen/AArch64/sve-alloca.ll b/llvm/test/CodeGen/AArch64/sve-alloca.ll
index 47e49b8..d227538 100644
--- a/llvm/test/CodeGen/AArch64/sve-alloca.ll
+++ b/llvm/test/CodeGen/AArch64/sve-alloca.ll
@@ -66,30 +66,30 @@ define void @foo(<vscale x 4 x i64> %dst, i1 %cond) {
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
 ; CHECK-NEXT:    bl bar
 ; CHECK-NEXT:    addvl sp, x29, #-18
-; CHECK-NEXT:    ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll b/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll
index 9851583..3965af6 100644
--- a/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll
+++ b/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll
@@ -567,30 +567,30 @@ define <vscale x 4 x float> @sve_caller_non_sve_callee_high_range(<vscale x 4 x
 ; CHECK-NEXT:    bl non_sve_callee_high_range
 ; CHECK-NEXT:    ldr z0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    addvl sp, sp, #3
-; CHECK-NEXT:    ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
@@ -659,30 +659,30 @@ define <vscale x 4 x float> @sve_ret_caller_non_sve_callee_high_range()  {
 ; CHECK-NEXT:    fmov s7, #7.00000000
 ; CHECK-NEXT:    bl non_sve_callee_high_range
 ; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/sve-tailcall.ll b/llvm/test/CodeGen/AArch64/sve-tailcall.ll
index f32c80d..4ddf007 100644
--- a/llvm/test/CodeGen/AArch64/sve-tailcall.ll
+++ b/llvm/test/CodeGen/AArch64/sve-tailcall.ll
@@ -83,30 +83,30 @@ define i32 @sve_caller_non_sve_callee(<vscale x 4 x i32> %arg) nounwind {
 ; CHECK-NEXT:    //APP
 ; CHECK-NEXT:    //NO_APP
 ; CHECK-NEXT:    bl non_sve_callee
-; CHECK-NEXT:    ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
@@ -158,30 +158,30 @@ define i32 @sve_caller_non_sve_callee_fastcc(<vscale x 4 x i32> %arg) nounwind {
 ; CHECK-NEXT:    //APP
 ; CHECK-NEXT:    //NO_APP
 ; CHECK-NEXT:    bl non_sve_callee
-; CHECK-NEXT:    ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-pmov-to-pred.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-pmov-to-pred.ll
index 7cae1d2..a592dcd 100644
--- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-pmov-to-pred.ll
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-pmov-to-pred.ll
@@ -4,12 +4,7 @@
 define <vscale x 16 x i1> @test_pmov_to_pred_i8(<vscale x 16 x i8> %zn) {
 ; CHECK-LABEL: test_pmov_to_pred_i8:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    mov w0, wzr
-; CHECK-NEXT:    bl llvm.aarch64.sve.pmov.to.pred.lane.nxv16i8
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    pmov	p0.b, z0
 ; CHECK-NEXT:    ret
   entry:
   %res = call <vscale x 16 x i1> @llvm.aarch64.sve.pmov.to.pred.lane.nxv16i8(<vscale x 16 x i8> %zn, i32 0)
@@ -19,27 +14,10 @@ define <vscale x 16 x i1> @test_pmov_to_pred_i8(<vscale x 16 x i8> %zn) {
 define <vscale x 8 x i1> @test_pmov_to_pred_i16(<vscale x 8 x i16> %zn) {
 ; CHECK-LABEL: test_pmov_to_pred_i16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
-; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT:    str z8, [sp, #1, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
-; CHECK-NEXT:    .cfi_offset w30, -8
-; CHECK-NEXT:    .cfi_offset w29, -16
-; CHECK-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
-; CHECK-NEXT:    mov w0, wzr
-; CHECK-NEXT:    mov z8.d, z0.d
-; CHECK-NEXT:    bl llvm.aarch64.sve.pmov.to.pred.lane.nxv8i16
-; CHECK-NEXT:    mov z0.d, z8.d
-; CHECK-NEXT:    mov w0, #1 // =0x1
-; CHECK-NEXT:    mov p4.b, p0.b
-; CHECK-NEXT:    bl llvm.aarch64.sve.pmov.to.pred.lane.nxv8i16
-; CHECK-NEXT:    ptrue p1.h
-; CHECK-NEXT:    ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    eor p0.b, p1/z, p4.b, p0.b
-; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    ptrue	p0.h
+; CHECK-NEXT:    pmov	p1.h, z0[0]
+; CHECK-NEXT:    pmov	p2.h, z0[1]
+; CHECK-NEXT:    eor	p0.b, p0/z, p1.b, p2.b
 ; CHECK-NEXT:    ret
   entry:
   %res1 = call <vscale x 8 x i1> @llvm.aarch64.sve.pmov.to.pred.lane.nxv8i16(<vscale x 8 x i16> %zn, i32 0)
@@ -52,27 +30,10 @@ define <vscale x 8 x i1> @test_pmov_to_pred_i16(<vscale x 8 x i16> %zn) {
 define <vscale x 4 x i1> @test_pmov_to_pred_i32(<vscale x 4 x i32> %zn) {
 ; CHECK-LABEL: test_pmov_to_pred_i32:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
-; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT:    str z8, [sp, #1, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
-; CHECK-NEXT:    .cfi_offset w30, -8
-; CHECK-NEXT:    .cfi_offset w29, -16
-; CHECK-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
-; CHECK-NEXT:    mov w0, wzr
-; CHECK-NEXT:    mov z8.d, z0.d
-; CHECK-NEXT:    bl llvm.aarch64.sve.pmov.to.pred.lane.nxv4i32
-; CHECK-NEXT:    mov z0.d, z8.d
-; CHECK-NEXT:    mov w0, #3 // =0x3
-; CHECK-NEXT:    mov p4.b, p0.b
-; CHECK-NEXT:    bl llvm.aarch64.sve.pmov.to.pred.lane.nxv4i32
-; CHECK-NEXT:    ptrue p1.s
-; CHECK-NEXT:    ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    eor p0.b, p1/z, p4.b, p0.b
-; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    ptrue	p0.s
+; CHECK-NEXT:    pmov	p1.s, z0[0]
+; CHECK-NEXT:    pmov	p2.s, z0[3]
+; CHECK-NEXT:    eor	p0.b, p0/z, p1.b, p2.b
 ; CHECK-NEXT:    ret
   entry:
   %res1 = call <vscale x 4 x i1> @llvm.aarch64.sve.pmov.to.pred.lane.nxv4i32(<vscale x 4 x i32> %zn, i32 0)
@@ -85,27 +46,10 @@ define <vscale x 4 x i1> @test_pmov_to_pred_i32(<vscale x 4 x i32> %zn) {
 define <vscale x 2 x i1> @test_pmov_to_pred_i64(<vscale x 2 x i64> %zn) {
 ; CHECK-LABEL: test_pmov_to_pred_i64:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
-; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT:    str z8, [sp, #1, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
-; CHECK-NEXT:    .cfi_offset w30, -8
-; CHECK-NEXT:    .cfi_offset w29, -16
-; CHECK-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
-; CHECK-NEXT:    mov w0, wzr
-; CHECK-NEXT:    mov z8.d, z0.d
-; CHECK-NEXT:    bl llvm.aarch64.sve.pmov.to.pred.lane.nxv2i64
-; CHECK-NEXT:    mov z0.d, z8.d
-; CHECK-NEXT:    mov w0, #7 // =0x7
-; CHECK-NEXT:    mov p4.b, p0.b
-; CHECK-NEXT:    bl llvm.aarch64.sve.pmov.to.pred.lane.nxv2i64
-; CHECK-NEXT:    ptrue p1.d
-; CHECK-NEXT:    ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    eor p0.b, p1/z, p4.b, p0.b
-; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    ptrue	p0.d
+; CHECK-NEXT:    pmov	p1.d, z0[0]
+; CHECK-NEXT:    pmov	p2.d, z0[7]
+; CHECK-NEXT:    eor	p0.b, p0/z, p1.b, p2.b
 ; CHECK-NEXT:    ret
   entry:
   %res1 = call <vscale x 2 x i1> @llvm.aarch64.sve.pmov.to.pred.lane.nxv2i64(<vscale x 2 x i64> %zn, i32 0)
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-pmov-to-vector.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-pmov-to-vector.ll
index 58b240b..b7f36c6 100644
--- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-pmov-to-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-pmov-to-vector.ll
@@ -6,12 +6,7 @@
 define <vscale x 8 x i16> @test_pmov_to_vector_i16(<vscale x 8 x i16> %zn, <vscale x 8 x i1> %pn) {
 ; CHECK-LABEL: test_pmov_to_vector_i16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    mov w0, #1 // =0x1
-; CHECK-NEXT:    bl llvm.aarch64.sve.pmov.to.vector.lane.merging.nxv8i16
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    pmov z0[1], p0.h
 ; CHECK-NEXT:    ret
   entry:
   %res = call <vscale x 8 x i16> @llvm.aarch64.sve.pmov.to.vector.lane.merging.nxv8i16(<vscale x 8 x i16> %zn, <vscale x 8 x i1> %pn, i32 1)
@@ -21,12 +16,7 @@ define <vscale x 8 x i16> @test_pmov_to_vector_i16(<vscale x 8 x i16> %zn, <vsca
 define <vscale x 4 x i32> @test_pmov_to_vector_i32(<vscale x 4 x i32> %zn, <vscale x 4 x i1> %pn) {
 ; CHECK-LABEL: test_pmov_to_vector_i32:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    mov w0, #3 // =0x3
-; CHECK-NEXT:    bl llvm.aarch64.sve.pmov.to.vector.lane.merging.nxv4i32
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    pmov z0[3], p0.s
 ; CHECK-NEXT:    ret
   entry:
   %res = call <vscale x 4 x i32> @llvm.aarch64.sve.pmov.to.vector.lane.merging.nxv4i32(<vscale x 4 x i32> %zn, <vscale x 4 x i1> %pn, i32 3)
@@ -36,12 +26,7 @@ define <vscale x 4 x i32> @test_pmov_to_vector_i32(<vscale x 4 x i32> %zn, <vsca
 define <vscale x 2 x i64> @test_pmov_to_vector_i64(<vscale x 2 x i64> %zn, <vscale x 2 x i1> %pn) {
 ; CHECK-LABEL: test_pmov_to_vector_i64:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    mov w0, #7 // =0x7
-; CHECK-NEXT:    bl llvm.aarch64.sve.pmov.to.vector.lane.merging.nxv2i64
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    pmov z0[7], p0.d
 ; CHECK-NEXT:    ret
   entry:
   %res = call <vscale x 2 x i64> @llvm.aarch64.sve.pmov.to.vector.lane.merging.nxv2i64(<vscale x 2 x i64> %zn, <vscale x 2 x i1> %pn, i32 7)
@@ -54,11 +39,7 @@ define <vscale x 2 x i64> @test_pmov_to_vector_i64(<vscale x 2 x i64> %zn, <vsca
 define <vscale x 16 x i8> @test_pmov_to_vector_zero_i8(<vscale x 16 x i1> %pn) {
 ; CHECK-LABEL: test_pmov_to_vector_zero_i8:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    bl llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv16i8
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    pmov z0, p0.b
 ; CHECK-NEXT:    ret
   entry:
   %res = call <vscale x 16 x i8> @llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv16i8(<vscale x 16 x i1> %pn)
@@ -68,11 +49,7 @@ define <vscale x 16 x i8> @test_pmov_to_vector_zero_i8(<vscale x 16 x i1> %pn) {
 define <vscale x 8 x i16> @test_pmov_to_vector_zero_i16(<vscale x 8 x i1> %pn) {
 ; CHECK-LABEL: test_pmov_to_vector_zero_i16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    bl llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv8i16
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    pmov z0[0], p0.h
 ; CHECK-NEXT:    ret
   entry:
   %res = call <vscale x 8 x i16> @llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv8i16(<vscale x 8 x i1> %pn)
@@ -82,11 +59,7 @@ define <vscale x 8 x i16> @test_pmov_to_vector_zero_i16(<vscale x 8 x i1> %pn) {
 define <vscale x 4 x i32> @test_pmov_to_vector_zero_i32(<vscale x 4 x i1> %pn) {
 ; CHECK-LABEL: test_pmov_to_vector_zero_i32:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    bl llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv4i32
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    pmov z0[0], p0.s
 ; CHECK-NEXT:    ret
   entry:
   %res = call <vscale x 4 x i32> @llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv4i32(<vscale x 4 x i1> %pn)
@@ -96,11 +69,7 @@ define <vscale x 4 x i32> @test_pmov_to_vector_zero_i32(<vscale x 4 x i1> %pn) {
 define <vscale x 2 x i64> @test_pmov_to_vector_zero_i64(<vscale x 2 x i1> %pn) {
 ; CHECK-LABEL: test_pmov_to_vector_zero_i64:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    bl llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv2i64
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    pmov z0[0], p0.d
 ; CHECK-NEXT:    ret
   entry:
   %res = call <vscale x 2 x i64> @llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv2i64(<vscale x 2 x i1> %pn)
diff --git a/llvm/test/CodeGen/AArch64/unwind-preserved.ll b/llvm/test/CodeGen/AArch64/unwind-preserved.ll
index f3c4d217..822be14 100644
--- a/llvm/test/CodeGen/AArch64/unwind-preserved.ll
+++ b/llvm/test/CodeGen/AArch64/unwind-preserved.ll
@@ -63,18 +63,6 @@ define <vscale x 4 x i32> @invoke_callee_may_throw_sve(<vscale x 4 x i32> %v) uw
 ; CHECK-NEXT:    ldr z0, [sp, #1, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    addvl sp, sp, #2
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG
-; CHECK-NEXT:    ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
@@ -91,6 +79,18 @@ define <vscale x 4 x i32> @invoke_callee_may_throw_sve(<vscale x 4 x i32> %v) uw
 ; CHECK-NEXT:    ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    addvl sp, sp, #18
 ; CHECK-NEXT:    .cfi_def_cfa wsp, 16
 ; CHECK-NEXT:    .cfi_restore z8
@@ -112,18 +112,6 @@ define <vscale x 4 x i32> @invoke_callee_may_throw_sve(<vscale x 4 x i32> %v) uw
 ; CHECK-NEXT:    ldr z0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    addvl sp, sp, #2
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG
-; CHECK-NEXT:    ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
@@ -140,6 +128,18 @@ define <vscale x 4 x i32> @invoke_callee_may_throw_sve(<vscale x 4 x i32> %v) uw
 ; CHECK-NEXT:    ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    addvl sp, sp, #18
 ; CHECK-NEXT:    .cfi_def_cfa wsp, 16
 ; CHECK-NEXT:    .cfi_restore z8
@@ -215,18 +215,6 @@ define <vscale x 4 x i32> @invoke_callee_may_throw_sve(<vscale x 4 x i32> %v) uw
 ; GISEL-NEXT:    ldr z0, [sp, #1, mul vl] // 16-byte Folded Reload
 ; GISEL-NEXT:    addvl sp, sp, #2
 ; GISEL-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG
-; GISEL-NEXT:    ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT:    ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT:    ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT:    ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT:    ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT:    ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT:    ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT:    ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT:    ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT:    ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT:    ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
 ; GISEL-NEXT:    ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
 ; GISEL-NEXT:    ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
 ; GISEL-NEXT:    ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
@@ -243,6 +231,18 @@ define <vscale x 4 x i32> @invoke_callee_may_throw_sve(<vscale x 4 x i32> %v) uw
 ; GISEL-NEXT:    ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
 ; GISEL-NEXT:    ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
 ; GISEL-NEXT:    ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; GISEL-NEXT:    ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT:    ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT:    ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT:    ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT:    ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT:    ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT:    ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT:    ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT:    ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT:    ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT:    ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
 ; GISEL-NEXT:    addvl sp, sp, #18
 ; GISEL-NEXT:    .cfi_def_cfa wsp, 16
 ; GISEL-NEXT:    .cfi_restore z8
@@ -264,18 +264,6 @@ define <vscale x 4 x i32> @invoke_callee_may_throw_sve(<vscale x 4 x i32> %v) uw
 ; GISEL-NEXT:    ldr z0, [sp] // 16-byte Folded Reload
 ; GISEL-NEXT:    addvl sp, sp, #2
 ; GISEL-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG
-; GISEL-NEXT:    ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT:    ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT:    ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT:    ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT:    ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT:    ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT:    ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT:    ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT:    ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT:    ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT:    ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
 ; GISEL-NEXT:    ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
 ; GISEL-NEXT:    ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
 ; GISEL-NEXT:    ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
@@ -292,6 +280,18 @@ define <vscale x 4 x i32> @invoke_callee_may_throw_sve(<vscale x 4 x i32> %v) uw
 ; GISEL-NEXT:    ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
 ; GISEL-NEXT:    ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
 ; GISEL-NEXT:    ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; GISEL-NEXT:    ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT:    ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT:    ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT:    ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT:    ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT:    ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT:    ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT:    ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT:    ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT:    ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT:    ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
 ; GISEL-NEXT:    addvl sp, sp, #18
 ; GISEL-NEXT:    .cfi_def_cfa wsp, 16
 ; GISEL-NEXT:    .cfi_restore z8
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-extract-vector-load.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-extract-vector-load.mir
new file mode 100644
index 0000000..aa72a9e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-extract-vector-load.mir
@@ -0,0 +1,40 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn -run-pass=amdgpu-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
+
+# Tries to emit a foldable G_PTR_ADD with (p1, s32) operands.
+---
+name:            test_ptradd_crash__offset_smaller
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: test_ptradd_crash__offset_smaller
+    ; CHECK: [[C:%[0-9]+]]:_(p1) = G_CONSTANT i64 12
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[C]](p1) :: (load (s32), addrspace 1)
+    ; CHECK-NEXT: $sgpr0 = COPY [[LOAD]](s32)
+    ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0
+    %1:_(p1) = G_CONSTANT i64 0
+    %3:_(s32) = G_CONSTANT i32 3
+    %0:_(<4 x s32>) = G_LOAD %1 :: (load (<4 x s32>) from `ptr addrspace(1) null`, addrspace 1)
+    %2:_(s32) = G_EXTRACT_VECTOR_ELT %0, %3
+    $sgpr0 = COPY %2
+    SI_RETURN_TO_EPILOG implicit $sgpr0
+...
+
+# Tries to emit a foldable G_PTR_ADD with (p1, s128) operands.
+---
+name:            test_ptradd_crash__offset_wider
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: test_ptradd_crash__offset_wider
+    ; CHECK: [[C:%[0-9]+]]:_(p1) = G_CONSTANT i64 12
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[C]](p1) :: (load (s32), addrspace 1)
+    ; CHECK-NEXT: $sgpr0 = COPY [[LOAD]](s32)
+    ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0
+    %1:_(p1) = G_CONSTANT i64 0
+    %3:_(s128) = G_CONSTANT i128 3
+    %0:_(<4 x s32>) = G_LOAD %1 :: (load (<4 x s32>) from `ptr addrspace(1) null`, addrspace 1)
+    %2:_(s32) = G_EXTRACT_VECTOR_ELT %0, %3
+    $sgpr0 = COPY %2
+    SI_RETURN_TO_EPILOG implicit $sgpr0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir
new file mode 100644
index 0000000..6c5339e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir
@@ -0,0 +1,584 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
+
+---
+name:            rsq_f16
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; GCN-LABEL: name: rsq_f16
+    ; GCN: liveins: $vgpr0
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32)
+    ; GCN-NEXT: [[INT:%[0-9]+]]:_(s16) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16)
+    ; GCN-NEXT: %ext:_(s32) = G_ANYEXT [[INT]](s16)
+    ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
+    %0:_(s32) = COPY $vgpr0
+    %x:_(s16) = G_TRUNC %0:_(s32)
+    %sqrt:_(s16) = contract G_FSQRT %x
+    %one:_(s16) = G_FCONSTANT half 1.0
+    %rsq:_(s16) = contract G_FDIV %one, %sqrt
+    %ext:_(s32) = G_ANYEXT %rsq:_(s16)
+    $vgpr0 = COPY %ext
+
+...
+
+---
+name:            rsq_f16_missing_contract0
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; GCN-LABEL: name: rsq_f16_missing_contract0
+    ; GCN: liveins: $vgpr0
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32)
+    ; GCN-NEXT: %sqrt:_(s16) = G_FSQRT %x
+    ; GCN-NEXT: %one:_(s16) = G_FCONSTANT half 0xH3C00
+    ; GCN-NEXT: %rsq:_(s16) = contract G_FDIV %one, %sqrt
+    ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16)
+    ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
+    %0:_(s32) = COPY $vgpr0
+    %x:_(s16) = G_TRUNC %0:_(s32)
+    %sqrt:_(s16) = G_FSQRT %x
+    %one:_(s16) = G_FCONSTANT half 1.0
+    %rsq:_(s16) = contract G_FDIV %one, %sqrt
+    %ext:_(s32) = G_ANYEXT %rsq:_(s16)
+    $vgpr0 = COPY %ext
+
+...
+
+---
+name:            rsq_f16_missing_contract1
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; GCN-LABEL: name: rsq_f16_missing_contract1
+    ; GCN: liveins: $vgpr0
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32)
+    ; GCN-NEXT: %sqrt:_(s16) = contract G_FSQRT %x
+    ; GCN-NEXT: %one:_(s16) = G_FCONSTANT half 0xH3C00
+    ; GCN-NEXT: %rsq:_(s16) = G_FDIV %one, %sqrt
+    ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16)
+    ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
+    %0:_(s32) = COPY $vgpr0
+    %x:_(s16) = G_TRUNC %0:_(s32)
+    %sqrt:_(s16) = contract G_FSQRT %x
+    %one:_(s16) = G_FCONSTANT half 1.0
+    %rsq:_(s16) = G_FDIV %one, %sqrt
+    %ext:_(s32) = G_ANYEXT %rsq:_(s16)
+    $vgpr0 = COPY %ext
+
+...
+
+---
+name:            neg_rsq_f16
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; GCN-LABEL: name: neg_rsq_f16
+    ; GCN: liveins: $vgpr0
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32)
+    ; GCN-NEXT: [[INT:%[0-9]+]]:_(s16) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16)
+    ; GCN-NEXT: %rsq:_(s16) = contract G_FNEG [[INT]]
+    ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16)
+    ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
+    %0:_(s32) = COPY $vgpr0
+    %x:_(s16) = G_TRUNC %0:_(s32)
+    %sqrt:_(s16) = contract G_FSQRT %x
+    %neg_one:_(s16) = G_FCONSTANT half -1.0
+    %rsq:_(s16) = contract G_FDIV %neg_one, %sqrt
+    %ext:_(s32) = G_ANYEXT %rsq:_(s16)
+    $vgpr0 = COPY %ext
+
+...
+
+---
+name:            neg_rsq_f16_missing_contract0
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; GCN-LABEL: name: neg_rsq_f16_missing_contract0
+    ; GCN: liveins: $vgpr0
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32)
+    ; GCN-NEXT: %sqrt:_(s16) = G_FSQRT %x
+    ; GCN-NEXT: %neg_one:_(s16) = G_FCONSTANT half 0xHBC00
+    ; GCN-NEXT: %rsq:_(s16) = contract G_FDIV %neg_one, %sqrt
+    ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16)
+    ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
+    %0:_(s32) = COPY $vgpr0
+    %x:_(s16) = G_TRUNC %0:_(s32)
+    %sqrt:_(s16) = G_FSQRT %x
+    %neg_one:_(s16) = G_FCONSTANT half -1.0
+    %rsq:_(s16) = contract G_FDIV %neg_one, %sqrt
+    %ext:_(s32) = G_ANYEXT %rsq:_(s16)
+    $vgpr0 = COPY %ext
+
+...
+
+---
+name:            neg_rsq_f16_missing_contract1
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; GCN-LABEL: name: neg_rsq_f16_missing_contract1
+    ; GCN: liveins: $vgpr0
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32)
+    ; GCN-NEXT: %sqrt:_(s16) = contract G_FSQRT %x
+    ; GCN-NEXT: %neg_one:_(s16) = G_FCONSTANT half 0xHBC00
+    ; GCN-NEXT: %rsq:_(s16) = G_FDIV %neg_one, %sqrt
+    ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16)
+    ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
+    %0:_(s32) = COPY $vgpr0
+    %x:_(s16) = G_TRUNC %0:_(s32)
+    %sqrt:_(s16) = contract G_FSQRT %x
+    %neg_one:_(s16) = G_FCONSTANT half -1.0
+    %rsq:_(s16) = G_FDIV %neg_one, %sqrt
+    %ext:_(s32) = G_ANYEXT %rsq:_(s16)
+    $vgpr0 = COPY %ext
+
+...
+
+---
+name:            rsq_f16_multi_use
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; GCN-LABEL: name: rsq_f16_multi_use
+    ; GCN: liveins: $vgpr0
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32)
+    ; GCN-NEXT: %sqrt:_(s16) = contract G_FSQRT %x
+    ; GCN-NEXT: %one:_(s16) = G_FCONSTANT half 0xH3C00
+    ; GCN-NEXT: %rsq:_(s16) = contract G_FDIV %one, %sqrt
+    ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16)
+    ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
+    ; GCN-NEXT: S_ENDPGM 0, implicit %sqrt(s16)
+    %0:_(s32) = COPY $vgpr0
+    %x:_(s16) = G_TRUNC %0:_(s32)
+    %sqrt:_(s16) = contract G_FSQRT %x
+    %one:_(s16) = G_FCONSTANT half 1.0
+    %rsq:_(s16) = contract G_FDIV %one, %sqrt
+    %ext:_(s32) = G_ANYEXT %rsq:_(s16)
+    $vgpr0 = COPY %ext
+    S_ENDPGM 0, implicit %sqrt
+
+...
+
+---
+name:            rsq_f16_multi_use_missing_contract0
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; GCN-LABEL: name: rsq_f16_multi_use_missing_contract0
+    ; GCN: liveins: $vgpr0
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32)
+    ; GCN-NEXT: %sqrt:_(s16) = G_FSQRT %x
+    ; GCN-NEXT: %one:_(s16) = G_FCONSTANT half 0xH3C00
+    ; GCN-NEXT: %rsq:_(s16) = contract G_FDIV %one, %sqrt
+    ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16)
+    ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
+    ; GCN-NEXT: S_ENDPGM 0, implicit %sqrt(s16)
+    %0:_(s32) = COPY $vgpr0
+    %x:_(s16) = G_TRUNC %0:_(s32)
+    %sqrt:_(s16) = G_FSQRT %x
+    %one:_(s16) = G_FCONSTANT half 1.0
+    %rsq:_(s16) = contract G_FDIV %one, %sqrt
+    %ext:_(s32) = G_ANYEXT %rsq:_(s16)
+    $vgpr0 = COPY %ext
+    S_ENDPGM 0, implicit %sqrt
+
+...
+
+---
+name:            rsq_f16_multi_use_missing_contract1
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; GCN-LABEL: name: rsq_f16_multi_use_missing_contract1
+    ; GCN: liveins: $vgpr0
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32)
+    ; GCN-NEXT: %sqrt:_(s16) = contract G_FSQRT %x
+    ; GCN-NEXT: %one:_(s16) = G_FCONSTANT half 0xH3C00
+    ; GCN-NEXT: %rsq:_(s16) = G_FDIV %one, %sqrt
+    ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16)
+    ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
+    ; GCN-NEXT: S_ENDPGM 0, implicit %sqrt(s16)
+    %0:_(s32) = COPY $vgpr0
+    %x:_(s16) = G_TRUNC %0:_(s32)
+    %sqrt:_(s16) = contract G_FSQRT %x
+    %one:_(s16) = G_FCONSTANT half 1.0
+    %rsq:_(s16) = G_FDIV %one, %sqrt
+    %ext:_(s32) = G_ANYEXT %rsq:_(s16)
+    $vgpr0 = COPY %ext
+    S_ENDPGM 0, implicit %sqrt
+
+...
+
+---
+name:            rsq_f32
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; GCN-LABEL: name: rsq_f32
+    ; GCN: liveins: $vgpr0
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: %x:_(s32) = COPY $vgpr0
+    ; GCN-NEXT: %sqrt:_(s32) = contract G_FSQRT %x
+    ; GCN-NEXT: %one:_(s32) = G_FCONSTANT float 1.000000e+00
+    ; GCN-NEXT: %rsq:_(s32) = contract G_FDIV %one, %sqrt
+    ; GCN-NEXT: $vgpr0 = COPY %rsq(s32)
+    %x:_(s32) = COPY $vgpr0
+    %sqrt:_(s32) = contract G_FSQRT %x
+    %one:_(s32) = G_FCONSTANT float 1.0
+    %rsq:_(s32) = contract G_FDIV %one, %sqrt
+    $vgpr0 = COPY %rsq
+
+...
+
+---
+name:            neg_rsq_f32
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; GCN-LABEL: name: neg_rsq_f32
+    ; GCN: liveins: $vgpr0
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: %x:_(s32) = COPY $vgpr0
+    ; GCN-NEXT: %sqrt:_(s32) = contract G_FSQRT %x
+    ; GCN-NEXT: %neg_one:_(s32) = G_FCONSTANT float -1.000000e+00
+    ; GCN-NEXT: %rsq:_(s32) = contract G_FDIV %neg_one, %sqrt
+    ; GCN-NEXT: $vgpr0 = COPY %rsq(s32)
+    %x:_(s32) = COPY $vgpr0
+    %sqrt:_(s32) = contract G_FSQRT %x
+    %neg_one:_(s32) = G_FCONSTANT float -1.0
+    %rsq:_(s32) = contract G_FDIV %neg_one, %sqrt
+    $vgpr0 = COPY %rsq
+
+...
+
+---
+name:            afn_rsq_f32
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; GCN-LABEL: name: afn_rsq_f32
+    ; GCN: liveins: $vgpr0
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: %x:_(s32) = COPY $vgpr0
+    ; GCN-NEXT: %sqrt:_(s32) = contract afn G_FSQRT %x
+    ; GCN-NEXT: %one:_(s32) = G_FCONSTANT float 1.000000e+00
+    ; GCN-NEXT: %rsq:_(s32) = contract afn G_FDIV %one, %sqrt
+    ; GCN-NEXT: $vgpr0 = COPY %rsq(s32)
+    %x:_(s32) = COPY $vgpr0
+    %sqrt:_(s32) = contract afn G_FSQRT %x
+    %one:_(s32) = G_FCONSTANT float 1.0
+    %rsq:_(s32) = contract afn G_FDIV %one, %sqrt
+    $vgpr0 = COPY %rsq
+
+...
+
+---
+name:            afn_rsq_f32_multi_use
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; GCN-LABEL: name: afn_rsq_f32_multi_use
+    ; GCN: liveins: $vgpr0
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: %x:_(s32) = COPY $vgpr0
+    ; GCN-NEXT: %sqrt:_(s32) = contract afn G_FSQRT %x
+    ; GCN-NEXT: %one:_(s32) = G_FCONSTANT float 1.000000e+00
+    ; GCN-NEXT: %rsq:_(s32) = contract afn G_FDIV %one, %sqrt
+    ; GCN-NEXT: %ret:_(s32) = G_FSUB %sqrt, %rsq
+    ; GCN-NEXT: $vgpr0 = COPY %ret(s32)
+    %x:_(s32) = COPY $vgpr0
+    %sqrt:_(s32) = contract afn G_FSQRT %x
+    %one:_(s32) = G_FCONSTANT float 1.0
+    %rsq:_(s32) = contract afn G_FDIV %one, %sqrt
+    %ret:_(s32) = G_FSUB %sqrt, %rsq
+    $vgpr0 = COPY %ret
+
+...
+
+---
+name:            afn_neg_rsq_f32
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; GCN-LABEL: name: afn_neg_rsq_f32
+    ; GCN: liveins: $vgpr0
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: %x:_(s32) = COPY $vgpr0
+    ; GCN-NEXT: %sqrt:_(s32) = contract afn G_FSQRT %x
+    ; GCN-NEXT: %neg_one:_(s32) = G_FCONSTANT float -1.000000e+00
+    ; GCN-NEXT: %rsq:_(s32) = contract afn G_FDIV %neg_one, %sqrt
+    ; GCN-NEXT: $vgpr0 = COPY %rsq(s32)
+    %x:_(s32) = COPY $vgpr0
+    %sqrt:_(s32) = contract afn G_FSQRT %x
+    %neg_one:_(s32) = G_FCONSTANT float -1.0
+    %rsq:_(s32) = contract afn G_FDIV %neg_one, %sqrt
+    $vgpr0 = COPY %rsq
+
+...
+
+
+---
+name:            rsq_f64
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; GCN-LABEL: name: rsq_f64
+    ; GCN: liveins: $vgpr0
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; GCN-NEXT: %x:_(s64) = G_ANYEXT [[COPY]](s32)
+    ; GCN-NEXT: %sqrt:_(s64) = contract G_FSQRT %x
+    ; GCN-NEXT: %one:_(s64) = G_FCONSTANT double 1.000000e+00
+    ; GCN-NEXT: %rsq:_(s64) = contract G_FDIV %one, %sqrt
+    ; GCN-NEXT: %ext:_(s32) = G_TRUNC %rsq(s64)
+    ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
+    %0:_(s32) = COPY $vgpr0
+    %x:_(s64) = G_ANYEXT %0:_(s32)
+    %sqrt:_(s64) = contract G_FSQRT %x
+    %one:_(s64) = G_FCONSTANT double 1.0
+    %rsq:_(s64) = contract G_FDIV %one, %sqrt
+    %ext:_(s32) = G_TRUNC %rsq:_(s64)
+    $vgpr0 = COPY %ext
+
+...
+
+---
+name:            neg_rsq_f64
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; GCN-LABEL: name: neg_rsq_f64
+    ; GCN: liveins: $vgpr0
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; GCN-NEXT: %x:_(s64) = G_ANYEXT [[COPY]](s32)
+    ; GCN-NEXT: %sqrt:_(s64) = contract G_FSQRT %x
+    ; GCN-NEXT: %neg_one:_(s64) = G_FCONSTANT double -1.000000e+00
+    ; GCN-NEXT: %rsq:_(s64) = contract G_FDIV %neg_one, %sqrt
+    ; GCN-NEXT: %ext:_(s32) = G_TRUNC %rsq(s64)
+    ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
+    %0:_(s32) = COPY $vgpr0
+    %x:_(s64) = G_ANYEXT %0:_(s32)
+    %sqrt:_(s64) = contract G_FSQRT %x
+    %neg_one:_(s64) = G_FCONSTANT double -1.0
+    %rsq:_(s64) = contract G_FDIV %neg_one, %sqrt
+    %ext:_(s32) = G_TRUNC %rsq:_(s64)
+    $vgpr0 = COPY %ext
+
+...
+
+---
+name:            afn_rsq_f64
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; GCN-LABEL: name: afn_rsq_f64
+    ; GCN: liveins: $vgpr0
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; GCN-NEXT: %x:_(s64) = G_ANYEXT [[COPY]](s32)
+    ; GCN-NEXT: %sqrt:_(s64) = contract afn G_FSQRT %x
+    ; GCN-NEXT: %one:_(s64) = G_FCONSTANT double 1.000000e+00
+    ; GCN-NEXT: %rsq:_(s64) = contract afn G_FDIV %one, %sqrt
+    ; GCN-NEXT: %ext:_(s32) = G_TRUNC %rsq(s64)
+    ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
+    %0:_(s32) = COPY $vgpr0
+    %x:_(s64) = G_ANYEXT %0:_(s32)
+    %sqrt:_(s64) = contract afn G_FSQRT %x
+    %one:_(s64) = G_FCONSTANT double 1.0
+    %rsq:_(s64) = contract afn G_FDIV %one, %sqrt
+    %ext:_(s32) = G_TRUNC %rsq:_(s64)
+    $vgpr0 = COPY %ext
+
+...
+
+---
+name:            afn_neg_rsq_f64
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; GCN-LABEL: name: afn_neg_rsq_f64
+    ; GCN: liveins: $vgpr0
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; GCN-NEXT: %x:_(s64) = G_ANYEXT [[COPY]](s32)
+    ; GCN-NEXT: %sqrt:_(s64) = contract afn G_FSQRT %x
+    ; GCN-NEXT: %neg_one:_(s64) = G_FCONSTANT double -1.000000e+00
+    ; GCN-NEXT: %rsq:_(s64) = contract afn G_FDIV %neg_one, %sqrt
+    ; GCN-NEXT: %ext:_(s32) = G_TRUNC %rsq(s64)
+    ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
+    %0:_(s32) = COPY $vgpr0
+    %x:_(s64) = G_ANYEXT %0:_(s32)
+    %sqrt:_(s64) = contract afn G_FSQRT %x
+    %neg_one:_(s64) = G_FCONSTANT double -1.0
+    %rsq:_(s64) = contract afn G_FDIV %neg_one, %sqrt
+    %ext:_(s32) = G_TRUNC %rsq:_(s64)
+    $vgpr0 = COPY %ext
+
+...
+
+
+---
+name:            rsq_fract_num_f16
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; GCN-LABEL: name: rsq_fract_num_f16
+    ; GCN: liveins: $vgpr0
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32)
+    ; GCN-NEXT: %fract:_(s16) = G_FCONSTANT half 0xH3800
+    ; GCN-NEXT: [[INT:%[0-9]+]]:_(s16) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16)
+    ; GCN-NEXT: %rsq:_(s16) = contract G_FMUL [[INT]], %fract
+    ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16)
+    ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
+    %0:_(s32) = COPY $vgpr0
+    %x:_(s16) = G_TRUNC %0:_(s32)
+    %sqrt:_(s16) = contract G_FSQRT %x
+    %fract:_(s16) = G_FCONSTANT half 0.5
+    %rsq:_(s16) = contract G_FDIV %fract, %sqrt
+    %ext:_(s32) = G_ANYEXT %rsq:_(s16)
+    $vgpr0 = COPY %ext
+
+...
+
+---
+name:            neg_rsq_fract_num_f16
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; GCN-LABEL: name: neg_rsq_fract_num_f16
+    ; GCN: liveins: $vgpr0
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32)
+    ; GCN-NEXT: %neg_fract:_(s16) = G_FCONSTANT half 0xHB800
+    ; GCN-NEXT: [[INT:%[0-9]+]]:_(s16) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16)
+    ; GCN-NEXT: %rsq:_(s16) = contract G_FMUL [[INT]], %neg_fract
+    ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16)
+    ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
+    %0:_(s32) = COPY $vgpr0
+    %x:_(s16) = G_TRUNC %0:_(s32)
+    %sqrt:_(s16) = contract G_FSQRT %x
+    %neg_fract:_(s16) = G_FCONSTANT half -0.5
+    %rsq:_(s16) = contract G_FDIV %neg_fract, %sqrt
+    %ext:_(s32) = G_ANYEXT %rsq:_(s16)
+    $vgpr0 = COPY %ext
+
+
+...
+
+---
+name:            rsq_large_num_f16
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; GCN-LABEL: name: rsq_large_num_f16
+    ; GCN: liveins: $vgpr0
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32)
+    ; GCN-NEXT: %ten:_(s16) = G_FCONSTANT half 0xH4900
+    ; GCN-NEXT: [[INT:%[0-9]+]]:_(s16) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16)
+    ; GCN-NEXT: %rsq:_(s16) = contract G_FMUL [[INT]], %ten
+    ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16)
+    ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
+    %0:_(s32) = COPY $vgpr0
+    %x:_(s16) = G_TRUNC %0:_(s32)
+    %sqrt:_(s16) = contract G_FSQRT %x
+    %ten:_(s16) = G_FCONSTANT half 10.0
+    %rsq:_(s16) = contract G_FDIV %ten, %sqrt
+    %ext:_(s32) = G_ANYEXT %rsq:_(s16)
+    $vgpr0 = COPY %ext
+
+...
+
+---
+name:            neg_rsq_large_num_f16
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; GCN-LABEL: name: neg_rsq_large_num_f16
+    ; GCN: liveins: $vgpr0
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32)
+    ; GCN-NEXT: %neg_ten:_(s16) = G_FCONSTANT half 0xHC900
+    ; GCN-NEXT: [[INT:%[0-9]+]]:_(s16) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16)
+    ; GCN-NEXT: %rsq:_(s16) = contract G_FMUL [[INT]], %neg_ten
+    ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16)
+    ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
+    %0:_(s32) = COPY $vgpr0
+    %x:_(s16) = G_TRUNC %0:_(s32)
+    %sqrt:_(s16) = contract G_FSQRT %x
+    %neg_ten:_(s16) = G_FCONSTANT half -10.0
+    %rsq:_(s16) = contract G_FDIV %neg_ten, %sqrt
+    %ext:_(s32) = G_ANYEXT %rsq:_(s16)
+    $vgpr0 = COPY %ext
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 63a09e4..8ec7dfd 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -2182,9 +2182,8 @@ define void @test_load_store_f32_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xff800000, v0
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v4, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -2199,9 +2198,8 @@ define void @test_load_store_f32_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xff800000, v0
+; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v0
 ; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v4
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v4, vcc
 ; GFX9-NEXT:    global_store_short_d16_hi v[2:3], v0, off
@@ -2212,10 +2210,9 @@ define void @test_load_store_f32_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v0, v[0:1], off
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v4, v0, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v4, 0x400000, v0
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v4, vcc_lo
@@ -2226,10 +2223,9 @@ define void @test_load_store_f32_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:    s_mov_b32 s0, 0xff800000
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v4, v0, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v4, 0x400000, v0
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
@@ -2294,7 +2290,6 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v5
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7fff, v4
 ; GFX8-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xff800000, v5
 ; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v5
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -2323,7 +2318,6 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
 ; GFX9-NEXT:    v_and_or_b32 v5, v1, s8, v4
 ; GFX9-NEXT:    v_bfe_u32 v4, v4, 16, 1
 ; GFX9-NEXT:    v_add3_u32 v4, v4, v5, s9
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xff800000, v5
 ; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v5
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
 ; GFX9-NEXT:    global_store_short_d16_hi v[2:3], v0, off
@@ -2343,14 +2337,13 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
 ; GFX10-NEXT:    v_cmp_nlg_f64_e64 s4, |v[0:1]|, v[4:5]
 ; GFX10-NEXT:    v_cndmask_b32_e64 v4, -1, 1, s5
 ; GFX10-NEXT:    s_or_b32 vcc_lo, s4, vcc_lo
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
 ; GFX10-NEXT:    v_add_nc_u32_e32 v4, v6, v4
 ; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f64_e32 vcc_lo, v[0:1], v[0:1]
 ; GFX10-NEXT:    v_and_or_b32 v5, 0x80000000, v1, v4
 ; GFX10-NEXT:    v_bfe_u32 v4, v4, 16, 1
 ; GFX10-NEXT:    v_add3_u32 v4, v4, v5, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v5, v5, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v5
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc_lo
 ; GFX10-NEXT:    global_store_short_d16_hi v[2:3], v0, off
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -2369,9 +2362,8 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
 ; GFX11-NEXT:    v_cmp_gt_f64_e64 s1, |v[0:1]|, v[4:5]
 ; GFX11-NEXT:    v_cmp_nlg_f64_e64 s0, |v[0:1]|, v[4:5]
 ; GFX11-NEXT:    v_cndmask_b32_e64 v4, -1, 1, s1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    s_or_b32 vcc_lo, s0, vcc_lo
-; GFX11-NEXT:    s_mov_b32 s0, 0xff800000
 ; GFX11-NEXT:    v_add_nc_u32_e32 v4, v6, v4
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc_lo
@@ -2380,7 +2372,7 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
 ; GFX11-NEXT:    v_bfe_u32 v4, v4, 16, 1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_add3_u32 v4, v4, v5, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v5, v5, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v5
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc_lo
 ; GFX11-NEXT:    global_store_d16_hi_b16 v[2:3], v0, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
@@ -8999,8 +8991,7 @@ define bfloat @v_fadd_bf16(bfloat %a, bfloat %b) {
 ; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -9014,9 +9005,8 @@ define bfloat @v_fadd_bf16(bfloat %a, bfloat %b) {
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xff800000, v0
 ; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -9027,10 +9017,9 @@ define bfloat @v_fadd_bf16(bfloat %a, bfloat %b) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v2, v0, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
@@ -9042,11 +9031,10 @@ define bfloat @v_fadd_bf16(bfloat %a, bfloat %b) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:    s_mov_b32 s0, 0xff800000
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v2, v0, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
@@ -9104,16 +9092,14 @@ define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xff800000, v2
 ; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v2
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v3
+; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -9126,20 +9112,18 @@ define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX9-NEXT:    v_add_f32_e32 v2, v3, v2
-; GFX9-NEXT:    v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xff800000, v2
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_add3_u32 v3, v3, v2, s4
-; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v4
+; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v2
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
 ; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xff800000, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
 ; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v3
+; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
@@ -9153,14 +9137,13 @@ define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
 ; GFX10-NEXT:    v_add_f32_e32 v2, v3, v2
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_bfe_u32 v1, v2, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v4, v2, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v4, 0x400000, v2
 ; GFX10-NEXT:    v_bfe_u32 v3, v0, 16, 1
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX10-NEXT:    v_and_or_b32 v5, v0, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v0
 ; GFX10-NEXT:    v_add3_u32 v1, v1, v2, 0x7fff
 ; GFX10-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
@@ -9176,16 +9159,15 @@ define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    s_mov_b32 s0, 0xff800000
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_add_f32_e32 v2, v3, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_bfe_u32 v3, v0, 16, 1
 ; GFX11-NEXT:    v_bfe_u32 v1, v2, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v4, v2, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v4, 0x400000, v2
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT:    v_and_or_b32 v5, v0, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v0
 ; GFX11-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
 ; GFX11-NEXT:    v_add3_u32 v1, v1, v2, 0x7fff
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
@@ -9255,8 +9237,7 @@ define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
 ; GFX8-NEXT:    v_bfe_u32 v3, v1, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v1
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xff800000, v1
-; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v1
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
@@ -9268,16 +9249,14 @@ define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, s4, v4
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xff800000, v3
 ; GFX8-NEXT:    v_add_f32_e32 v0, v0, v2
-; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v3
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -9293,27 +9272,24 @@ define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
 ; GFX9-NEXT:    v_add_f32_e32 v1, v1, v3
 ; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xff800000, v1
 ; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v4
+; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v1
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
 ; GFX9-NEXT:    v_add_f32_e32 v3, v4, v3
-; GFX9-NEXT:    v_bfe_u32 v4, v3, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xff800000, v3
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_bfe_u32 v4, v3, 16, 1
+; GFX9-NEXT:    v_add_f32_e32 v0, v0, v2
 ; GFX9-NEXT:    v_add3_u32 v4, v4, v3, s4
-; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v3
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT:    v_add_f32_e32 v0, v0, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
 ; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xff800000, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
 ; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v4
+; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
@@ -9331,18 +9307,17 @@ define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_add_f32_e32 v4, v5, v4
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_add_f32_e32 v1, v1, v3
 ; GFX10-NEXT:    v_bfe_u32 v2, v4, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v7, v4, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v7, 0x400000, v4
 ; GFX10-NEXT:    v_bfe_u32 v5, v0, 16, 1
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
 ; GFX10-NEXT:    v_bfe_u32 v3, v1, 16, 1
 ; GFX10-NEXT:    v_add3_u32 v2, v2, v4, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v8, v0, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v0
 ; GFX10-NEXT:    v_add3_u32 v5, v5, v0, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v6, v1, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v6, 0x400000, v1
 ; GFX10-NEXT:    v_add3_u32 v3, v3, v1, 0x7fff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
@@ -9426,17 +9401,15 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x7fff, v5
-; GFX8-NEXT:    v_and_b32_e32 v6, 0xff800000, v4
 ; GFX8-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX8-NEXT:    v_or_b32_e32 v6, 0x400000, v6
+; GFX8-NEXT:    v_or_b32_e32 v6, 0x400000, v4
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX8-NEXT:    v_bfe_u32 v3, v1, 16, 1
 ; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v5, v6, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v1
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, s4, v3
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xff800000, v1
-; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v1
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
@@ -9447,16 +9420,14 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s4, v5
-; GFX8-NEXT:    v_and_b32_e32 v6, 0xff800000, v3
 ; GFX8-NEXT:    v_add_f32_e32 v0, v0, v2
-; GFX8-NEXT:    v_or_b32_e32 v6, 0x400000, v6
+; GFX8-NEXT:    v_or_b32_e32 v6, 0x400000, v3
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v5, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
@@ -9471,38 +9442,34 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX9-NEXT:    v_add_f32_e32 v4, v5, v4
-; GFX9-NEXT:    v_bfe_u32 v5, v4, 16, 1
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_and_b32_e32 v6, 0xff800000, v4
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_bfe_u32 v5, v4, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_add_f32_e32 v1, v1, v3
 ; GFX9-NEXT:    v_add3_u32 v5, v5, v4, s4
-; GFX9-NEXT:    v_or_b32_e32 v6, 0x400000, v6
+; GFX9-NEXT:    v_or_b32_e32 v6, 0x400000, v4
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
-; GFX9-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v6, vcc
 ; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xff800000, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v6, vcc
 ; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v1
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX9-NEXT:    v_add_f32_e32 v3, v5, v3
-; GFX9-NEXT:    v_bfe_u32 v5, v3, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v6, 0xff800000, v3
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_bfe_u32 v5, v3, 16, 1
+; GFX9-NEXT:    v_add_f32_e32 v0, v0, v2
 ; GFX9-NEXT:    v_add3_u32 v5, v5, v3, s4
-; GFX9-NEXT:    v_or_b32_e32 v6, 0x400000, v6
+; GFX9-NEXT:    v_or_b32_e32 v6, 0x400000, v3
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT:    v_add_f32_e32 v0, v0, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
 ; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xff800000, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
 ; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v5, vcc
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
@@ -9523,31 +9490,30 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX10-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX10-NEXT:    v_add_f32_e32 v5, v7, v6
-; GFX10-NEXT:    v_bfe_u32 v3, v4, 16, 1
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
+; GFX10-NEXT:    v_add_f32_e32 v3, v7, v6
+; GFX10-NEXT:    v_bfe_u32 v5, v4, 16, 1
+; GFX10-NEXT:    v_or_b32_e32 v7, 0x400000, v4
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v2
-; GFX10-NEXT:    v_and_or_b32 v6, v4, s4, 0x400000
-; GFX10-NEXT:    v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT:    v_add3_u32 v3, v3, v4, 0x7fff
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX10-NEXT:    v_bfe_u32 v8, v0, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v6, v3, 16, 1
+; GFX10-NEXT:    v_add3_u32 v5, v5, v4, 0x7fff
 ; GFX10-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX10-NEXT:    v_add3_u32 v4, v7, v5, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v9, v1, s4, 0x400000
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc_lo
-; GFX10-NEXT:    v_and_or_b32 v6, v5, s4, 0x400000
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT:    v_bfe_u32 v8, v0, 16, 1
+; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v1
+; GFX10-NEXT:    v_add3_u32 v6, v6, v3, 0x7fff
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v5, v7, vcc_lo
+; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v3
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX10-NEXT:    v_add3_u32 v7, v8, v0, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v8, v0, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v0
 ; GFX10-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v6, v5, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v7, v8, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT:    v_perm_b32 v0, v0, v4, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v0, v0, v3, 0x7060302
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v2, v9, vcc_lo
-; GFX10-NEXT:    v_perm_b32 v1, v1, v3, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fadd_v4bf16:
@@ -9555,45 +9521,42 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    s_mov_b32 s0, 0xff800000
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_add_f32 v0, v0, v2 :: v_dual_and_b32 v3, 0xffff0000, v3
-; GFX11-NEXT:    v_add_f32_e32 v4, v5, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_add_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX11-NEXT:    v_bfe_u32 v8, v0, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v3, v4, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v5, v7, v6
-; GFX11-NEXT:    v_and_or_b32 v6, v4, s0, 0x400000
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-NEXT:    v_dual_add_f32 v3, v7, v6 :: v_dual_add_f32 v4, v5, v4
 ; GFX11-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX11-NEXT:    v_add3_u32 v3, v3, v4, 0x7fff
-; GFX11-NEXT:    v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v9, v1, s0, 0x400000
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_bfe_u32 v6, v3, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v5, v4, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v7, 0x400000, v4
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
 ; GFX11-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add3_u32 v4, v7, v5, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v6, v5, s0, 0x400000
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT:    v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v5, v5, v4, 0x7fff
+; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v4, v5, v7, vcc_lo
+; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v3
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX11-NEXT:    v_add3_u32 v7, v8, v0, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v8, v0, s0, 0x400000
-; GFX11-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc_lo
+; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v0
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, v6, v5, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v7, v8, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT:    v_perm_b32 v0, v0, v3, 0x7060302
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v2, v9, vcc_lo
-; GFX11-NEXT:    v_perm_b32 v0, v0, v4, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v1, v1, v3, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fadd <4 x bfloat> %a, %b
   ret <4 x bfloat> %op
@@ -9717,17 +9680,15 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 0x7fff, v9
-; GFX8-NEXT:    v_and_b32_e32 v10, 0xff800000, v8
 ; GFX8-NEXT:    v_add_f32_e32 v3, v3, v7
-; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v10
+; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v8
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
 ; GFX8-NEXT:    v_bfe_u32 v7, v3, 16, 1
 ; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX8-NEXT:    v_cndmask_b32_e32 v8, v9, v10, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v3
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, s4, v7
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xff800000, v3
-; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v9
+; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v3
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v7, v9, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
@@ -9738,16 +9699,14 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s4, v9
-; GFX8-NEXT:    v_and_b32_e32 v10, 0xff800000, v7
 ; GFX8-NEXT:    v_add_f32_e32 v2, v2, v6
-; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v10
+; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v7
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
 ; GFX8-NEXT:    v_bfe_u32 v6, v2, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v7, v9, v10, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v2
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, s4, v6
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xff800000, v2
-; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v9
+; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v2
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v6, v9, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
@@ -9758,16 +9717,14 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s4, v9
-; GFX8-NEXT:    v_and_b32_e32 v10, 0xff800000, v6
 ; GFX8-NEXT:    v_add_f32_e32 v1, v1, v5
-; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v10
+; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v6
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX8-NEXT:    v_bfe_u32 v5, v1, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v1
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s4, v5
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xff800000, v1
-; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v9
+; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v1
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v9, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
@@ -9778,16 +9735,14 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s4, v9
-; GFX8-NEXT:    v_and_b32_e32 v10, 0xff800000, v5
 ; GFX8-NEXT:    v_add_f32_e32 v0, v0, v4
-; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v10
+; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v5
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX8-NEXT:    v_bfe_u32 v4, v0, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v0
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7fff, v4
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v9
+; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v9, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
@@ -9806,74 +9761,66 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
 ; GFX9-NEXT:    v_add_f32_e32 v8, v9, v8
-; GFX9-NEXT:    v_bfe_u32 v9, v8, 16, 1
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_and_b32_e32 v10, 0xff800000, v8
 ; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT:    v_bfe_u32 v9, v8, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_add_f32_e32 v3, v3, v7
 ; GFX9-NEXT:    v_add3_u32 v9, v9, v8, s4
-; GFX9-NEXT:    v_or_b32_e32 v10, 0x400000, v10
+; GFX9-NEXT:    v_or_b32_e32 v10, 0x400000, v8
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
-; GFX9-NEXT:    v_add_f32_e32 v3, v3, v7
-; GFX9-NEXT:    v_cndmask_b32_e32 v8, v9, v10, vcc
 ; GFX9-NEXT:    v_bfe_u32 v7, v3, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v9, 0xff800000, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v9, v10, vcc
 ; GFX9-NEXT:    v_add3_u32 v7, v7, v3, s4
-; GFX9-NEXT:    v_or_b32_e32 v9, 0x400000, v9
+; GFX9-NEXT:    v_or_b32_e32 v9, 0x400000, v3
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v9, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
 ; GFX9-NEXT:    v_add_f32_e32 v7, v9, v7
-; GFX9-NEXT:    v_bfe_u32 v9, v7, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v10, 0xff800000, v7
 ; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT:    v_bfe_u32 v9, v7, 16, 1
+; GFX9-NEXT:    v_add_f32_e32 v2, v2, v6
 ; GFX9-NEXT:    v_add3_u32 v9, v9, v7, s4
-; GFX9-NEXT:    v_or_b32_e32 v10, 0x400000, v10
+; GFX9-NEXT:    v_or_b32_e32 v10, 0x400000, v7
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
-; GFX9-NEXT:    v_add_f32_e32 v2, v2, v6
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, v9, v10, vcc
 ; GFX9-NEXT:    v_bfe_u32 v6, v2, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v9, 0xff800000, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v9, v10, vcc
 ; GFX9-NEXT:    v_add3_u32 v6, v6, v2, s4
-; GFX9-NEXT:    v_or_b32_e32 v9, 0x400000, v9
+; GFX9-NEXT:    v_or_b32_e32 v9, 0x400000, v2
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v6, v9, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
 ; GFX9-NEXT:    v_add_f32_e32 v6, v9, v6
-; GFX9-NEXT:    v_bfe_u32 v9, v6, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v10, 0xff800000, v6
 ; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_bfe_u32 v9, v6, 16, 1
+; GFX9-NEXT:    v_add_f32_e32 v1, v1, v5
 ; GFX9-NEXT:    v_add3_u32 v9, v9, v6, s4
-; GFX9-NEXT:    v_or_b32_e32 v10, 0x400000, v10
+; GFX9-NEXT:    v_or_b32_e32 v10, 0x400000, v6
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
-; GFX9-NEXT:    v_add_f32_e32 v1, v1, v5
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
 ; GFX9-NEXT:    v_bfe_u32 v5, v1, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v9, 0xff800000, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
 ; GFX9-NEXT:    v_add3_u32 v5, v5, v1, s4
-; GFX9-NEXT:    v_or_b32_e32 v9, 0x400000, v9
+; GFX9-NEXT:    v_or_b32_e32 v9, 0x400000, v1
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v9, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
 ; GFX9-NEXT:    v_add_f32_e32 v5, v9, v5
-; GFX9-NEXT:    v_bfe_u32 v9, v5, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v10, 0xff800000, v5
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_bfe_u32 v9, v5, 16, 1
+; GFX9-NEXT:    v_add_f32_e32 v0, v0, v4
 ; GFX9-NEXT:    v_add3_u32 v9, v9, v5, s4
-; GFX9-NEXT:    v_or_b32_e32 v10, 0x400000, v10
+; GFX9-NEXT:    v_or_b32_e32 v10, 0x400000, v5
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
-; GFX9-NEXT:    v_add_f32_e32 v0, v0, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
 ; GFX9-NEXT:    v_bfe_u32 v4, v0, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v9, 0xff800000, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
 ; GFX9-NEXT:    v_add3_u32 v4, v4, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v9, 0x400000, v9
+; GFX9-NEXT:    v_or_b32_e32 v9, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v9, vcc
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
@@ -9890,62 +9837,61 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
 ; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
 ; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX10-NEXT:    v_add_f32_e32 v8, v9, v8
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
-; GFX10-NEXT:    v_add_f32_e32 v3, v3, v7
 ; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
-; GFX10-NEXT:    v_bfe_u32 v10, v8, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v7, v8, s4, 0x400000
-; GFX10-NEXT:    v_add_f32_e32 v9, v11, v9
-; GFX10-NEXT:    v_bfe_u32 v11, v3, 16, 1
+; GFX10-NEXT:    v_add_f32_e32 v3, v3, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v0
+; GFX10-NEXT:    v_bfe_u32 v11, v8, 16, 1
+; GFX10-NEXT:    v_add_f32_e32 v7, v10, v9
+; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v8
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX10-NEXT:    v_add3_u32 v10, v10, v8, 0x7fff
 ; GFX10-NEXT:    v_add_f32_e32 v2, v2, v6
-; GFX10-NEXT:    v_bfe_u32 v8, v9, 16, 1
+; GFX10-NEXT:    v_add3_u32 v10, v11, v8, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v11, v3, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v12, v7, 16, 1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
-; GFX10-NEXT:    v_and_or_b32 v12, v9, s4, 0x400000
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v10, v7, vcc_lo
-; GFX10-NEXT:    v_add3_u32 v10, v11, v3, 0x7fff
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
 ; GFX10-NEXT:    v_bfe_u32 v13, v2, 16, 1
-; GFX10-NEXT:    v_add3_u32 v8, v8, v9, 0x7fff
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v10, v9, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
+; GFX10-NEXT:    v_add3_u32 v9, v11, v3, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v11, v12, v7, 0x7fff
+; GFX10-NEXT:    v_or_b32_e32 v12, 0x400000, v7
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX10-NEXT:    v_add_f32_e32 v6, v10, v6
+; GFX10-NEXT:    v_add3_u32 v10, v13, v2, 0x7fff
 ; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT:    v_add_f32_e32 v6, v11, v6
-; GFX10-NEXT:    v_add3_u32 v9, v13, v2, 0x7fff
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v8, v12, vcc_lo
-; GFX10-NEXT:    v_and_or_b32 v11, v2, s4, 0x400000
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v11, v12, vcc_lo
+; GFX10-NEXT:    v_or_b32_e32 v11, 0x400000, v2
 ; GFX10-NEXT:    v_bfe_u32 v12, v6, 16, 1
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
 ; GFX10-NEXT:    v_add_f32_e32 v1, v1, v5
 ; GFX10-NEXT:    v_add_f32_e32 v5, v15, v13
-; GFX10-NEXT:    v_and_or_b32 v14, v3, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v14, 0x400000, v3
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v9, v11, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v10, v11, vcc_lo
 ; GFX10-NEXT:    v_add3_u32 v4, v12, v6, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v9, v6, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v10, 0x400000, v6
 ; GFX10-NEXT:    v_bfe_u32 v11, v1, 16, 1
 ; GFX10-NEXT:    v_bfe_u32 v12, v5, 16, 1
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX10-NEXT:    v_bfe_u32 v13, v0, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v15, v1, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v15, 0x400000, v1
 ; GFX10-NEXT:    v_add3_u32 v6, v11, v1, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v11, v5, s4, 0x400000
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v9, vcc_lo
-; GFX10-NEXT:    v_add3_u32 v9, v12, v5, 0x7fff
+; GFX10-NEXT:    v_or_b32_e32 v11, 0x400000, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v10, vcc_lo
+; GFX10-NEXT:    v_add3_u32 v10, v12, v5, 0x7fff
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX10-NEXT:    v_add3_u32 v12, v13, v0, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v13, v0, s4, 0x400000
-; GFX10-NEXT:    v_perm_b32 v2, v2, v8, 0x7060302
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v9, v11, vcc_lo
+; GFX10-NEXT:    v_or_b32_e32 v13, 0x400000, v0
+; GFX10-NEXT:    v_perm_b32 v2, v2, v7, 0x7060302
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v10, v11, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v12, v13, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
@@ -9953,81 +9899,80 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v6, v15, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX10-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v10, v14, vcc_lo
-; GFX10-NEXT:    v_perm_b32 v3, v3, v7, 0x7060302
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v9, v14, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v3, v3, v8, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fadd_v8bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v15, 16, v0
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX11-NEXT:    s_mov_b32 s0, 0xff800000
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_add_f32 v8, v9, v8 :: v_dual_lshlrev_b32 v9, 16, v6
-; GFX11-NEXT:    v_bfe_u32 v10, v8, 16, 1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v15, 16, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_add_f32 v8, v9, v8 :: v_dual_and_b32 v7, 0xffff0000, v7
+; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_add_f32_e32 v9, v11, v9
-; GFX11-NEXT:    v_add3_u32 v10, v10, v8, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_or_b32 v12, v9, s0, 0x400000
-; GFX11-NEXT:    v_add_f32_e32 v2, v2, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_bfe_u32 v13, v2, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v11, v8, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_add_f32_e32 v3, v3, v7
-; GFX11-NEXT:    v_and_or_b32 v7, v8, s0, 0x400000
-; GFX11-NEXT:    v_bfe_u32 v8, v9, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v7, v10, v7, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v8, v8, v9, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT:    v_add3_u32 v9, v13, v2, 0x7fff
-; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-NEXT:    v_add_f32_e32 v7, v10, v9
+; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v8
+; GFX11-NEXT:    v_add3_u32 v10, v11, v8, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_bfe_u32 v11, v3, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v8, v8, v12, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT:    v_and_or_b32 v14, v3, s0, 0x400000
-; GFX11-NEXT:    v_add_f32_e32 v0, v0, v4
-; GFX11-NEXT:    v_add3_u32 v10, v11, v3, 0x7fff
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_add_f32 v6, v11, v6 :: v_dual_and_b32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_and_or_b32 v11, v2, s0, 0x400000
+; GFX11-NEXT:    v_bfe_u32 v12, v7, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v14, 0x400000, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v8, v10, v9, vcc_lo
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-NEXT:    v_add3_u32 v9, v11, v3, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v11, v12, v7, 0x7fff
+; GFX11-NEXT:    v_or_b32_e32 v12, 0x400000, v7
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_cndmask_b32 v7, v11, v12 :: v_dual_add_f32 v2, v2, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-NEXT:    v_bfe_u32 v13, v2, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_add_f32_e32 v6, v10, v6
+; GFX11-NEXT:    v_or_b32_e32 v11, 0x400000, v2
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT:    v_add3_u32 v10, v13, v2, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_bfe_u32 v12, v6, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v9, v11, vcc_lo
-; GFX11-NEXT:    v_and_or_b32 v9, v6, s0, 0x400000
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v10, v11, vcc_lo
+; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v6
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT:    v_add_f32_e32 v1, v1, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_perm_b32 v2, v2, v7, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_f32_e32 v0, v0, v4
 ; GFX11-NEXT:    v_add3_u32 v4, v12, v6, 0x7fff
-; GFX11-NEXT:    v_perm_b32 v2, v2, v8, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v11, v1, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v4, v4, v9, vcc_lo
+; GFX11-NEXT:    v_dual_add_f32 v1, v1, v5 :: v_dual_cndmask_b32 v4, v4, v10
 ; GFX11-NEXT:    v_add_f32_e32 v5, v15, v13
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_bfe_u32 v11, v1, 16, 1
 ; GFX11-NEXT:    v_bfe_u32 v13, v0, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v15, v1, s0, 0x400000
-; GFX11-NEXT:    v_add3_u32 v6, v11, v1, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_or_b32_e32 v15, 0x400000, v1
 ; GFX11-NEXT:    v_bfe_u32 v12, v5, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v11, v5, s0, 0x400000
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_add3_u32 v6, v11, v1, 0x7fff
+; GFX11-NEXT:    v_or_b32_e32 v11, 0x400000, v5
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT:    v_add3_u32 v9, v12, v5, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v10, v12, v5, 0x7fff
 ; GFX11-NEXT:    v_add3_u32 v12, v13, v0, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v13, v0, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v13, 0x400000, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v5, v9, v11, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e32 v5, v10, v11, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v12, v13, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
@@ -10036,9 +9981,9 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v6, v15, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX11-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v10, v14, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, v9, v14, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v3, v3, v7, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v3, v3, v8, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fadd <8 x bfloat> %a, %b
   ret <8 x bfloat> %op
@@ -10263,16 +10208,14 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
 ; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
 ; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xff800000, v16
 ; GFX8-NEXT:    v_add_f32_e32 v7, v7, v15
-; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v18
+; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v16
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v16, v16
 ; GFX8-NEXT:    v_bfe_u32 v15, v7, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v16, v17, v18, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v15, vcc, v15, v7
 ; GFX8-NEXT:    v_add_u32_e32 v15, vcc, s4, v15
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xff800000, v7
-; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v17
+; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v7
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
 ; GFX8-NEXT:    v_cndmask_b32_e32 v7, v15, v17, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
@@ -10283,16 +10226,14 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
 ; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xff800000, v15
 ; GFX8-NEXT:    v_add_f32_e32 v6, v6, v14
-; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v18
+; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v15
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v15, v15
 ; GFX8-NEXT:    v_bfe_u32 v14, v6, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v15, v17, v18, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v14, vcc, v14, v6
 ; GFX8-NEXT:    v_add_u32_e32 v14, vcc, s4, v14
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xff800000, v6
-; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v17
+; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v6
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX8-NEXT:    v_cndmask_b32_e32 v6, v14, v17, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v13
@@ -10303,16 +10244,14 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
 ; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xff800000, v14
 ; GFX8-NEXT:    v_add_f32_e32 v5, v5, v13
-; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v18
+; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v14
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v14, v14
 ; GFX8-NEXT:    v_bfe_u32 v13, v5, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v14, v17, v18, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v13, vcc, v13, v5
 ; GFX8-NEXT:    v_add_u32_e32 v13, vcc, s4, v13
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xff800000, v5
-; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v17
+; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v5
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v13, v17, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
@@ -10323,16 +10262,14 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
 ; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
 ; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xff800000, v13
 ; GFX8-NEXT:    v_add_f32_e32 v4, v4, v12
-; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v18
+; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v13
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
 ; GFX8-NEXT:    v_bfe_u32 v12, v4, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v13, v17, v18, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v12, vcc, v12, v4
 ; GFX8-NEXT:    v_add_u32_e32 v12, vcc, s4, v12
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xff800000, v4
-; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v17
+; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v4
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v12, v17, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
@@ -10343,16 +10280,14 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xff800000, v12
 ; GFX8-NEXT:    v_add_f32_e32 v3, v3, v11
-; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v18
+; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v12
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
 ; GFX8-NEXT:    v_bfe_u32 v11, v3, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v12, v17, v18, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v11, vcc, v11, v3
 ; GFX8-NEXT:    v_add_u32_e32 v11, vcc, s4, v11
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xff800000, v3
-; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v17
+; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v3
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v11, v17, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
@@ -10363,16 +10298,14 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xff800000, v11
 ; GFX8-NEXT:    v_add_f32_e32 v2, v2, v10
-; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v18
+; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v11
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
 ; GFX8-NEXT:    v_bfe_u32 v10, v2, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v11, v17, v18, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v10, v2
 ; GFX8-NEXT:    v_add_u32_e32 v10, vcc, s4, v10
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xff800000, v2
-; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v17
+; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v2
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v10, v17, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
@@ -10383,16 +10316,14 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xff800000, v10
 ; GFX8-NEXT:    v_add_f32_e32 v1, v1, v9
-; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v18
+; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v10
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
 ; GFX8-NEXT:    v_bfe_u32 v9, v1, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v10, v17, v18, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v1
 ; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s4, v9
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xff800000, v1
-; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v17
+; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v1
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v9, v17, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v8
@@ -10403,16 +10334,14 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xff800000, v9
 ; GFX8-NEXT:    v_add_f32_e32 v0, v0, v8
-; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v18
+; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v9
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
 ; GFX8-NEXT:    v_bfe_u32 v8, v0, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v9, v17, v18, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v0
 ; GFX8-NEXT:    v_add_u32_e32 v8, vcc, s4, v8
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v17
+; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v8, v17, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
@@ -10439,146 +10368,130 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
 ; GFX9-NEXT:    v_add_f32_e32 v16, v17, v16
-; GFX9-NEXT:    v_bfe_u32 v17, v16, 16, 1
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_and_b32_e32 v18, 0xff800000, v16
 ; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
 ; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX9-NEXT:    v_bfe_u32 v17, v16, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_add_f32_e32 v7, v7, v15
 ; GFX9-NEXT:    v_add3_u32 v17, v17, v16, s4
-; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v18
+; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v16
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v16, v16
-; GFX9-NEXT:    v_add_f32_e32 v7, v7, v15
-; GFX9-NEXT:    v_cndmask_b32_e32 v16, v17, v18, vcc
 ; GFX9-NEXT:    v_bfe_u32 v15, v7, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xff800000, v7
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v17, v18, vcc
 ; GFX9-NEXT:    v_add3_u32 v15, v15, v7, s4
-; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v17
+; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v7
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
 ; GFX9-NEXT:    v_cndmask_b32_e32 v7, v15, v17, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v6
 ; GFX9-NEXT:    v_add_f32_e32 v15, v17, v15
-; GFX9-NEXT:    v_bfe_u32 v17, v15, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v18, 0xff800000, v15
 ; GFX9-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX9-NEXT:    v_bfe_u32 v17, v15, 16, 1
+; GFX9-NEXT:    v_add_f32_e32 v6, v6, v14
 ; GFX9-NEXT:    v_add3_u32 v17, v17, v15, s4
-; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v18
+; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v15
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v15, v15
-; GFX9-NEXT:    v_add_f32_e32 v6, v6, v14
-; GFX9-NEXT:    v_cndmask_b32_e32 v15, v17, v18, vcc
 ; GFX9-NEXT:    v_bfe_u32 v14, v6, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xff800000, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v15, v17, v18, vcc
 ; GFX9-NEXT:    v_add3_u32 v14, v14, v6, s4
-; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v17
+; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v6
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v14, v17, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v14, 16, v13
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v5
 ; GFX9-NEXT:    v_add_f32_e32 v14, v17, v14
-; GFX9-NEXT:    v_bfe_u32 v17, v14, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v18, 0xff800000, v14
 ; GFX9-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
 ; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX9-NEXT:    v_bfe_u32 v17, v14, 16, 1
+; GFX9-NEXT:    v_add_f32_e32 v5, v5, v13
 ; GFX9-NEXT:    v_add3_u32 v17, v17, v14, s4
-; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v18
+; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v14
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v14, v14
-; GFX9-NEXT:    v_add_f32_e32 v5, v5, v13
-; GFX9-NEXT:    v_cndmask_b32_e32 v14, v17, v18, vcc
 ; GFX9-NEXT:    v_bfe_u32 v13, v5, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xff800000, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v14, v17, v18, vcc
 ; GFX9-NEXT:    v_add3_u32 v13, v13, v5, s4
-; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v17
+; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v5
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v13, v17, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v4
 ; GFX9-NEXT:    v_add_f32_e32 v13, v17, v13
-; GFX9-NEXT:    v_bfe_u32 v17, v13, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v18, 0xff800000, v13
 ; GFX9-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX9-NEXT:    v_bfe_u32 v17, v13, 16, 1
+; GFX9-NEXT:    v_add_f32_e32 v4, v4, v12
 ; GFX9-NEXT:    v_add3_u32 v17, v17, v13, s4
-; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v18
+; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v13
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
-; GFX9-NEXT:    v_add_f32_e32 v4, v4, v12
-; GFX9-NEXT:    v_cndmask_b32_e32 v13, v17, v18, vcc
 ; GFX9-NEXT:    v_bfe_u32 v12, v4, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xff800000, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v13, v17, v18, vcc
 ; GFX9-NEXT:    v_add3_u32 v12, v12, v4, s4
-; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v17
+; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v4
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v12, v17, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v3
 ; GFX9-NEXT:    v_add_f32_e32 v12, v17, v12
-; GFX9-NEXT:    v_bfe_u32 v17, v12, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v18, 0xff800000, v12
 ; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT:    v_bfe_u32 v17, v12, 16, 1
+; GFX9-NEXT:    v_add_f32_e32 v3, v3, v11
 ; GFX9-NEXT:    v_add3_u32 v17, v17, v12, s4
-; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v18
+; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v12
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
-; GFX9-NEXT:    v_add_f32_e32 v3, v3, v11
-; GFX9-NEXT:    v_cndmask_b32_e32 v12, v17, v18, vcc
 ; GFX9-NEXT:    v_bfe_u32 v11, v3, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xff800000, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v12, v17, v18, vcc
 ; GFX9-NEXT:    v_add3_u32 v11, v11, v3, s4
-; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v17
+; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v3
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v11, v17, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v2
 ; GFX9-NEXT:    v_add_f32_e32 v11, v17, v11
-; GFX9-NEXT:    v_bfe_u32 v17, v11, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v18, 0xff800000, v11
 ; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT:    v_bfe_u32 v17, v11, 16, 1
+; GFX9-NEXT:    v_add_f32_e32 v2, v2, v10
 ; GFX9-NEXT:    v_add3_u32 v17, v17, v11, s4
-; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v18
+; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v11
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
-; GFX9-NEXT:    v_add_f32_e32 v2, v2, v10
-; GFX9-NEXT:    v_cndmask_b32_e32 v11, v17, v18, vcc
 ; GFX9-NEXT:    v_bfe_u32 v10, v2, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xff800000, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v11, v17, v18, vcc
 ; GFX9-NEXT:    v_add3_u32 v10, v10, v2, s4
-; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v17
+; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v2
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v10, v17, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v1
 ; GFX9-NEXT:    v_add_f32_e32 v10, v17, v10
-; GFX9-NEXT:    v_bfe_u32 v17, v10, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v18, 0xff800000, v10
 ; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_bfe_u32 v17, v10, 16, 1
+; GFX9-NEXT:    v_add_f32_e32 v1, v1, v9
 ; GFX9-NEXT:    v_add3_u32 v17, v17, v10, s4
-; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v18
+; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v10
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
-; GFX9-NEXT:    v_add_f32_e32 v1, v1, v9
-; GFX9-NEXT:    v_cndmask_b32_e32 v10, v17, v18, vcc
 ; GFX9-NEXT:    v_bfe_u32 v9, v1, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xff800000, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v10, v17, v18, vcc
 ; GFX9-NEXT:    v_add3_u32 v9, v9, v1, s4
-; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v17
+; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v1
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v9, v17, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v8
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
 ; GFX9-NEXT:    v_add_f32_e32 v9, v17, v9
-; GFX9-NEXT:    v_bfe_u32 v17, v9, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v18, 0xff800000, v9
 ; GFX9-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_bfe_u32 v17, v9, 16, 1
+; GFX9-NEXT:    v_add_f32_e32 v0, v0, v8
 ; GFX9-NEXT:    v_add3_u32 v17, v17, v9, s4
-; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v18
+; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v9
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
-; GFX9-NEXT:    v_add_f32_e32 v0, v0, v8
-; GFX9-NEXT:    v_cndmask_b32_e32 v9, v17, v18, vcc
 ; GFX9-NEXT:    v_bfe_u32 v8, v0, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xff800000, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v9, v17, v18, vcc
 ; GFX9-NEXT:    v_add3_u32 v8, v8, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v17
+; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v8, v17, vcc
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
@@ -10599,27 +10512,26 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
 ; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
 ; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v6
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
 ; GFX10-NEXT:    v_add_f32_e32 v16, v17, v16
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v14
 ; GFX10-NEXT:    v_add_f32_e32 v7, v7, v15
 ; GFX10-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
-; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
 ; GFX10-NEXT:    v_bfe_u32 v15, v16, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v20, v16, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v20, 0x400000, v16
 ; GFX10-NEXT:    v_bfe_u32 v19, v7, 16, 1
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
 ; GFX10-NEXT:    v_add_f32_e32 v17, v18, v17
 ; GFX10-NEXT:    v_add3_u32 v15, v15, v16, 0x7fff
 ; GFX10-NEXT:    v_add_f32_e32 v6, v6, v14
 ; GFX10-NEXT:    v_add3_u32 v18, v19, v7, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v19, v7, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v19, 0x400000, v7
 ; GFX10-NEXT:    v_bfe_u32 v21, v17, 16, 1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v15, v15, v20, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v5
-; GFX10-NEXT:    v_and_or_b32 v16, v17, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v16, 0x400000, v17
 ; GFX10-NEXT:    v_add3_u32 v14, v21, v17, 0x7fff
 ; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; GFX10-NEXT:    v_cndmask_b32_e32 v7, v18, v19, vcc_lo
@@ -10633,7 +10545,7 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX10-NEXT:    v_add_f32_e32 v5, v5, v13
 ; GFX10-NEXT:    v_cndmask_b32_e32 v14, v14, v16, vcc_lo
 ; GFX10-NEXT:    v_add3_u32 v16, v18, v6, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v13, v6, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v13, 0x400000, v6
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v12
 ; GFX10-NEXT:    v_bfe_u32 v20, v17, 16, 1
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
@@ -10643,10 +10555,10 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX10-NEXT:    v_cndmask_b32_e32 v6, v16, v13, vcc_lo
 ; GFX10-NEXT:    v_add_f32_e32 v13, v19, v18
 ; GFX10-NEXT:    v_add3_u32 v16, v20, v17, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v18, v17, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v18, 0x400000, v17
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
 ; GFX10-NEXT:    v_add3_u32 v19, v21, v5, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v20, v5, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v20, 0x400000, v5
 ; GFX10-NEXT:    v_bfe_u32 v21, v13, 16, 1
 ; GFX10-NEXT:    v_add_f32_e32 v4, v4, v12
 ; GFX10-NEXT:    v_cndmask_b32_e32 v16, v16, v18, vcc_lo
@@ -10656,14 +10568,14 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX10-NEXT:    v_add3_u32 v17, v21, v13, 0x7fff
 ; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
 ; GFX10-NEXT:    v_cndmask_b32_e32 v5, v19, v20, vcc_lo
-; GFX10-NEXT:    v_and_or_b32 v19, v13, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v19, 0x400000, v13
 ; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX10-NEXT:    v_add_f32_e32 v12, v18, v12
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
 ; GFX10-NEXT:    v_bfe_u32 v20, v4, 16, 1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v10
 ; GFX10-NEXT:    v_add_f32_e32 v3, v3, v11
-; GFX10-NEXT:    v_and_or_b32 v22, v12, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v22, 0x400000, v12
 ; GFX10-NEXT:    v_cndmask_b32_e32 v13, v17, v19, vcc_lo
 ; GFX10-NEXT:    v_bfe_u32 v17, v12, 16, 1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v2
@@ -10675,12 +10587,12 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
 ; GFX10-NEXT:    v_add3_u32 v19, v20, v3, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v20, v3, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v20, 0x400000, v3
 ; GFX10-NEXT:    v_bfe_u32 v23, v18, 16, 1
 ; GFX10-NEXT:    v_add_f32_e32 v2, v2, v10
 ; GFX10-NEXT:    v_cndmask_b32_e32 v12, v17, v22, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX10-NEXT:    v_and_or_b32 v17, v18, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v17, 0x400000, v18
 ; GFX10-NEXT:    v_add3_u32 v10, v23, v18, 0x7fff
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v1
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
@@ -10689,8 +10601,8 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v9
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
 ; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX10-NEXT:    v_and_or_b32 v18, v2, s4, 0x400000
-; GFX10-NEXT:    v_and_or_b32 v21, v4, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v18, 0x400000, v2
+; GFX10-NEXT:    v_or_b32_e32 v21, 0x400000, v4
 ; GFX10-NEXT:    v_perm_b32 v3, v3, v12, 0x7060302
 ; GFX10-NEXT:    v_cndmask_b32_e32 v10, v10, v17, vcc_lo
 ; GFX10-NEXT:    v_add3_u32 v17, v19, v2, 0x7fff
@@ -10702,17 +10614,17 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX10-NEXT:    v_bfe_u32 v23, v19, 16, 1
 ; GFX10-NEXT:    v_add_f32_e32 v1, v1, v9
 ; GFX10-NEXT:    v_add_f32_e32 v9, v22, v20
-; GFX10-NEXT:    v_and_or_b32 v22, v19, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v22, 0x400000, v19
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v8
 ; GFX10-NEXT:    v_add3_u32 v20, v23, v19, 0x7fff
 ; GFX10-NEXT:    v_bfe_u32 v8, v1, 16, 1
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
 ; GFX10-NEXT:    v_bfe_u32 v23, v9, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v24, v9, s4, 0x400000
-; GFX10-NEXT:    v_and_or_b32 v25, v0, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v24, 0x400000, v9
+; GFX10-NEXT:    v_or_b32_e32 v25, 0x400000, v0
 ; GFX10-NEXT:    v_add3_u32 v8, v8, v1, 0x7fff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v19, v20, v22, vcc_lo
-; GFX10-NEXT:    v_and_or_b32 v22, v1, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v22, 0x400000, v1
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
 ; GFX10-NEXT:    v_bfe_u32 v20, v0, 16, 1
 ; GFX10-NEXT:    v_add3_u32 v23, v23, v9, 0x7fff
@@ -10741,12 +10653,11 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
 ; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX11-NEXT:    s_mov_b32 s0, 0xff800000
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_add_f32 v16, v17, v16 :: v_dual_lshlrev_b32 v17, 16, v14
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_add_f32 v16, v17, v16 :: v_dual_and_b32 v15, 0xffff0000, v15
+; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v14
 ; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
-; GFX11-NEXT:    v_and_or_b32 v20, v16, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v16
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_add_f32_e32 v17, v18, v17
 ; GFX11-NEXT:    v_add_f32_e32 v6, v6, v14
@@ -10759,13 +10670,13 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX11-NEXT:    v_add_f32_e32 v7, v7, v15
 ; GFX11-NEXT:    v_bfe_u32 v15, v16, 16, 1
 ; GFX11-NEXT:    v_add3_u32 v15, v15, v16, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v16, v17, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v16, 0x400000, v17
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_dual_cndmask_b32 v15, v15, v20 :: v_dual_lshlrev_b32 v20, 16, v5
 ; GFX11-NEXT:    v_bfe_u32 v19, v7, 16, 1
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
 ; GFX11-NEXT:    v_add3_u32 v18, v19, v7, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v19, v7, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v19, 0x400000, v7
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v7, v18, v19, vcc_lo
 ; GFX11-NEXT:    v_bfe_u32 v18, v6, 16, 1
@@ -10787,32 +10698,32 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
 ; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; GFX11-NEXT:    v_add_f32_e32 v5, v5, v13
-; GFX11-NEXT:    v_and_or_b32 v13, v6, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v13, 0x400000, v6
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_dual_cndmask_b32 v6, v16, v13 :: v_dual_add_f32 v13, v19, v18
 ; GFX11-NEXT:    v_add3_u32 v16, v20, v17, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v18, v17, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v18, 0x400000, v17
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
 ; GFX11-NEXT:    v_perm_b32 v6, v6, v14, 0x7060302
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v16, v16, v18, vcc_lo
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v3
 ; GFX11-NEXT:    v_bfe_u32 v21, v5, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v20, v5, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v5
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX11-NEXT:    v_add_f32_e32 v12, v18, v12
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_add3_u32 v19, v21, v5, 0x7fff
 ; GFX11-NEXT:    v_bfe_u32 v21, v13, 16, 1
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v10
-; GFX11-NEXT:    v_and_or_b32 v22, v12, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v22, 0x400000, v12
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v5, v19, v20, vcc_lo
 ; GFX11-NEXT:    v_add3_u32 v17, v21, v13, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v19, v13, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v19, 0x400000, v13
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
 ; GFX11-NEXT:    v_bfe_u32 v20, v4, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v21, v4, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v21, 0x400000, v4
 ; GFX11-NEXT:    v_perm_b32 v5, v5, v16, 0x7060302
 ; GFX11-NEXT:    v_cndmask_b32_e32 v13, v17, v19, vcc_lo
 ; GFX11-NEXT:    v_bfe_u32 v17, v12, 16, 1
@@ -10828,7 +10739,7 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
 ; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX11-NEXT:    v_bfe_u32 v23, v18, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v17, v18, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v17, 0x400000, v18
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_dual_add_f32 v2, v2, v10 :: v_dual_and_b32 v1, 0xffff0000, v1
 ; GFX11-NEXT:    v_add_f32_e32 v3, v3, v11
@@ -10838,13 +10749,13 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX11-NEXT:    v_bfe_u32 v20, v3, 16, 1
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX11-NEXT:    v_add3_u32 v19, v20, v3, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v20, v3, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v3, v19, v20, vcc_lo
 ; GFX11-NEXT:    v_bfe_u32 v19, v2, 16, 1
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v9
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT:    v_and_or_b32 v18, v2, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v18, 0x400000, v2
 ; GFX11-NEXT:    v_perm_b32 v3, v3, v12, 0x7060302
 ; GFX11-NEXT:    v_cndmask_b32_e32 v10, v10, v17, vcc_lo
 ; GFX11-NEXT:    v_add3_u32 v17, v19, v2, 0x7fff
@@ -10861,13 +10772,13 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX11-NEXT:    v_add_f32_e32 v9, v22, v20
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_add3_u32 v20, v23, v19, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v22, v19, s0, 0x400000
-; GFX11-NEXT:    v_and_or_b32 v25, v0, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v22, 0x400000, v19
+; GFX11-NEXT:    v_or_b32_e32 v25, 0x400000, v0
 ; GFX11-NEXT:    v_bfe_u32 v8, v1, 16, 1
 ; GFX11-NEXT:    v_bfe_u32 v23, v9, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v24, v9, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v24, 0x400000, v9
 ; GFX11-NEXT:    v_cndmask_b32_e32 v19, v20, v22, vcc_lo
-; GFX11-NEXT:    v_and_or_b32 v22, v1, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v22, 0x400000, v1
 ; GFX11-NEXT:    v_add3_u32 v8, v8, v1, 0x7fff
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
 ; GFX11-NEXT:    v_bfe_u32 v20, v0, 16, 1
@@ -11434,16 +11345,14 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
 ; GFX8-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; GFX8-NEXT:    v_add_u32_e32 v32, vcc, s4, v32
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xff800000, v31
 ; GFX8-NEXT:    v_add_f32_e32 v14, v14, v30
-; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v31
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v31, v31
 ; GFX8-NEXT:    v_bfe_u32 v30, v14, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v31, v32, v33, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v30, vcc, v30, v14
 ; GFX8-NEXT:    v_add_u32_e32 v30, vcc, s4, v30
-; GFX8-NEXT:    v_and_b32_e32 v32, 0xff800000, v14
-; GFX8-NEXT:    v_or_b32_e32 v32, 0x400000, v32
+; GFX8-NEXT:    v_or_b32_e32 v32, 0x400000, v14
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v14, v14
 ; GFX8-NEXT:    v_cndmask_b32_e32 v14, v30, v32, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v30, 16, v29
@@ -11465,29 +11374,25 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-NEXT:    v_add_f32_e32 v30, v15, v30
 ; GFX8-NEXT:    v_bfe_u32 v15, v33, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v15, vcc, v15, v33
-; GFX8-NEXT:    v_and_b32_e32 v34, 0xff800000, v33
 ; GFX8-NEXT:    v_add_u32_e32 v15, vcc, s4, v15
-; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v33
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v33, v33
 ; GFX8-NEXT:    v_bfe_u32 v33, v30, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v15, v15, v34, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v33, vcc, v33, v30
-; GFX8-NEXT:    v_and_b32_e32 v34, 0xff800000, v30
 ; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v30
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v30, v30
 ; GFX8-NEXT:    v_cndmask_b32_e32 v30, v33, v34, vcc
 ; GFX8-NEXT:    v_bfe_u32 v33, v32, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v33, vcc, v33, v32
 ; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT:    v_and_b32_e32 v34, 0xff800000, v32
-; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v32
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
 ; GFX8-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v29, vcc, v29, v13
 ; GFX8-NEXT:    v_add_u32_e32 v29, vcc, s4, v29
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xff800000, v13
-; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v13
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
 ; GFX8-NEXT:    v_cndmask_b32_e32 v13, v29, v33, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v29, 16, v28
@@ -11498,16 +11403,14 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
 ; GFX8-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
 ; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT:    v_and_b32_e32 v34, 0xff800000, v29
 ; GFX8-NEXT:    v_add_f32_e32 v12, v12, v28
-; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v29
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v29, v29
 ; GFX8-NEXT:    v_bfe_u32 v28, v12, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v29, v33, v34, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v28, vcc, v28, v12
 ; GFX8-NEXT:    v_add_u32_e32 v28, vcc, s4, v28
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xff800000, v12
-; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v12
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
 ; GFX8-NEXT:    v_cndmask_b32_e32 v12, v28, v33, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
@@ -11518,16 +11421,14 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
 ; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
 ; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT:    v_and_b32_e32 v34, 0xff800000, v28
 ; GFX8-NEXT:    v_add_f32_e32 v11, v11, v27
-; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v28
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v28, v28
 ; GFX8-NEXT:    v_bfe_u32 v27, v11, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v28, v33, v34, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v27, vcc, v27, v11
 ; GFX8-NEXT:    v_add_u32_e32 v27, vcc, s4, v27
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xff800000, v11
-; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v11
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
 ; GFX8-NEXT:    v_cndmask_b32_e32 v11, v27, v33, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v27, 16, v26
@@ -11538,16 +11439,14 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
 ; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
 ; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT:    v_and_b32_e32 v34, 0xff800000, v27
 ; GFX8-NEXT:    v_add_f32_e32 v10, v10, v26
-; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v27
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v27, v27
 ; GFX8-NEXT:    v_bfe_u32 v26, v10, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v27, v33, v34, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v26, vcc, v26, v10
 ; GFX8-NEXT:    v_add_u32_e32 v26, vcc, s4, v26
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xff800000, v10
-; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v10
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
 ; GFX8-NEXT:    v_cndmask_b32_e32 v10, v26, v33, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v26, 16, v25
@@ -11558,16 +11457,14 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
 ; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
 ; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT:    v_and_b32_e32 v34, 0xff800000, v26
 ; GFX8-NEXT:    v_add_f32_e32 v9, v9, v25
-; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v26
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v26, v26
 ; GFX8-NEXT:    v_bfe_u32 v25, v9, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v26, v33, v34, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v25, vcc, v25, v9
 ; GFX8-NEXT:    v_add_u32_e32 v25, vcc, s4, v25
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xff800000, v9
-; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v9
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
 ; GFX8-NEXT:    v_cndmask_b32_e32 v9, v25, v33, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
@@ -11578,16 +11475,14 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
 ; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
 ; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT:    v_and_b32_e32 v34, 0xff800000, v25
 ; GFX8-NEXT:    v_add_f32_e32 v8, v8, v24
-; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v25
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v25, v25
 ; GFX8-NEXT:    v_bfe_u32 v24, v8, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v25, v33, v34, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v24, vcc, v24, v8
 ; GFX8-NEXT:    v_add_u32_e32 v24, vcc, s4, v24
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xff800000, v8
-; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v8
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
 ; GFX8-NEXT:    v_cndmask_b32_e32 v8, v24, v33, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
@@ -11598,16 +11493,14 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
 ; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
 ; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT:    v_and_b32_e32 v34, 0xff800000, v24
 ; GFX8-NEXT:    v_add_f32_e32 v7, v7, v23
-; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v24
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v24, v24
 ; GFX8-NEXT:    v_bfe_u32 v23, v7, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v24, v33, v34, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v23, vcc, v23, v7
 ; GFX8-NEXT:    v_add_u32_e32 v23, vcc, s4, v23
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xff800000, v7
-; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v7
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
 ; GFX8-NEXT:    v_cndmask_b32_e32 v7, v23, v33, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
@@ -11618,16 +11511,14 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
 ; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
 ; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT:    v_and_b32_e32 v34, 0xff800000, v23
 ; GFX8-NEXT:    v_add_f32_e32 v6, v6, v22
-; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v23
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v23, v23
 ; GFX8-NEXT:    v_bfe_u32 v22, v6, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v23, v33, v34, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v22, vcc, v22, v6
 ; GFX8-NEXT:    v_add_u32_e32 v22, vcc, s4, v22
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xff800000, v6
-; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v6
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX8-NEXT:    v_cndmask_b32_e32 v6, v22, v33, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
@@ -11638,16 +11529,14 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
 ; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT:    v_and_b32_e32 v34, 0xff800000, v22
 ; GFX8-NEXT:    v_add_f32_e32 v5, v5, v21
-; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v22
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v22, v22
 ; GFX8-NEXT:    v_bfe_u32 v21, v5, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v22, v33, v34, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v21, vcc, v21, v5
 ; GFX8-NEXT:    v_add_u32_e32 v21, vcc, s4, v21
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xff800000, v5
-; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v5
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v21, v33, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
@@ -11658,16 +11547,14 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
 ; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
 ; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT:    v_and_b32_e32 v34, 0xff800000, v21
 ; GFX8-NEXT:    v_add_f32_e32 v4, v4, v20
-; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v21
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v21, v21
 ; GFX8-NEXT:    v_bfe_u32 v20, v4, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v21, v33, v34, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v20, vcc, v20, v4
 ; GFX8-NEXT:    v_add_u32_e32 v20, vcc, s4, v20
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xff800000, v4
-; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v4
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v20, v33, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
@@ -11678,16 +11565,14 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT:    v_and_b32_e32 v34, 0xff800000, v20
 ; GFX8-NEXT:    v_add_f32_e32 v3, v3, v19
-; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v20
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v20, v20
 ; GFX8-NEXT:    v_bfe_u32 v19, v3, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v20, v33, v34, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v19, vcc, v19, v3
 ; GFX8-NEXT:    v_add_u32_e32 v19, vcc, s4, v19
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xff800000, v3
-; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v3
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v19, v33, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
@@ -11698,16 +11583,14 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT:    v_and_b32_e32 v34, 0xff800000, v19
 ; GFX8-NEXT:    v_add_f32_e32 v2, v2, v18
-; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v19
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v19, v19
 ; GFX8-NEXT:    v_bfe_u32 v18, v2, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v19, v33, v34, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v18, vcc, v18, v2
 ; GFX8-NEXT:    v_add_u32_e32 v18, vcc, s4, v18
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xff800000, v2
-; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v2
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v18, v33, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
@@ -11718,16 +11601,14 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT:    v_and_b32_e32 v34, 0xff800000, v18
 ; GFX8-NEXT:    v_add_f32_e32 v1, v1, v17
-; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v18
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v18, v18
 ; GFX8-NEXT:    v_bfe_u32 v17, v1, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v18, v33, v34, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v1
 ; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xff800000, v1
-; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v1
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v17, v33, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v16
@@ -11738,16 +11619,14 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT:    v_and_b32_e32 v34, 0xff800000, v17
 ; GFX8-NEXT:    v_add_f32_e32 v0, v0, v16
-; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v17
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
 ; GFX8-NEXT:    v_bfe_u32 v16, v0, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v17, v33, v34, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v16, vcc, v16, v0
 ; GFX8-NEXT:    v_add_u32_e32 v16, vcc, s4, v16
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v16, v33, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -11788,292 +11667,260 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v31, 16, v30
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v14
 ; GFX9-NEXT:    v_add_f32_e32 v31, v32, v31
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_bfe_u32 v32, v31, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xff800000, v31
 ; GFX9-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
 ; GFX9-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_bfe_u32 v32, v31, 16, 1
+; GFX9-NEXT:    v_add_f32_e32 v14, v14, v30
 ; GFX9-NEXT:    v_add3_u32 v32, v32, v31, s4
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v31
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v31, v31
-; GFX9-NEXT:    v_add_f32_e32 v14, v14, v30
-; GFX9-NEXT:    v_cndmask_b32_e32 v31, v32, v33, vcc
 ; GFX9-NEXT:    v_bfe_u32 v30, v14, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v32, 0xff800000, v14
+; GFX9-NEXT:    v_cndmask_b32_e32 v31, v32, v33, vcc
 ; GFX9-NEXT:    v_add3_u32 v30, v30, v14, s4
-; GFX9-NEXT:    v_or_b32_e32 v32, 0x400000, v32
+; GFX9-NEXT:    v_or_b32_e32 v32, 0x400000, v14
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v14, v14
 ; GFX9-NEXT:    v_cndmask_b32_e32 v14, v30, v32, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v30, 16, v29
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v13
+; GFX9-NEXT:    v_add_f32_e32 v30, v32, v30
 ; GFX9-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
 ; GFX9-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; GFX9-NEXT:    v_add_f32_e32 v13, v13, v29
-; GFX9-NEXT:    buffer_load_dword v29, off, s[0:3], s32
-; GFX9-NEXT:    v_add_f32_e32 v30, v32, v30
 ; GFX9-NEXT:    v_bfe_u32 v32, v30, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xff800000, v30
+; GFX9-NEXT:    v_add_f32_e32 v13, v13, v29
 ; GFX9-NEXT:    v_add3_u32 v32, v32, v30, s4
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v30
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v30, v30
+; GFX9-NEXT:    v_bfe_u32 v29, v13, 16, 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v30, v32, v33, vcc
-; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v15
+; GFX9-NEXT:    v_add3_u32 v29, v29, v13, s4
+; GFX9-NEXT:    v_or_b32_e32 v32, 0x400000, v13
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
+; GFX9-NEXT:    v_cndmask_b32_e32 v13, v29, v32, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v29, 16, v28
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v12
+; GFX9-NEXT:    v_add_f32_e32 v32, v32, v29
+; GFX9-NEXT:    buffer_load_dword v29, off, s[0:3], s32
+; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v15
 ; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX9-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
+; GFX9-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX9-NEXT:    v_add_f32_e32 v12, v12, v28
+; GFX9-NEXT:    v_bfe_u32 v28, v12, 16, 1
+; GFX9-NEXT:    v_add3_u32 v28, v28, v12, s4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v29
-; GFX9-NEXT:    v_add_f32_e32 v32, v32, v33
+; GFX9-NEXT:    v_lshlrev_b32_e32 v34, 16, v29
+; GFX9-NEXT:    v_add_f32_e32 v33, v33, v34
 ; GFX9-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
 ; GFX9-NEXT:    v_add_f32_e32 v29, v15, v29
-; GFX9-NEXT:    v_bfe_u32 v15, v32, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xff800000, v32
-; GFX9-NEXT:    v_add3_u32 v15, v15, v32, s4
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v33
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
-; GFX9-NEXT:    v_cndmask_b32_e32 v15, v15, v33, vcc
-; GFX9-NEXT:    v_bfe_u32 v32, v29, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xff800000, v29
-; GFX9-NEXT:    v_add3_u32 v32, v32, v29, s4
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT:    v_bfe_u32 v15, v33, 16, 1
+; GFX9-NEXT:    v_add3_u32 v15, v15, v33, s4
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v33
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v33, v33
+; GFX9-NEXT:    v_bfe_u32 v33, v29, 16, 1
+; GFX9-NEXT:    v_cndmask_b32_e32 v15, v15, v34, vcc
+; GFX9-NEXT:    v_add3_u32 v33, v33, v29, s4
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v29
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v29, v29
-; GFX9-NEXT:    v_cndmask_b32_e32 v29, v32, v33, vcc
-; GFX9-NEXT:    v_bfe_u32 v32, v13, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xff800000, v13
-; GFX9-NEXT:    v_add3_u32 v32, v32, v13, s4
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v33
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
-; GFX9-NEXT:    v_cndmask_b32_e32 v13, v32, v33, vcc
-; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v28
-; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v12
-; GFX9-NEXT:    v_add_f32_e32 v32, v33, v32
+; GFX9-NEXT:    v_cndmask_b32_e32 v29, v33, v34, vcc
 ; GFX9-NEXT:    v_bfe_u32 v33, v32, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v34, 0xff800000, v32
-; GFX9-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
-; GFX9-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
 ; GFX9-NEXT:    v_add3_u32 v33, v33, v32, s4
-; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v32
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
-; GFX9-NEXT:    v_add_f32_e32 v12, v12, v28
 ; GFX9-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
-; GFX9-NEXT:    v_bfe_u32 v28, v12, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xff800000, v12
-; GFX9-NEXT:    v_add3_u32 v28, v28, v12, s4
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v12
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
 ; GFX9-NEXT:    v_cndmask_b32_e32 v12, v28, v33, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v11
 ; GFX9-NEXT:    v_add_f32_e32 v28, v33, v28
-; GFX9-NEXT:    v_bfe_u32 v33, v28, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v34, 0xff800000, v28
 ; GFX9-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
 ; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX9-NEXT:    v_bfe_u32 v33, v28, 16, 1
+; GFX9-NEXT:    v_add_f32_e32 v11, v11, v27
 ; GFX9-NEXT:    v_add3_u32 v33, v33, v28, s4
-; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v28
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v28, v28
-; GFX9-NEXT:    v_add_f32_e32 v11, v11, v27
-; GFX9-NEXT:    v_cndmask_b32_e32 v28, v33, v34, vcc
 ; GFX9-NEXT:    v_bfe_u32 v27, v11, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xff800000, v11
+; GFX9-NEXT:    v_cndmask_b32_e32 v28, v33, v34, vcc
 ; GFX9-NEXT:    v_add3_u32 v27, v27, v11, s4
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v11
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
 ; GFX9-NEXT:    v_cndmask_b32_e32 v11, v27, v33, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v27, 16, v26
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v10
 ; GFX9-NEXT:    v_add_f32_e32 v27, v33, v27
-; GFX9-NEXT:    v_bfe_u32 v33, v27, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v34, 0xff800000, v27
 ; GFX9-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
 ; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX9-NEXT:    v_bfe_u32 v33, v27, 16, 1
+; GFX9-NEXT:    v_add_f32_e32 v10, v10, v26
 ; GFX9-NEXT:    v_add3_u32 v33, v33, v27, s4
-; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v27
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v27, v27
-; GFX9-NEXT:    v_add_f32_e32 v10, v10, v26
-; GFX9-NEXT:    v_cndmask_b32_e32 v27, v33, v34, vcc
 ; GFX9-NEXT:    v_bfe_u32 v26, v10, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xff800000, v10
+; GFX9-NEXT:    v_cndmask_b32_e32 v27, v33, v34, vcc
 ; GFX9-NEXT:    v_add3_u32 v26, v26, v10, s4
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v10
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
 ; GFX9-NEXT:    v_cndmask_b32_e32 v10, v26, v33, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v26, 16, v25
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v9
 ; GFX9-NEXT:    v_add_f32_e32 v26, v33, v26
-; GFX9-NEXT:    v_bfe_u32 v33, v26, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v34, 0xff800000, v26
 ; GFX9-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
 ; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX9-NEXT:    v_bfe_u32 v33, v26, 16, 1
+; GFX9-NEXT:    v_add_f32_e32 v9, v9, v25
 ; GFX9-NEXT:    v_add3_u32 v33, v33, v26, s4
-; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v26
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v26, v26
-; GFX9-NEXT:    v_add_f32_e32 v9, v9, v25
-; GFX9-NEXT:    v_cndmask_b32_e32 v26, v33, v34, vcc
 ; GFX9-NEXT:    v_bfe_u32 v25, v9, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xff800000, v9
+; GFX9-NEXT:    v_cndmask_b32_e32 v26, v33, v34, vcc
 ; GFX9-NEXT:    v_add3_u32 v25, v25, v9, s4
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v9
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
 ; GFX9-NEXT:    v_cndmask_b32_e32 v9, v25, v33, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v8
 ; GFX9-NEXT:    v_add_f32_e32 v25, v33, v25
-; GFX9-NEXT:    v_bfe_u32 v33, v25, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v34, 0xff800000, v25
 ; GFX9-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
 ; GFX9-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX9-NEXT:    v_bfe_u32 v33, v25, 16, 1
+; GFX9-NEXT:    v_add_f32_e32 v8, v8, v24
 ; GFX9-NEXT:    v_add3_u32 v33, v33, v25, s4
-; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v25
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v25, v25
-; GFX9-NEXT:    v_add_f32_e32 v8, v8, v24
-; GFX9-NEXT:    v_cndmask_b32_e32 v25, v33, v34, vcc
 ; GFX9-NEXT:    v_bfe_u32 v24, v8, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xff800000, v8
+; GFX9-NEXT:    v_cndmask_b32_e32 v25, v33, v34, vcc
 ; GFX9-NEXT:    v_add3_u32 v24, v24, v8, s4
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v8
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
 ; GFX9-NEXT:    v_cndmask_b32_e32 v8, v24, v33, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v7
 ; GFX9-NEXT:    v_add_f32_e32 v24, v33, v24
-; GFX9-NEXT:    v_bfe_u32 v33, v24, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v34, 0xff800000, v24
 ; GFX9-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
 ; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX9-NEXT:    v_bfe_u32 v33, v24, 16, 1
+; GFX9-NEXT:    v_add_f32_e32 v7, v7, v23
 ; GFX9-NEXT:    v_add3_u32 v33, v33, v24, s4
-; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v24
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v24, v24
-; GFX9-NEXT:    v_add_f32_e32 v7, v7, v23
-; GFX9-NEXT:    v_cndmask_b32_e32 v24, v33, v34, vcc
 ; GFX9-NEXT:    v_bfe_u32 v23, v7, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xff800000, v7
+; GFX9-NEXT:    v_cndmask_b32_e32 v24, v33, v34, vcc
 ; GFX9-NEXT:    v_add3_u32 v23, v23, v7, s4
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v7
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
 ; GFX9-NEXT:    v_cndmask_b32_e32 v7, v23, v33, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v6
 ; GFX9-NEXT:    v_add_f32_e32 v23, v33, v23
-; GFX9-NEXT:    v_bfe_u32 v33, v23, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v34, 0xff800000, v23
 ; GFX9-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
 ; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX9-NEXT:    v_bfe_u32 v33, v23, 16, 1
+; GFX9-NEXT:    v_add_f32_e32 v6, v6, v22
 ; GFX9-NEXT:    v_add3_u32 v33, v33, v23, s4
-; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v23
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v23, v23
-; GFX9-NEXT:    v_add_f32_e32 v6, v6, v22
-; GFX9-NEXT:    v_cndmask_b32_e32 v23, v33, v34, vcc
 ; GFX9-NEXT:    v_bfe_u32 v22, v6, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xff800000, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v23, v33, v34, vcc
 ; GFX9-NEXT:    v_add3_u32 v22, v22, v6, s4
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v6
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v22, v33, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v5
 ; GFX9-NEXT:    v_add_f32_e32 v22, v33, v22
-; GFX9-NEXT:    v_bfe_u32 v33, v22, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v34, 0xff800000, v22
 ; GFX9-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
 ; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX9-NEXT:    v_bfe_u32 v33, v22, 16, 1
+; GFX9-NEXT:    v_add_f32_e32 v5, v5, v21
 ; GFX9-NEXT:    v_add3_u32 v33, v33, v22, s4
-; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v22
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v22, v22
-; GFX9-NEXT:    v_add_f32_e32 v5, v5, v21
-; GFX9-NEXT:    v_cndmask_b32_e32 v22, v33, v34, vcc
 ; GFX9-NEXT:    v_bfe_u32 v21, v5, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xff800000, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v22, v33, v34, vcc
 ; GFX9-NEXT:    v_add3_u32 v21, v21, v5, s4
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v5
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v21, v33, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v4
 ; GFX9-NEXT:    v_add_f32_e32 v21, v33, v21
-; GFX9-NEXT:    v_bfe_u32 v33, v21, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v34, 0xff800000, v21
 ; GFX9-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX9-NEXT:    v_bfe_u32 v33, v21, 16, 1
+; GFX9-NEXT:    v_add_f32_e32 v4, v4, v20
 ; GFX9-NEXT:    v_add3_u32 v33, v33, v21, s4
-; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v21
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v21, v21
-; GFX9-NEXT:    v_add_f32_e32 v4, v4, v20
-; GFX9-NEXT:    v_cndmask_b32_e32 v21, v33, v34, vcc
 ; GFX9-NEXT:    v_bfe_u32 v20, v4, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xff800000, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v21, v33, v34, vcc
 ; GFX9-NEXT:    v_add3_u32 v20, v20, v4, s4
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v4
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v20, v33, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v3
 ; GFX9-NEXT:    v_add_f32_e32 v20, v33, v20
-; GFX9-NEXT:    v_bfe_u32 v33, v20, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v34, 0xff800000, v20
 ; GFX9-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT:    v_bfe_u32 v33, v20, 16, 1
+; GFX9-NEXT:    v_add_f32_e32 v3, v3, v19
 ; GFX9-NEXT:    v_add3_u32 v33, v33, v20, s4
-; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v20
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v20, v20
-; GFX9-NEXT:    v_add_f32_e32 v3, v3, v19
-; GFX9-NEXT:    v_cndmask_b32_e32 v20, v33, v34, vcc
 ; GFX9-NEXT:    v_bfe_u32 v19, v3, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xff800000, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v20, v33, v34, vcc
 ; GFX9-NEXT:    v_add3_u32 v19, v19, v3, s4
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v3
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v19, v33, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v2
 ; GFX9-NEXT:    v_add_f32_e32 v19, v33, v19
-; GFX9-NEXT:    v_bfe_u32 v33, v19, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v34, 0xff800000, v19
 ; GFX9-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT:    v_bfe_u32 v33, v19, 16, 1
+; GFX9-NEXT:    v_add_f32_e32 v2, v2, v18
 ; GFX9-NEXT:    v_add3_u32 v33, v33, v19, s4
-; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v19
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v19, v19
-; GFX9-NEXT:    v_add_f32_e32 v2, v2, v18
-; GFX9-NEXT:    v_cndmask_b32_e32 v19, v33, v34, vcc
 ; GFX9-NEXT:    v_bfe_u32 v18, v2, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xff800000, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v19, v33, v34, vcc
 ; GFX9-NEXT:    v_add3_u32 v18, v18, v2, s4
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v2
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v18, v33, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v1
 ; GFX9-NEXT:    v_add_f32_e32 v18, v33, v18
-; GFX9-NEXT:    v_bfe_u32 v33, v18, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v34, 0xff800000, v18
 ; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_bfe_u32 v33, v18, 16, 1
+; GFX9-NEXT:    v_add_f32_e32 v1, v1, v17
 ; GFX9-NEXT:    v_add3_u32 v33, v33, v18, s4
-; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v18
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v18, v18
-; GFX9-NEXT:    v_add_f32_e32 v1, v1, v17
-; GFX9-NEXT:    v_cndmask_b32_e32 v18, v33, v34, vcc
 ; GFX9-NEXT:    v_bfe_u32 v17, v1, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xff800000, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v18, v33, v34, vcc
 ; GFX9-NEXT:    v_add3_u32 v17, v17, v1, s4
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v1
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v17, v33, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v16
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v0
 ; GFX9-NEXT:    v_add_f32_e32 v17, v33, v17
-; GFX9-NEXT:    v_bfe_u32 v33, v17, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v34, 0xff800000, v17
 ; GFX9-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_bfe_u32 v33, v17, 16, 1
+; GFX9-NEXT:    v_add_f32_e32 v0, v0, v16
 ; GFX9-NEXT:    v_add3_u32 v33, v33, v17, s4
-; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v17
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
-; GFX9-NEXT:    v_add_f32_e32 v0, v0, v16
-; GFX9-NEXT:    v_cndmask_b32_e32 v17, v33, v34, vcc
 ; GFX9-NEXT:    v_bfe_u32 v16, v0, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xff800000, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v17, v33, v34, vcc
 ; GFX9-NEXT:    v_add3_u32 v16, v16, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v16, v33, vcc
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
@@ -12098,7 +11945,7 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX10-LABEL: v_fadd_v32bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX10-NEXT:    buffer_load_dword v32, off, s[0:3], s32
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v39, 16, v27
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v48, 16, v11
 ; GFX10-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
@@ -12163,7 +12010,6 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX10-NEXT:    v_bfe_u32 v55, v11, 16, 1
 ; GFX10-NEXT:    v_bfe_u32 v65, v49, 16, 1
 ; GFX10-NEXT:    v_bfe_u32 v67, v10, 16, 1
-; GFX10-NEXT:    s_mov_b32 s23, 0xff800000
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v33, 16, v30
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v34, 16, v14
 ; GFX10-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
@@ -12179,10 +12025,10 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX10-NEXT:    v_add_f32_e32 v1, v1, v17
 ; GFX10-NEXT:    v_add_f32_e32 v17, v26, v50
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v16
-; GFX10-NEXT:    v_and_or_b32 v54, v39, s23, 0x400000
-; GFX10-NEXT:    v_and_or_b32 v64, v11, s23, 0x400000
-; GFX10-NEXT:    v_and_or_b32 v66, v49, s23, 0x400000
-; GFX10-NEXT:    v_and_or_b32 v68, v10, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v54, 0x400000, v39
+; GFX10-NEXT:    v_or_b32_e32 v64, 0x400000, v11
+; GFX10-NEXT:    v_or_b32_e32 v66, 0x400000, v49
+; GFX10-NEXT:    v_or_b32_e32 v68, 0x400000, v10
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s9, v39, v39
 ; GFX10-NEXT:    v_add3_u32 v39, v53, v39, 0x7fff
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s10, v11, v11
@@ -12220,28 +12066,28 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX10-NEXT:    v_bfe_u32 v27, v14, 16, 1
 ; GFX10-NEXT:    v_bfe_u32 v29, v35, 16, 1
 ; GFX10-NEXT:    v_bfe_u32 v34, v13, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v48, v37, s23, 0x400000
-; GFX10-NEXT:    v_and_or_b32 v52, v12, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v48, 0x400000, v37
+; GFX10-NEXT:    v_or_b32_e32 v52, 0x400000, v12
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s7, v37, v37
 ; GFX10-NEXT:    v_add3_u32 v37, v38, v37, 0x7fff
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s8, v12, v12
 ; GFX10-NEXT:    v_add3_u32 v12, v50, v12, 0x7fff
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s10, v18, v18
 ; GFX10-NEXT:    v_add3_u32 v54, v54, v18, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v18, v18, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v18, 0x400000, v18
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s11, v1, v1
 ; GFX10-NEXT:    v_add3_u32 v64, v64, v1, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v1, v1, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v1, 0x400000, v1
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s12, v17, v17
 ; GFX10-NEXT:    v_add3_u32 v66, v66, v17, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v17, v17, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v17, 0x400000, v17
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s22, v0, v0
 ; GFX10-NEXT:    v_add3_u32 v68, v68, v0, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v0, v0, s23, 0x400000
-; GFX10-NEXT:    v_and_or_b32 v26, v33, s23, 0x400000
-; GFX10-NEXT:    v_and_or_b32 v28, v14, s23, 0x400000
-; GFX10-NEXT:    v_and_or_b32 v30, v35, s23, 0x400000
-; GFX10-NEXT:    v_and_or_b32 v36, v13, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v0, 0x400000, v0
+; GFX10-NEXT:    v_or_b32_e32 v26, 0x400000, v33
+; GFX10-NEXT:    v_or_b32_e32 v28, 0x400000, v14
+; GFX10-NEXT:    v_or_b32_e32 v30, 0x400000, v35
+; GFX10-NEXT:    v_or_b32_e32 v36, 0x400000, v13
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
 ; GFX10-NEXT:    v_add3_u32 v16, v16, v33, 0x7fff
 ; GFX10-NEXT:    v_bfe_u32 v33, v51, 16, 1
@@ -12260,12 +12106,12 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX10-NEXT:    v_cndmask_b32_e64 v17, v66, v17, s12
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v68, v0, s22
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v64, v1, s11
-; GFX10-NEXT:    v_lshlrev_b32_e32 v32, 16, v15
+; GFX10-NEXT:    v_lshlrev_b32_e32 v31, 16, v15
 ; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX10-NEXT:    v_and_or_b32 v27, v51, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v27, 0x400000, v51
 ; GFX10-NEXT:    v_bfe_u32 v35, v9, 16, 1
 ; GFX10-NEXT:    v_bfe_u32 v38, v25, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v67, v24, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v67, 0x400000, v24
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s13, v51, v51
 ; GFX10-NEXT:    v_add3_u32 v33, v33, v51, 0x7fff
 ; GFX10-NEXT:    v_bfe_u32 v51, v7, 16, 1
@@ -12282,51 +12128,51 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX10-NEXT:    v_bfe_u32 v36, v3, 16, 1
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s8, v19, v19
 ; GFX10-NEXT:    v_add3_u32 v48, v48, v19, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v19, v19, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v19, 0x400000, v19
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s9, v2, v2
 ; GFX10-NEXT:    v_add3_u32 v52, v52, v2, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v2, v2, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v2
 ; GFX10-NEXT:    v_perm_b32 v0, v0, v17, 0x7060302
 ; GFX10-NEXT:    v_perm_b32 v1, v1, v18, 0x7060302
-; GFX10-NEXT:    v_and_or_b32 v34, v9, s23, 0x400000
-; GFX10-NEXT:    v_and_or_b32 v50, v25, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v34, 0x400000, v9
+; GFX10-NEXT:    v_or_b32_e32 v50, 0x400000, v25
 ; GFX10-NEXT:    v_bfe_u32 v53, v8, 16, 1
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s14, v9, v9
 ; GFX10-NEXT:    v_add3_u32 v9, v35, v9, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v35, v7, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v35, 0x400000, v7
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s15, v25, v25
 ; GFX10-NEXT:    v_add3_u32 v25, v38, v25, 0x7fff
 ; GFX10-NEXT:    v_bfe_u32 v38, v23, 16, 1
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s18, v7, v7
 ; GFX10-NEXT:    v_add3_u32 v7, v51, v7, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v51, v6, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v51, 0x400000, v6
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s20, v6, v6
 ; GFX10-NEXT:    v_add3_u32 v6, v65, v6, 0x7fff
 ; GFX10-NEXT:    v_bfe_u32 v65, v5, 16, 1
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v21, v21
 ; GFX10-NEXT:    v_add3_u32 v26, v26, v21, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v21, v21, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v21, 0x400000, v21
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s5, v4, v4
 ; GFX10-NEXT:    v_add3_u32 v28, v28, v4, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v4, v4, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v4, 0x400000, v4
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s6, v20, v20
 ; GFX10-NEXT:    v_add3_u32 v30, v30, v20, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v20, v20, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v20, 0x400000, v20
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s7, v3, v3
 ; GFX10-NEXT:    v_add3_u32 v36, v36, v3, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v3, v3, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v3, 0x400000, v3
 ; GFX10-NEXT:    v_cndmask_b32_e64 v19, v48, v19, s8
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v52, v2, s9
-; GFX10-NEXT:    v_and_or_b32 v55, v8, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v55, 0x400000, v8
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s16, v8, v8
 ; GFX10-NEXT:    v_add3_u32 v8, v53, v8, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v53, v23, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v53, 0x400000, v23
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s19, v23, v23
 ; GFX10-NEXT:    v_add3_u32 v23, v38, v23, 0x7fff
 ; GFX10-NEXT:    v_bfe_u32 v38, v22, 16, 1
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX10-NEXT:    v_add3_u32 v65, v65, v5, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v5, v5, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v21, v26, v21, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v4, v28, v4, s5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v20, v30, v20, s6
@@ -12334,7 +12180,7 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX10-NEXT:    v_perm_b32 v2, v2, v19, 0x7060302
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s21, v22, v22
 ; GFX10-NEXT:    v_add3_u32 v38, v38, v22, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v22, v22, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v22, 0x400000, v22
 ; GFX10-NEXT:    v_cndmask_b32_e32 v5, v65, v5, vcc_lo
 ; GFX10-NEXT:    v_perm_b32 v3, v3, v20, 0x7060302
 ; GFX10-NEXT:    v_perm_b32 v4, v4, v21, 0x7060302
@@ -12358,14 +12204,14 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX10-NEXT:    v_perm_b32 v13, v13, v29, 0x7060302
 ; GFX10-NEXT:    v_perm_b32 v14, v14, v16, 0x7060302
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v31
-; GFX10-NEXT:    v_and_b32_e32 v18, 0xffff0000, v31
-; GFX10-NEXT:    v_add_f32_e32 v17, v32, v17
+; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v32
+; GFX10-NEXT:    v_and_b32_e32 v18, 0xffff0000, v32
+; GFX10-NEXT:    v_add_f32_e32 v17, v31, v17
 ; GFX10-NEXT:    v_add_f32_e32 v15, v15, v18
 ; GFX10-NEXT:    v_bfe_u32 v18, v17, 16, 1
 ; GFX10-NEXT:    v_bfe_u32 v19, v15, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v20, v17, s23, 0x400000
-; GFX10-NEXT:    v_and_or_b32 v21, v15, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v20, 0x400000, v17
+; GFX10-NEXT:    v_or_b32_e32 v21, 0x400000, v15
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v15, v15
 ; GFX10-NEXT:    v_add3_u32 v17, v18, v17, 0x7fff
@@ -12378,212 +12224,219 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX11-LABEL: v_fadd_v32bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    scratch_load_b32 v31, off, s32
+; GFX11-NEXT:    scratch_load_b32 v32, off, s32
+; GFX11-NEXT:    v_lshlrev_b32_e32 v67, 16, v21
+; GFX11-NEXT:    v_lshlrev_b32_e32 v68, 16, v5
+; GFX11-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v83, 16, v17
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v84, 16, v1
 ; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v49, 16, v26
+; GFX11-NEXT:    v_dual_add_f32 v5, v5, v21 :: v_dual_and_b32 v26, 0xffff0000, v26
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v53, 16, v24
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
-; GFX11-NEXT:    s_mov_b32 s0, 0xff800000
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_add_f32 v1, v1, v17 :: v_dual_lshlrev_b32 v64, 16, v7
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_add_f32 v1, v1, v17 :: v_dual_and_b32 v24, 0xffff0000, v24
+; GFX11-NEXT:    v_lshlrev_b32_e32 v71, 16, v19
+; GFX11-NEXT:    v_bfe_u32 v103, v5, 16, 1
+; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v81, 16, v18
-; GFX11-NEXT:    v_lshlrev_b32_e32 v85, 16, v16
-; GFX11-NEXT:    v_lshlrev_b32_e32 v86, 16, v0
 ; GFX11-NEXT:    v_bfe_u32 v135, v1, 16, 1
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v55, 16, v23
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
-; GFX11-NEXT:    v_and_or_b32 v144, v1, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v112, 0x400000, v5
+; GFX11-NEXT:    v_or_b32_e32 v144, 0x400000, v1
+; GFX11-NEXT:    v_add3_u32 v103, v103, v5, 0x7fff
+; GFX11-NEXT:    v_lshlrev_b32_e32 v80, 16, v3
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX11-NEXT:    v_add3_u32 v135, v135, v1, 0x7fff
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v82, 16, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v54, 16, v8
-; GFX11-NEXT:    v_dual_add_f32 v17, v86, v85 :: v_dual_and_b32 v8, 0xffff0000, v8
-; GFX11-NEXT:    v_dual_add_f32 v7, v7, v23 :: v_dual_lshlrev_b32 v36, 16, v13
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_add_f32 v8, v8, v24 :: v_dual_lshlrev_b32 v39, 16, v27
-; GFX11-NEXT:    v_dual_add_f32 v0, v0, v16 :: v_dual_lshlrev_b32 v49, 16, v26
-; GFX11-NEXT:    v_add_f32_e32 v24, v64, v55
-; GFX11-NEXT:    v_bfe_u32 v87, v7, 16, 1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v52, 16, v9
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_add_f32 v3, v3, v19 :: v_dual_lshlrev_b32 v54, 16, v8
+; GFX11-NEXT:    v_lshlrev_b32_e32 v85, 16, v16
+; GFX11-NEXT:    v_dual_add_f32 v19, v82, v81 :: v_dual_lshlrev_b32 v64, 16, v7
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v65, 16, v22
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v66, 16, v6
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
-; GFX11-NEXT:    v_bfe_u32 v85, v24, 16, 1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v67, 16, v21
-; GFX11-NEXT:    v_lshlrev_b32_e32 v68, 16, v5
-; GFX11-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_bfe_u32 v129, v19, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v130, 0x400000, v19
+; GFX11-NEXT:    v_lshlrev_b32_e32 v48, 16, v11
+; GFX11-NEXT:    v_bfe_u32 v119, v3, 16, 1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v51, 16, v25
+; GFX11-NEXT:    v_add3_u32 v129, v129, v19, 0x7fff
+; GFX11-NEXT:    v_lshlrev_b32_e32 v86, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_add_f32 v17, v86, v85 :: v_dual_and_b32 v2, 0xffff0000, v2
+; GFX11-NEXT:    v_dual_add_f32 v8, v8, v24 :: v_dual_lshlrev_b32 v39, 16, v27
+; GFX11-NEXT:    v_or_b32_e32 v128, 0x400000, v3
+; GFX11-NEXT:    v_add3_u32 v119, v119, v3, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_bfe_u32 v145, v17, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v146, 0x400000, v17
+; GFX11-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GFX11-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v70, 16, v4
-; GFX11-NEXT:    v_and_or_b32 v86, v24, s0, 0x400000
-; GFX11-NEXT:    v_and_or_b32 v96, v7, s0, 0x400000
+; GFX11-NEXT:    v_add3_u32 v145, v145, v17, 0x7fff
+; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GFX11-NEXT:    v_lshlrev_b32_e32 v55, 16, v23
+; GFX11-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; GFX11-NEXT:    v_lshlrev_b32_e32 v50, 16, v10
+; GFX11-NEXT:    v_add_f32_e32 v2, v2, v18
+; GFX11-NEXT:    v_add_f32_e32 v0, v0, v16
+; GFX11-NEXT:    v_dual_add_f32 v24, v64, v55 :: v_dual_lshlrev_b32 v37, 16, v28
+; GFX11-NEXT:    v_add_f32_e32 v7, v7, v23
+; GFX11-NEXT:    v_dual_add_f32 v23, v66, v65 :: v_dual_add_f32 v18, v84, v83
+; GFX11-NEXT:    v_dual_add_f32 v9, v9, v25 :: v_dual_and_b32 v28, 0xffff0000, v28
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_bfe_u32 v85, v24, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v97, v23, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v86, 0x400000, v24
+; GFX11-NEXT:    v_or_b32_e32 v98, 0x400000, v23
+; GFX11-NEXT:    v_bfe_u32 v87, v7, 16, 1
 ; GFX11-NEXT:    v_add3_u32 v85, v85, v24, 0x7fff
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v69, 16, v20
-; GFX11-NEXT:    v_add3_u32 v87, v87, v7, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-NEXT:    v_dual_add_f32 v23, v66, v65 :: v_dual_lshlrev_b32 v48, 16, v11
-; GFX11-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
-; GFX11-NEXT:    v_dual_add_f32 v5, v5, v21 :: v_dual_lshlrev_b32 v50, 16, v10
-; GFX11-NEXT:    v_dual_add_f32 v21, v70, v69 :: v_dual_and_b32 v26, 0xffff0000, v26
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT:    v_add_f32_e32 v6, v6, v22
-; GFX11-NEXT:    v_lshlrev_b32_e32 v52, 16, v9
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX11-NEXT:    v_bfe_u32 v97, v23, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v2, v2, v18
-; GFX11-NEXT:    v_add_f32_e32 v18, v84, v83
-; GFX11-NEXT:    v_bfe_u32 v83, v8, 16, 1
-; GFX11-NEXT:    v_bfe_u32 v99, v6, 16, 1
-; GFX11-NEXT:    v_bfe_u32 v103, v5, 16, 1
-; GFX11-NEXT:    v_bfe_u32 v113, v21, 16, 1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v71, 16, v19
-; GFX11-NEXT:    v_and_or_b32 v84, v8, s0, 0x400000
-; GFX11-NEXT:    v_and_or_b32 v98, v23, s0, 0x400000
-; GFX11-NEXT:    v_and_or_b32 v100, v6, s0, 0x400000
-; GFX11-NEXT:    v_and_or_b32 v112, v5, s0, 0x400000
-; GFX11-NEXT:    v_and_or_b32 v114, v21, s0, 0x400000
-; GFX11-NEXT:    v_add3_u32 v83, v83, v8, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
-; GFX11-NEXT:    v_add3_u32 v97, v97, v23, 0x7fff
 ; GFX11-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
-; GFX11-NEXT:    v_add3_u32 v99, v99, v6, 0x7fff
-; GFX11-NEXT:    v_add3_u32 v103, v103, v5, 0x7fff
-; GFX11-NEXT:    v_lshlrev_b32_e32 v80, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT:    v_add3_u32 v113, v113, v21, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v97, v97, v23, 0x7fff
 ; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v38, 16, v12
 ; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
-; GFX11-NEXT:    v_dual_add_f32 v3, v3, v19 :: v_dual_and_b32 v10, 0xffff0000, v10
-; GFX11-NEXT:    v_dual_add_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v51, 16, v25
-; GFX11-NEXT:    v_lshlrev_b32_e32 v37, 16, v28
-; GFX11-NEXT:    v_dual_add_f32 v4, v4, v20 :: v_dual_and_b32 v25, 0xffff0000, v25
+; GFX11-NEXT:    v_or_b32_e32 v96, 0x400000, v7
+; GFX11-NEXT:    v_add3_u32 v87, v87, v7, 0x7fff
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-NEXT:    v_add_f32_e32 v4, v4, v20
 ; GFX11-NEXT:    v_add_f32_e32 v20, v80, v71
-; GFX11-NEXT:    v_dual_add_f32 v19, v82, v81 :: v_dual_and_b32 v28, 0xffff0000, v28
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_add_f32 v9, v9, v25 :: v_dual_and_b32 v12, 0xffff0000, v12
-; GFX11-NEXT:    v_add_f32_e32 v25, v54, v53
+; GFX11-NEXT:    v_bfe_u32 v71, v9, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v80, 0x400000, v9
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v35, 16, v29
-; GFX11-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
-; GFX11-NEXT:    v_dual_add_f32 v10, v10, v26 :: v_dual_and_b32 v13, 0xffff0000, v13
-; GFX11-NEXT:    v_dual_add_f32 v12, v12, v28 :: v_dual_lshlrev_b32 v33, 16, v30
-; GFX11-NEXT:    v_add_f32_e32 v28, v48, v39
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_add_f32 v13, v13, v29 :: v_dual_lshlrev_b32 v34, 16, v14
-; GFX11-NEXT:    v_dual_add_f32 v11, v11, v27 :: v_dual_and_b32 v14, 0xffff0000, v14
-; GFX11-NEXT:    v_dual_add_f32 v27, v50, v49 :: v_dual_add_f32 v26, v52, v51
-; GFX11-NEXT:    v_dual_add_f32 v29, v38, v37 :: v_dual_and_b32 v30, 0xffff0000, v30
-; GFX11-NEXT:    v_lshlrev_b32_e32 v32, 16, v15
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX11-NEXT:    v_bfe_u32 v39, v13, 16, 1
+; GFX11-NEXT:    v_dual_add_f32 v21, v70, v69 :: v_dual_and_b32 v10, 0xffff0000, v10
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_add3_u32 v71, v71, v9, 0x7fff
+; GFX11-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; GFX11-NEXT:    v_dual_add_f32 v10, v10, v26 :: v_dual_and_b32 v29, 0xffff0000, v29
+; GFX11-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
+; GFX11-NEXT:    v_add_f32_e32 v26, v52, v51
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_add_f32_e32 v6, v6, v22
+; GFX11-NEXT:    v_lshlrev_b32_e32 v36, 16, v13
+; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX11-NEXT:    v_dual_add_f32 v11, v11, v27 :: v_dual_lshlrev_b32 v34, 16, v14
+; GFX11-NEXT:    v_dual_add_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v33, 16, v30
+; GFX11-NEXT:    v_dual_add_f32 v27, v50, v49 :: v_dual_lshlrev_b32 v38, 16, v12
+; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX11-NEXT:    v_dual_add_f32 v25, v54, v53 :: v_dual_and_b32 v12, 0xffff0000, v12
+; GFX11-NEXT:    v_dual_add_f32 v13, v13, v29 :: v_dual_and_b32 v30, 0xffff0000, v30
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_add_f32_e32 v29, v38, v37
+; GFX11-NEXT:    v_lshlrev_b32_e32 v31, 16, v15
+; GFX11-NEXT:    v_dual_add_f32 v12, v12, v28 :: v_dual_and_b32 v15, 0xffff0000, v15
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_add_f32_e32 v14, v14, v30
+; GFX11-NEXT:    v_add_f32_e32 v28, v48, v39
 ; GFX11-NEXT:    v_dual_add_f32 v30, v36, v35 :: v_dual_add_f32 v33, v34, v33
-; GFX11-NEXT:    v_and_or_b32 v48, v13, s0, 0x400000
-; GFX11-NEXT:    v_bfe_u32 v49, v29, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v39, v13, 16, 1
 ; GFX11-NEXT:    v_bfe_u32 v35, v14, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v36, v14, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v14
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_bfe_u32 v37, v30, 16, 1
 ; GFX11-NEXT:    v_bfe_u32 v16, v33, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v34, v33, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v34, 0x400000, v33
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX11-NEXT:    v_bfe_u32 v37, v30, 16, 1
 ; GFX11-NEXT:    v_add3_u32 v35, v35, v14, 0x7fff
+; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v30
 ; GFX11-NEXT:    v_add3_u32 v16, v16, v33, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v38, v30, s0, 0x400000
-; GFX11-NEXT:    v_add3_u32 v39, v39, v13, 0x7fff
 ; GFX11-NEXT:    v_add3_u32 v37, v37, v30, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v50, v29, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v48, 0x400000, v13
+; GFX11-NEXT:    v_bfe_u32 v49, v29, 16, 1
+; GFX11-NEXT:    v_add3_u32 v39, v39, v13, 0x7fff
 ; GFX11-NEXT:    v_cndmask_b32_e32 v16, v16, v34, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-NEXT:    v_or_b32_e32 v50, 0x400000, v29
 ; GFX11-NEXT:    v_bfe_u32 v51, v12, 16, 1
 ; GFX11-NEXT:    v_add3_u32 v49, v49, v29, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v52, v12, s0, 0x400000
-; GFX11-NEXT:    v_bfe_u32 v53, v28, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v52, 0x400000, v12
 ; GFX11-NEXT:    v_cndmask_b32_e32 v14, v35, v36, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX11-NEXT:    v_bfe_u32 v53, v28, 16, 1
 ; GFX11-NEXT:    v_add3_u32 v51, v51, v12, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v54, v28, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v54, 0x400000, v28
 ; GFX11-NEXT:    v_bfe_u32 v55, v11, 16, 1
-; GFX11-NEXT:    v_add3_u32 v53, v53, v28, 0x7fff
 ; GFX11-NEXT:    v_cndmask_b32_e32 v30, v37, v38, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-NEXT:    v_and_or_b32 v64, v11, s0, 0x400000
+; GFX11-NEXT:    v_add3_u32 v53, v53, v28, 0x7fff
+; GFX11-NEXT:    v_or_b32_e32 v64, 0x400000, v11
 ; GFX11-NEXT:    v_bfe_u32 v65, v27, 16, 1
 ; GFX11-NEXT:    v_add3_u32 v55, v55, v11, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v66, v27, s0, 0x400000
 ; GFX11-NEXT:    v_cndmask_b32_e32 v13, v39, v48, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX11-NEXT:    v_or_b32_e32 v66, 0x400000, v27
 ; GFX11-NEXT:    v_bfe_u32 v67, v10, 16, 1
 ; GFX11-NEXT:    v_add3_u32 v65, v65, v27, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v68, v10, s0, 0x400000
-; GFX11-NEXT:    v_bfe_u32 v69, v26, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v68, 0x400000, v10
 ; GFX11-NEXT:    v_cndmask_b32_e32 v29, v49, v50, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-NEXT:    v_bfe_u32 v69, v26, 16, 1
 ; GFX11-NEXT:    v_add3_u32 v67, v67, v10, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v70, v26, s0, 0x400000
-; GFX11-NEXT:    v_bfe_u32 v71, v9, 16, 1
-; GFX11-NEXT:    v_add3_u32 v69, v69, v26, 0x7fff
+; GFX11-NEXT:    v_or_b32_e32 v70, 0x400000, v26
+; GFX11-NEXT:    v_bfe_u32 v81, v25, 16, 1
 ; GFX11-NEXT:    v_cndmask_b32_e32 v12, v51, v52, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
-; GFX11-NEXT:    v_and_or_b32 v80, v9, s0, 0x400000
-; GFX11-NEXT:    v_bfe_u32 v81, v25, 16, 1
-; GFX11-NEXT:    v_add3_u32 v71, v71, v9, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v82, v25, s0, 0x400000
+; GFX11-NEXT:    v_add3_u32 v69, v69, v26, 0x7fff
+; GFX11-NEXT:    v_or_b32_e32 v82, 0x400000, v25
+; GFX11-NEXT:    v_bfe_u32 v83, v8, 16, 1
+; GFX11-NEXT:    v_add3_u32 v81, v81, v25, 0x7fff
 ; GFX11-NEXT:    v_cndmask_b32_e32 v28, v53, v54, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-NEXT:    v_add3_u32 v81, v81, v25, 0x7fff
-; GFX11-NEXT:    v_bfe_u32 v101, v22, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v102, v22, s0, 0x400000
-; GFX11-NEXT:    v_bfe_u32 v115, v4, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v84, 0x400000, v8
+; GFX11-NEXT:    v_add3_u32 v83, v83, v8, 0x7fff
+; GFX11-NEXT:    v_bfe_u32 v99, v6, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v100, 0x400000, v6
 ; GFX11-NEXT:    v_cndmask_b32_e32 v11, v55, v64, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v27, v27
-; GFX11-NEXT:    v_add3_u32 v101, v101, v22, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v116, v4, s0, 0x400000
-; GFX11-NEXT:    v_bfe_u32 v117, v20, 16, 1
-; GFX11-NEXT:    v_add3_u32 v115, v115, v4, 0x7fff
+; GFX11-NEXT:    v_bfe_u32 v101, v22, 16, 1
+; GFX11-NEXT:    v_add3_u32 v99, v99, v6, 0x7fff
+; GFX11-NEXT:    v_or_b32_e32 v102, 0x400000, v22
+; GFX11-NEXT:    v_bfe_u32 v113, v21, 16, 1
 ; GFX11-NEXT:    v_cndmask_b32_e32 v27, v65, v66, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT:    v_and_or_b32 v118, v20, s0, 0x400000
-; GFX11-NEXT:    v_bfe_u32 v129, v19, 16, 1
-; GFX11-NEXT:    v_add3_u32 v117, v117, v20, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v130, v19, s0, 0x400000
+; GFX11-NEXT:    v_add3_u32 v101, v101, v22, 0x7fff
+; GFX11-NEXT:    v_or_b32_e32 v114, 0x400000, v21
+; GFX11-NEXT:    v_bfe_u32 v115, v4, 16, 1
+; GFX11-NEXT:    v_add3_u32 v113, v113, v21, 0x7fff
 ; GFX11-NEXT:    v_cndmask_b32_e32 v10, v67, v68, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX11-NEXT:    v_bfe_u32 v133, v18, 16, 1
-; GFX11-NEXT:    v_add3_u32 v129, v129, v19, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v134, v18, s0, 0x400000
-; GFX11-NEXT:    v_bfe_u32 v145, v17, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v116, 0x400000, v4
+; GFX11-NEXT:    v_bfe_u32 v117, v20, 16, 1
+; GFX11-NEXT:    v_add3_u32 v115, v115, v4, 0x7fff
+; GFX11-NEXT:    v_or_b32_e32 v118, 0x400000, v20
 ; GFX11-NEXT:    v_cndmask_b32_e32 v26, v69, v70, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT:    v_add3_u32 v133, v133, v18, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v146, v17, s0, 0x400000
+; GFX11-NEXT:    v_add3_u32 v117, v117, v20, 0x7fff
+; GFX11-NEXT:    v_bfe_u32 v133, v18, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v134, 0x400000, v18
 ; GFX11-NEXT:    v_bfe_u32 v147, v0, 16, 1
-; GFX11-NEXT:    v_add3_u32 v145, v145, v17, 0x7fff
 ; GFX11-NEXT:    v_cndmask_b32_e32 v9, v71, v80, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX11-NEXT:    v_bfe_u32 v131, v2, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v33, v0, s0, 0x400000
+; GFX11-NEXT:    v_add3_u32 v133, v133, v18, 0x7fff
+; GFX11-NEXT:    v_or_b32_e32 v33, 0x400000, v0
 ; GFX11-NEXT:    v_add3_u32 v147, v147, v0, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v132, v2, s0, 0x400000
+; GFX11-NEXT:    v_bfe_u32 v131, v2, 16, 1
 ; GFX11-NEXT:    v_cndmask_b32_e32 v25, v81, v82, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT:    v_add3_u32 v131, v131, v2, 0x7fff
-; GFX11-NEXT:    v_bfe_u32 v119, v3, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v128, v3, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v132, 0x400000, v2
 ; GFX11-NEXT:    v_perm_b32 v9, v9, v26, 0x7060302
+; GFX11-NEXT:    v_add3_u32 v131, v131, v2, 0x7fff
+; GFX11-NEXT:    v_perm_b32 v10, v10, v27, 0x7060302
 ; GFX11-NEXT:    v_cndmask_b32_e32 v8, v83, v84, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX11-NEXT:    v_add3_u32 v119, v119, v3, 0x7fff
-; GFX11-NEXT:    v_perm_b32 v10, v10, v27, 0x7060302
 ; GFX11-NEXT:    v_perm_b32 v11, v11, v28, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v12, v12, v29, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v13, v13, v30, 0x7060302
 ; GFX11-NEXT:    v_perm_b32 v8, v8, v25, 0x7060302
 ; GFX11-NEXT:    v_cndmask_b32_e32 v24, v85, v86, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT:    v_perm_b32 v12, v12, v29, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v13, v13, v30, 0x7060302
 ; GFX11-NEXT:    v_perm_b32 v14, v14, v16, 0x7060302
 ; GFX11-NEXT:    v_cndmask_b32_e32 v7, v87, v96, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
@@ -12622,22 +12475,21 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX11-NEXT:    v_perm_b32 v0, v0, v17, 0x7060302
 ; GFX11-NEXT:    v_cndmask_b32_e32 v2, v131, v132, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v2, v2, v19, 0x7060302
 ; GFX11-NEXT:    v_cndmask_b32_e32 v3, v119, v128, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_perm_b32 v3, v3, v20, 0x7060302
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v31
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xffff0000, v31
-; GFX11-NEXT:    v_perm_b32 v2, v2, v19, 0x7060302
-; GFX11-NEXT:    v_add_f32_e32 v17, v32, v17
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v32
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_add_f32 v17, v31, v17 :: v_dual_and_b32 v18, 0xffff0000, v32
 ; GFX11-NEXT:    v_add_f32_e32 v15, v15, v18
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_bfe_u32 v18, v17, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_bfe_u32 v19, v15, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v20, v17, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v17
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT:    v_and_or_b32 v21, v15, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v21, 0x400000, v15
 ; GFX11-NEXT:    v_add3_u32 v18, v18, v17, 0x7fff
 ; GFX11-NEXT:    v_add3_u32 v19, v19, v15, 0x7fff
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
@@ -12678,8 +12530,7 @@ define bfloat @v_fadd_bf16_fpimm_0(bfloat %arg0) {
 ; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -12692,9 +12543,8 @@ define bfloat @v_fadd_bf16_fpimm_0(bfloat %arg0) {
 ; GFX9-NEXT:    v_add_f32_e32 v0, 1.0, v0
 ; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xff800000, v0
 ; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -12704,10 +12554,9 @@ define bfloat @v_fadd_bf16_fpimm_0(bfloat %arg0) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
 ; GFX10-NEXT:    v_add_f32_e32 v0, 1.0, v0
 ; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v2, v0, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
@@ -12718,11 +12567,10 @@ define bfloat @v_fadd_bf16_fpimm_0(bfloat %arg0) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:    s_mov_b32 s0, 0xff800000
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_add_f32_e32 v0, 1.0, v0
 ; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v2, v0, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
@@ -12761,8 +12609,7 @@ define bfloat @v_fadd_bf16_fpimm_1(bfloat %arg0) {
 ; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -12775,9 +12622,8 @@ define bfloat @v_fadd_bf16_fpimm_1(bfloat %arg0) {
 ; GFX9-NEXT:    v_add_f32_e32 v0, 0x42280000, v0
 ; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xff800000, v0
 ; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -12787,10 +12633,9 @@ define bfloat @v_fadd_bf16_fpimm_1(bfloat %arg0) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
 ; GFX10-NEXT:    v_add_f32_e32 v0, 0x42280000, v0
 ; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v2, v0, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
@@ -12801,11 +12646,10 @@ define bfloat @v_fadd_bf16_fpimm_1(bfloat %arg0) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:    s_mov_b32 s0, 0xff800000
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_add_f32_e32 v0, 0x42280000, v0
 ; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v2, v0, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
@@ -12849,8 +12693,7 @@ define bfloat @v_fsub_bf16(bfloat %a, bfloat %b) {
 ; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -12864,9 +12707,8 @@ define bfloat @v_fsub_bf16(bfloat %a, bfloat %b) {
 ; GFX9-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xff800000, v0
 ; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -12877,10 +12719,9 @@ define bfloat @v_fsub_bf16(bfloat %a, bfloat %b) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
 ; GFX10-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v2, v0, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
@@ -12892,11 +12733,10 @@ define bfloat @v_fsub_bf16(bfloat %a, bfloat %b) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:    s_mov_b32 s0, 0xff800000
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v2, v0, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
@@ -12954,16 +12794,14 @@ define <2 x bfloat> @v_fsub_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xff800000, v2
 ; GFX8-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v2
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v3
+; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -12976,20 +12814,18 @@ define <2 x bfloat> @v_fsub_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX9-NEXT:    v_sub_f32_e32 v2, v3, v2
-; GFX9-NEXT:    v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xff800000, v2
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_add3_u32 v3, v3, v2, s4
-; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v4
+; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v2
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
 ; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xff800000, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
 ; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v3
+; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
@@ -13003,14 +12839,13 @@ define <2 x bfloat> @v_fsub_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
 ; GFX10-NEXT:    v_sub_f32_e32 v2, v3, v2
 ; GFX10-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_bfe_u32 v1, v2, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v4, v2, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v4, 0x400000, v2
 ; GFX10-NEXT:    v_bfe_u32 v3, v0, 16, 1
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX10-NEXT:    v_and_or_b32 v5, v0, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v0
 ; GFX10-NEXT:    v_add3_u32 v1, v1, v2, 0x7fff
 ; GFX10-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
@@ -13026,16 +12861,15 @@ define <2 x bfloat> @v_fsub_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    s_mov_b32 s0, 0xff800000
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_sub_f32_e32 v2, v3, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_bfe_u32 v3, v0, 16, 1
 ; GFX11-NEXT:    v_bfe_u32 v1, v2, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v4, v2, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v4, 0x400000, v2
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT:    v_and_or_b32 v5, v0, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v0
 ; GFX11-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
 ; GFX11-NEXT:    v_add3_u32 v1, v1, v2, 0x7fff
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
@@ -13105,8 +12939,7 @@ define <3 x bfloat> @v_fsub_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
 ; GFX8-NEXT:    v_bfe_u32 v3, v1, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v1
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xff800000, v1
-; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v1
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
@@ -13118,16 +12951,14 @@ define <3 x bfloat> @v_fsub_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, s4, v4
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xff800000, v3
 ; GFX8-NEXT:    v_sub_f32_e32 v0, v0, v2
-; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v3
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -13143,27 +12974,24 @@ define <3 x bfloat> @v_fsub_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
 ; GFX9-NEXT:    v_sub_f32_e32 v1, v1, v3
 ; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xff800000, v1
 ; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v4
+; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v1
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
 ; GFX9-NEXT:    v_sub_f32_e32 v3, v4, v3
-; GFX9-NEXT:    v_bfe_u32 v4, v3, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xff800000, v3
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_bfe_u32 v4, v3, 16, 1
+; GFX9-NEXT:    v_sub_f32_e32 v0, v0, v2
 ; GFX9-NEXT:    v_add3_u32 v4, v4, v3, s4
-; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v3
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT:    v_sub_f32_e32 v0, v0, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
 ; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xff800000, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
 ; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v4
+; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
@@ -13181,18 +13009,17 @@ define <3 x bfloat> @v_fsub_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_sub_f32_e32 v4, v5, v4
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
 ; GFX10-NEXT:    v_sub_f32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_sub_f32_e32 v1, v1, v3
 ; GFX10-NEXT:    v_bfe_u32 v2, v4, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v7, v4, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v7, 0x400000, v4
 ; GFX10-NEXT:    v_bfe_u32 v5, v0, 16, 1
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
 ; GFX10-NEXT:    v_bfe_u32 v3, v1, 16, 1
 ; GFX10-NEXT:    v_add3_u32 v2, v2, v4, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v8, v0, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v0
 ; GFX10-NEXT:    v_add3_u32 v5, v5, v0, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v6, v1, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v6, 0x400000, v1
 ; GFX10-NEXT:    v_add3_u32 v3, v3, v1, 0x7fff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
@@ -13276,17 +13103,15 @@ define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x7fff, v5
-; GFX8-NEXT:    v_and_b32_e32 v6, 0xff800000, v4
 ; GFX8-NEXT:    v_sub_f32_e32 v1, v1, v3
-; GFX8-NEXT:    v_or_b32_e32 v6, 0x400000, v6
+; GFX8-NEXT:    v_or_b32_e32 v6, 0x400000, v4
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX8-NEXT:    v_bfe_u32 v3, v1, 16, 1
 ; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v5, v6, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v1
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, s4, v3
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xff800000, v1
-; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v1
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
@@ -13297,16 +13122,14 @@ define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s4, v5
-; GFX8-NEXT:    v_and_b32_e32 v6, 0xff800000, v3
 ; GFX8-NEXT:    v_sub_f32_e32 v0, v0, v2
-; GFX8-NEXT:    v_or_b32_e32 v6, 0x400000, v6
+; GFX8-NEXT:    v_or_b32_e32 v6, 0x400000, v3
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v5, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
@@ -13321,38 +13144,34 @@ define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX9-NEXT:    v_sub_f32_e32 v4, v5, v4
-; GFX9-NEXT:    v_bfe_u32 v5, v4, 16, 1
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_and_b32_e32 v6, 0xff800000, v4
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_bfe_u32 v5, v4, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_sub_f32_e32 v1, v1, v3
 ; GFX9-NEXT:    v_add3_u32 v5, v5, v4, s4
-; GFX9-NEXT:    v_or_b32_e32 v6, 0x400000, v6
+; GFX9-NEXT:    v_or_b32_e32 v6, 0x400000, v4
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
-; GFX9-NEXT:    v_sub_f32_e32 v1, v1, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v6, vcc
 ; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xff800000, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v6, vcc
 ; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v1
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX9-NEXT:    v_sub_f32_e32 v3, v5, v3
-; GFX9-NEXT:    v_bfe_u32 v5, v3, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v6, 0xff800000, v3
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_bfe_u32 v5, v3, 16, 1
+; GFX9-NEXT:    v_sub_f32_e32 v0, v0, v2
 ; GFX9-NEXT:    v_add3_u32 v5, v5, v3, s4
-; GFX9-NEXT:    v_or_b32_e32 v6, 0x400000, v6
+; GFX9-NEXT:    v_or_b32_e32 v6, 0x400000, v3
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT:    v_sub_f32_e32 v0, v0, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
 ; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xff800000, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
 ; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v5, vcc
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
@@ -13373,31 +13192,30 @@ define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX10-NEXT:    v_sub_f32_e32 v1, v1, v3
-; GFX10-NEXT:    v_sub_f32_e32 v5, v7, v6
-; GFX10-NEXT:    v_bfe_u32 v3, v4, 16, 1
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
+; GFX10-NEXT:    v_sub_f32_e32 v3, v7, v6
+; GFX10-NEXT:    v_bfe_u32 v5, v4, 16, 1
+; GFX10-NEXT:    v_or_b32_e32 v7, 0x400000, v4
 ; GFX10-NEXT:    v_sub_f32_e32 v0, v0, v2
-; GFX10-NEXT:    v_and_or_b32 v6, v4, s4, 0x400000
-; GFX10-NEXT:    v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT:    v_add3_u32 v3, v3, v4, 0x7fff
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX10-NEXT:    v_bfe_u32 v8, v0, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v6, v3, 16, 1
+; GFX10-NEXT:    v_add3_u32 v5, v5, v4, 0x7fff
 ; GFX10-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX10-NEXT:    v_add3_u32 v4, v7, v5, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v9, v1, s4, 0x400000
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc_lo
-; GFX10-NEXT:    v_and_or_b32 v6, v5, s4, 0x400000
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT:    v_bfe_u32 v8, v0, 16, 1
+; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v1
+; GFX10-NEXT:    v_add3_u32 v6, v6, v3, 0x7fff
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v5, v7, vcc_lo
+; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v3
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX10-NEXT:    v_add3_u32 v7, v8, v0, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v8, v0, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v0
 ; GFX10-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v6, v5, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v7, v8, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT:    v_perm_b32 v0, v0, v4, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v0, v0, v3, 0x7060302
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v2, v9, vcc_lo
-; GFX10-NEXT:    v_perm_b32 v1, v1, v3, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fsub_v4bf16:
@@ -13405,45 +13223,42 @@ define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    s_mov_b32 s0, 0xff800000
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_sub_f32 v0, v0, v2 :: v_dual_and_b32 v3, 0xffff0000, v3
-; GFX11-NEXT:    v_sub_f32_e32 v4, v5, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_sub_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX11-NEXT:    v_bfe_u32 v8, v0, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_sub_f32_e32 v1, v1, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v3, v4, 16, 1
-; GFX11-NEXT:    v_sub_f32_e32 v5, v7, v6
-; GFX11-NEXT:    v_and_or_b32 v6, v4, s0, 0x400000
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-NEXT:    v_dual_sub_f32 v3, v7, v6 :: v_dual_sub_f32 v4, v5, v4
 ; GFX11-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX11-NEXT:    v_add3_u32 v3, v3, v4, 0x7fff
-; GFX11-NEXT:    v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v9, v1, s0, 0x400000
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_bfe_u32 v6, v3, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v5, v4, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v7, 0x400000, v4
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
 ; GFX11-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add3_u32 v4, v7, v5, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v6, v5, s0, 0x400000
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT:    v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v5, v5, v4, 0x7fff
+; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v4, v5, v7, vcc_lo
+; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v3
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX11-NEXT:    v_add3_u32 v7, v8, v0, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v8, v0, s0, 0x400000
-; GFX11-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc_lo
+; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v0
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, v6, v5, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v7, v8, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT:    v_perm_b32 v0, v0, v3, 0x7060302
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v2, v9, vcc_lo
-; GFX11-NEXT:    v_perm_b32 v0, v0, v4, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v1, v1, v3, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fsub <4 x bfloat> %a, %b
   ret <4 x bfloat> %op
@@ -13481,8 +13296,7 @@ define bfloat @v_fmul_bf16(bfloat %a, bfloat %b) {
 ; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -13496,9 +13310,8 @@ define bfloat @v_fmul_bf16(bfloat %a, bfloat %b) {
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xff800000, v0
 ; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -13509,10 +13322,9 @@ define bfloat @v_fmul_bf16(bfloat %a, bfloat %b) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v2, v0, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
@@ -13524,11 +13336,10 @@ define bfloat @v_fmul_bf16(bfloat %a, bfloat %b) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:    s_mov_b32 s0, 0xff800000
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v2, v0, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
@@ -13586,16 +13397,14 @@ define <2 x bfloat> @v_fmul_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xff800000, v2
 ; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v2
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v3
+; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -13608,20 +13417,18 @@ define <2 x bfloat> @v_fmul_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v2, v3, v2
-; GFX9-NEXT:    v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xff800000, v2
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_add3_u32 v3, v3, v2, s4
-; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v4
+; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v2
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
 ; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xff800000, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
 ; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v3
+; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
@@ -13635,14 +13442,13 @@ define <2 x bfloat> @v_fmul_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
 ; GFX10-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_bfe_u32 v1, v2, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v4, v2, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v4, 0x400000, v2
 ; GFX10-NEXT:    v_bfe_u32 v3, v0, 16, 1
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX10-NEXT:    v_and_or_b32 v5, v0, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v0
 ; GFX10-NEXT:    v_add3_u32 v1, v1, v2, 0x7fff
 ; GFX10-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
@@ -13658,16 +13464,15 @@ define <2 x bfloat> @v_fmul_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    s_mov_b32 s0, 0xff800000
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_bfe_u32 v3, v0, 16, 1
 ; GFX11-NEXT:    v_bfe_u32 v1, v2, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v4, v2, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v4, 0x400000, v2
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT:    v_and_or_b32 v5, v0, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v0
 ; GFX11-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
 ; GFX11-NEXT:    v_add3_u32 v1, v1, v2, 0x7fff
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
@@ -13737,8 +13542,7 @@ define <3 x bfloat> @v_fmul_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
 ; GFX8-NEXT:    v_bfe_u32 v3, v1, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v1
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xff800000, v1
-; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v1
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
@@ -13750,16 +13554,14 @@ define <3 x bfloat> @v_fmul_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, s4, v4
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xff800000, v3
 ; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v3
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -13775,27 +13577,24 @@ define <3 x bfloat> @v_fmul_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
 ; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v3
 ; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xff800000, v1
 ; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v4
+; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v1
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v3, v4, v3
-; GFX9-NEXT:    v_bfe_u32 v4, v3, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xff800000, v3
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_bfe_u32 v4, v3, 16, 1
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX9-NEXT:    v_add3_u32 v4, v4, v3, s4
-; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v3
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
 ; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xff800000, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
 ; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v4
+; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
@@ -13813,18 +13612,17 @@ define <3 x bfloat> @v_fmul_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v4, v5, v4
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v3
 ; GFX10-NEXT:    v_bfe_u32 v2, v4, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v7, v4, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v7, 0x400000, v4
 ; GFX10-NEXT:    v_bfe_u32 v5, v0, 16, 1
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
 ; GFX10-NEXT:    v_bfe_u32 v3, v1, 16, 1
 ; GFX10-NEXT:    v_add3_u32 v2, v2, v4, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v8, v0, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v0
 ; GFX10-NEXT:    v_add3_u32 v5, v5, v0, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v6, v1, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v6, 0x400000, v1
 ; GFX10-NEXT:    v_add3_u32 v3, v3, v1, 0x7fff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
@@ -13908,17 +13706,15 @@ define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x7fff, v5
-; GFX8-NEXT:    v_and_b32_e32 v6, 0xff800000, v4
 ; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX8-NEXT:    v_or_b32_e32 v6, 0x400000, v6
+; GFX8-NEXT:    v_or_b32_e32 v6, 0x400000, v4
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX8-NEXT:    v_bfe_u32 v3, v1, 16, 1
 ; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v5, v6, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v1
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, s4, v3
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xff800000, v1
-; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v1
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
@@ -13929,16 +13725,14 @@ define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s4, v5
-; GFX8-NEXT:    v_and_b32_e32 v6, 0xff800000, v3
 ; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX8-NEXT:    v_or_b32_e32 v6, 0x400000, v6
+; GFX8-NEXT:    v_or_b32_e32 v6, 0x400000, v3
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v5, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
@@ -13953,38 +13747,34 @@ define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX9-NEXT:    v_mul_f32_e32 v4, v5, v4
-; GFX9-NEXT:    v_bfe_u32 v5, v4, 16, 1
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_and_b32_e32 v6, 0xff800000, v4
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_bfe_u32 v5, v4, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v3
 ; GFX9-NEXT:    v_add3_u32 v5, v5, v4, s4
-; GFX9-NEXT:    v_or_b32_e32 v6, 0x400000, v6
+; GFX9-NEXT:    v_or_b32_e32 v6, 0x400000, v4
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
-; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v6, vcc
 ; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xff800000, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v6, vcc
 ; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v1
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v3, v5, v3
-; GFX9-NEXT:    v_bfe_u32 v5, v3, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v6, 0xff800000, v3
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_bfe_u32 v5, v3, 16, 1
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX9-NEXT:    v_add3_u32 v5, v5, v3, s4
-; GFX9-NEXT:    v_or_b32_e32 v6, 0x400000, v6
+; GFX9-NEXT:    v_or_b32_e32 v6, 0x400000, v3
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
 ; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xff800000, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
 ; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v5, vcc
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
@@ -14005,31 +13795,30 @@ define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX10-NEXT:    v_mul_f32_e32 v5, v7, v6
-; GFX10-NEXT:    v_bfe_u32 v3, v4, 16, 1
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
+; GFX10-NEXT:    v_mul_f32_e32 v3, v7, v6
+; GFX10-NEXT:    v_bfe_u32 v5, v4, 16, 1
+; GFX10-NEXT:    v_or_b32_e32 v7, 0x400000, v4
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX10-NEXT:    v_and_or_b32 v6, v4, s4, 0x400000
-; GFX10-NEXT:    v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT:    v_add3_u32 v3, v3, v4, 0x7fff
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX10-NEXT:    v_bfe_u32 v8, v0, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v6, v3, 16, 1
+; GFX10-NEXT:    v_add3_u32 v5, v5, v4, 0x7fff
 ; GFX10-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX10-NEXT:    v_add3_u32 v4, v7, v5, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v9, v1, s4, 0x400000
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc_lo
-; GFX10-NEXT:    v_and_or_b32 v6, v5, s4, 0x400000
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT:    v_bfe_u32 v8, v0, 16, 1
+; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v1
+; GFX10-NEXT:    v_add3_u32 v6, v6, v3, 0x7fff
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v5, v7, vcc_lo
+; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v3
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX10-NEXT:    v_add3_u32 v7, v8, v0, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v8, v0, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v0
 ; GFX10-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v6, v5, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v7, v8, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT:    v_perm_b32 v0, v0, v4, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v0, v0, v3, 0x7060302
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v2, v9, vcc_lo
-; GFX10-NEXT:    v_perm_b32 v1, v1, v3, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fmul_v4bf16:
@@ -14037,45 +13826,42 @@ define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    s_mov_b32 s0, 0xff800000
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_and_b32 v3, 0xffff0000, v3
-; GFX11-NEXT:    v_mul_f32_e32 v4, v5, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX11-NEXT:    v_bfe_u32 v8, v0, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v3, v4, 16, 1
-; GFX11-NEXT:    v_mul_f32_e32 v5, v7, v6
-; GFX11-NEXT:    v_and_or_b32 v6, v4, s0, 0x400000
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-NEXT:    v_dual_mul_f32 v3, v7, v6 :: v_dual_mul_f32 v4, v5, v4
 ; GFX11-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX11-NEXT:    v_add3_u32 v3, v3, v4, 0x7fff
-; GFX11-NEXT:    v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v9, v1, s0, 0x400000
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_bfe_u32 v6, v3, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v5, v4, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v7, 0x400000, v4
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
 ; GFX11-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add3_u32 v4, v7, v5, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v6, v5, s0, 0x400000
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT:    v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v5, v5, v4, 0x7fff
+; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v4, v5, v7, vcc_lo
+; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v3
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX11-NEXT:    v_add3_u32 v7, v8, v0, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v8, v0, s0, 0x400000
-; GFX11-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc_lo
+; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v0
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, v6, v5, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v7, v8, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT:    v_perm_b32 v0, v0, v3, 0x7060302
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v2, v9, vcc_lo
-; GFX11-NEXT:    v_perm_b32 v0, v0, v4, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v1, v1, v3, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fmul <4 x bfloat> %a, %b
   ret <4 x bfloat> %op
@@ -14199,17 +13985,15 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 0x7fff, v9
-; GFX8-NEXT:    v_and_b32_e32 v10, 0xff800000, v8
 ; GFX8-NEXT:    v_mul_f32_e32 v3, v3, v7
-; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v10
+; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v8
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
 ; GFX8-NEXT:    v_bfe_u32 v7, v3, 16, 1
 ; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX8-NEXT:    v_cndmask_b32_e32 v8, v9, v10, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v3
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, s4, v7
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xff800000, v3
-; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v9
+; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v3
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v7, v9, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
@@ -14220,16 +14004,14 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s4, v9
-; GFX8-NEXT:    v_and_b32_e32 v10, 0xff800000, v7
 ; GFX8-NEXT:    v_mul_f32_e32 v2, v2, v6
-; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v10
+; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v7
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
 ; GFX8-NEXT:    v_bfe_u32 v6, v2, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v7, v9, v10, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v2
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, s4, v6
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xff800000, v2
-; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v9
+; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v2
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v6, v9, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
@@ -14240,16 +14022,14 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s4, v9
-; GFX8-NEXT:    v_and_b32_e32 v10, 0xff800000, v6
 ; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v5
-; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v10
+; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v6
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX8-NEXT:    v_bfe_u32 v5, v1, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v1
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s4, v5
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xff800000, v1
-; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v9
+; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v1
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v9, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
@@ -14260,16 +14040,14 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s4, v9
-; GFX8-NEXT:    v_and_b32_e32 v10, 0xff800000, v5
 ; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v4
-; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v10
+; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v5
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX8-NEXT:    v_bfe_u32 v4, v0, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v0
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7fff, v4
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v9
+; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v9, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
@@ -14288,74 +14066,66 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
 ; GFX9-NEXT:    v_mul_f32_e32 v8, v9, v8
-; GFX9-NEXT:    v_bfe_u32 v9, v8, 16, 1
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_and_b32_e32 v10, 0xff800000, v8
 ; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT:    v_bfe_u32 v9, v8, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_mul_f32_e32 v3, v3, v7
 ; GFX9-NEXT:    v_add3_u32 v9, v9, v8, s4
-; GFX9-NEXT:    v_or_b32_e32 v10, 0x400000, v10
+; GFX9-NEXT:    v_or_b32_e32 v10, 0x400000, v8
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
-; GFX9-NEXT:    v_mul_f32_e32 v3, v3, v7
-; GFX9-NEXT:    v_cndmask_b32_e32 v8, v9, v10, vcc
 ; GFX9-NEXT:    v_bfe_u32 v7, v3, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v9, 0xff800000, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v9, v10, vcc
 ; GFX9-NEXT:    v_add3_u32 v7, v7, v3, s4
-; GFX9-NEXT:    v_or_b32_e32 v9, 0x400000, v9
+; GFX9-NEXT:    v_or_b32_e32 v9, 0x400000, v3
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v9, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
 ; GFX9-NEXT:    v_mul_f32_e32 v7, v9, v7
-; GFX9-NEXT:    v_bfe_u32 v9, v7, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v10, 0xff800000, v7
 ; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT:    v_bfe_u32 v9, v7, 16, 1
+; GFX9-NEXT:    v_mul_f32_e32 v2, v2, v6
 ; GFX9-NEXT:    v_add3_u32 v9, v9, v7, s4
-; GFX9-NEXT:    v_or_b32_e32 v10, 0x400000, v10
+; GFX9-NEXT:    v_or_b32_e32 v10, 0x400000, v7
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
-; GFX9-NEXT:    v_mul_f32_e32 v2, v2, v6
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, v9, v10, vcc
 ; GFX9-NEXT:    v_bfe_u32 v6, v2, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v9, 0xff800000, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v9, v10, vcc
 ; GFX9-NEXT:    v_add3_u32 v6, v6, v2, s4
-; GFX9-NEXT:    v_or_b32_e32 v9, 0x400000, v9
+; GFX9-NEXT:    v_or_b32_e32 v9, 0x400000, v2
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v6, v9, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
 ; GFX9-NEXT:    v_mul_f32_e32 v6, v9, v6
-; GFX9-NEXT:    v_bfe_u32 v9, v6, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v10, 0xff800000, v6
 ; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_bfe_u32 v9, v6, 16, 1
+; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v5
 ; GFX9-NEXT:    v_add3_u32 v9, v9, v6, s4
-; GFX9-NEXT:    v_or_b32_e32 v10, 0x400000, v10
+; GFX9-NEXT:    v_or_b32_e32 v10, 0x400000, v6
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
-; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v5
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
 ; GFX9-NEXT:    v_bfe_u32 v5, v1, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v9, 0xff800000, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
 ; GFX9-NEXT:    v_add3_u32 v5, v5, v1, s4
-; GFX9-NEXT:    v_or_b32_e32 v9, 0x400000, v9
+; GFX9-NEXT:    v_or_b32_e32 v9, 0x400000, v1
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v9, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v5, v9, v5
-; GFX9-NEXT:    v_bfe_u32 v9, v5, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v10, 0xff800000, v5
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_bfe_u32 v9, v5, 16, 1
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v4
 ; GFX9-NEXT:    v_add3_u32 v9, v9, v5, s4
-; GFX9-NEXT:    v_or_b32_e32 v10, 0x400000, v10
+; GFX9-NEXT:    v_or_b32_e32 v10, 0x400000, v5
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
 ; GFX9-NEXT:    v_bfe_u32 v4, v0, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v9, 0xff800000, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
 ; GFX9-NEXT:    v_add3_u32 v4, v4, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v9, 0x400000, v9
+; GFX9-NEXT:    v_or_b32_e32 v9, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v9, vcc
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
@@ -14372,62 +14142,61 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
 ; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
 ; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX10-NEXT:    v_mul_f32_e32 v8, v9, v8
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
-; GFX10-NEXT:    v_mul_f32_e32 v3, v3, v7
 ; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
-; GFX10-NEXT:    v_bfe_u32 v10, v8, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v7, v8, s4, 0x400000
-; GFX10-NEXT:    v_mul_f32_e32 v9, v11, v9
-; GFX10-NEXT:    v_bfe_u32 v11, v3, 16, 1
+; GFX10-NEXT:    v_mul_f32_e32 v3, v3, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v0
+; GFX10-NEXT:    v_bfe_u32 v11, v8, 16, 1
+; GFX10-NEXT:    v_mul_f32_e32 v7, v10, v9
+; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v8
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX10-NEXT:    v_add3_u32 v10, v10, v8, 0x7fff
 ; GFX10-NEXT:    v_mul_f32_e32 v2, v2, v6
-; GFX10-NEXT:    v_bfe_u32 v8, v9, 16, 1
+; GFX10-NEXT:    v_add3_u32 v10, v11, v8, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v11, v3, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v12, v7, 16, 1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
-; GFX10-NEXT:    v_and_or_b32 v12, v9, s4, 0x400000
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v10, v7, vcc_lo
-; GFX10-NEXT:    v_add3_u32 v10, v11, v3, 0x7fff
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
 ; GFX10-NEXT:    v_bfe_u32 v13, v2, 16, 1
-; GFX10-NEXT:    v_add3_u32 v8, v8, v9, 0x7fff
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v10, v9, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
+; GFX10-NEXT:    v_add3_u32 v9, v11, v3, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v11, v12, v7, 0x7fff
+; GFX10-NEXT:    v_or_b32_e32 v12, 0x400000, v7
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX10-NEXT:    v_mul_f32_e32 v6, v10, v6
+; GFX10-NEXT:    v_add3_u32 v10, v13, v2, 0x7fff
 ; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT:    v_mul_f32_e32 v6, v11, v6
-; GFX10-NEXT:    v_add3_u32 v9, v13, v2, 0x7fff
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v8, v12, vcc_lo
-; GFX10-NEXT:    v_and_or_b32 v11, v2, s4, 0x400000
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v11, v12, vcc_lo
+; GFX10-NEXT:    v_or_b32_e32 v11, 0x400000, v2
 ; GFX10-NEXT:    v_bfe_u32 v12, v6, 16, 1
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
 ; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v5
 ; GFX10-NEXT:    v_mul_f32_e32 v5, v15, v13
-; GFX10-NEXT:    v_and_or_b32 v14, v3, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v14, 0x400000, v3
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v9, v11, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v10, v11, vcc_lo
 ; GFX10-NEXT:    v_add3_u32 v4, v12, v6, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v9, v6, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v10, 0x400000, v6
 ; GFX10-NEXT:    v_bfe_u32 v11, v1, 16, 1
 ; GFX10-NEXT:    v_bfe_u32 v12, v5, 16, 1
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX10-NEXT:    v_bfe_u32 v13, v0, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v15, v1, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v15, 0x400000, v1
 ; GFX10-NEXT:    v_add3_u32 v6, v11, v1, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v11, v5, s4, 0x400000
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v9, vcc_lo
-; GFX10-NEXT:    v_add3_u32 v9, v12, v5, 0x7fff
+; GFX10-NEXT:    v_or_b32_e32 v11, 0x400000, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v10, vcc_lo
+; GFX10-NEXT:    v_add3_u32 v10, v12, v5, 0x7fff
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX10-NEXT:    v_add3_u32 v12, v13, v0, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v13, v0, s4, 0x400000
-; GFX10-NEXT:    v_perm_b32 v2, v2, v8, 0x7060302
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v9, v11, vcc_lo
+; GFX10-NEXT:    v_or_b32_e32 v13, 0x400000, v0
+; GFX10-NEXT:    v_perm_b32 v2, v2, v7, 0x7060302
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v10, v11, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v12, v13, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
@@ -14435,81 +14204,80 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v6, v15, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX10-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v10, v14, vcc_lo
-; GFX10-NEXT:    v_perm_b32 v3, v3, v7, 0x7060302
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v9, v14, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v3, v3, v8, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fmul_v8bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v15, 16, v0
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX11-NEXT:    s_mov_b32 s0, 0xff800000
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_mul_f32 v8, v9, v8 :: v_dual_lshlrev_b32 v9, 16, v6
-; GFX11-NEXT:    v_bfe_u32 v10, v8, 16, 1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v15, 16, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_mul_f32 v8, v9, v8 :: v_dual_and_b32 v7, 0xffff0000, v7
+; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_mul_f32_e32 v9, v11, v9
-; GFX11-NEXT:    v_add3_u32 v10, v10, v8, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_or_b32 v12, v9, s0, 0x400000
-; GFX11-NEXT:    v_mul_f32_e32 v2, v2, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_bfe_u32 v13, v2, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v11, v8, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_mul_f32_e32 v3, v3, v7
-; GFX11-NEXT:    v_and_or_b32 v7, v8, s0, 0x400000
-; GFX11-NEXT:    v_bfe_u32 v8, v9, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v7, v10, v7, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v8, v8, v9, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT:    v_add3_u32 v9, v13, v2, 0x7fff
-; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-NEXT:    v_mul_f32_e32 v7, v10, v9
+; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v8
+; GFX11-NEXT:    v_add3_u32 v10, v11, v8, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_bfe_u32 v11, v3, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v8, v8, v12, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT:    v_and_or_b32 v14, v3, s0, 0x400000
-; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v4
-; GFX11-NEXT:    v_add3_u32 v10, v11, v3, 0x7fff
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_mul_f32 v6, v11, v6 :: v_dual_and_b32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_and_or_b32 v11, v2, s0, 0x400000
+; GFX11-NEXT:    v_bfe_u32 v12, v7, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v14, 0x400000, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v8, v10, v9, vcc_lo
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-NEXT:    v_add3_u32 v9, v11, v3, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v11, v12, v7, 0x7fff
+; GFX11-NEXT:    v_or_b32_e32 v12, 0x400000, v7
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_cndmask_b32 v7, v11, v12 :: v_dual_mul_f32 v2, v2, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-NEXT:    v_bfe_u32 v13, v2, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_mul_f32_e32 v6, v10, v6
+; GFX11-NEXT:    v_or_b32_e32 v11, 0x400000, v2
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT:    v_add3_u32 v10, v13, v2, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_bfe_u32 v12, v6, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v9, v11, vcc_lo
-; GFX11-NEXT:    v_and_or_b32 v9, v6, s0, 0x400000
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v10, v11, vcc_lo
+; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v6
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_perm_b32 v2, v2, v7, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v4
 ; GFX11-NEXT:    v_add3_u32 v4, v12, v6, 0x7fff
-; GFX11-NEXT:    v_perm_b32 v2, v2, v8, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v11, v1, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v4, v4, v9, vcc_lo
+; GFX11-NEXT:    v_dual_mul_f32 v1, v1, v5 :: v_dual_cndmask_b32 v4, v4, v10
 ; GFX11-NEXT:    v_mul_f32_e32 v5, v15, v13
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_bfe_u32 v11, v1, 16, 1
 ; GFX11-NEXT:    v_bfe_u32 v13, v0, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v15, v1, s0, 0x400000
-; GFX11-NEXT:    v_add3_u32 v6, v11, v1, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_or_b32_e32 v15, 0x400000, v1
 ; GFX11-NEXT:    v_bfe_u32 v12, v5, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v11, v5, s0, 0x400000
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_add3_u32 v6, v11, v1, 0x7fff
+; GFX11-NEXT:    v_or_b32_e32 v11, 0x400000, v5
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT:    v_add3_u32 v9, v12, v5, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v10, v12, v5, 0x7fff
 ; GFX11-NEXT:    v_add3_u32 v12, v13, v0, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v13, v0, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v13, 0x400000, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v5, v9, v11, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e32 v5, v10, v11, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v12, v13, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
@@ -14518,9 +14286,9 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v6, v15, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX11-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v10, v14, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, v9, v14, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v3, v3, v7, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v3, v3, v8, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fmul <8 x bfloat> %a, %b
   ret <8 x bfloat> %op
@@ -14745,16 +14513,14 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
 ; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
 ; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xff800000, v16
 ; GFX8-NEXT:    v_mul_f32_e32 v7, v7, v15
-; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v18
+; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v16
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v16, v16
 ; GFX8-NEXT:    v_bfe_u32 v15, v7, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v16, v17, v18, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v15, vcc, v15, v7
 ; GFX8-NEXT:    v_add_u32_e32 v15, vcc, s4, v15
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xff800000, v7
-; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v17
+; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v7
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
 ; GFX8-NEXT:    v_cndmask_b32_e32 v7, v15, v17, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
@@ -14765,16 +14531,14 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
 ; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xff800000, v15
 ; GFX8-NEXT:    v_mul_f32_e32 v6, v6, v14
-; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v18
+; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v15
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v15, v15
 ; GFX8-NEXT:    v_bfe_u32 v14, v6, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v15, v17, v18, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v14, vcc, v14, v6
 ; GFX8-NEXT:    v_add_u32_e32 v14, vcc, s4, v14
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xff800000, v6
-; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v17
+; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v6
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX8-NEXT:    v_cndmask_b32_e32 v6, v14, v17, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v13
@@ -14785,16 +14549,14 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
 ; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xff800000, v14
 ; GFX8-NEXT:    v_mul_f32_e32 v5, v5, v13
-; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v18
+; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v14
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v14, v14
 ; GFX8-NEXT:    v_bfe_u32 v13, v5, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v14, v17, v18, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v13, vcc, v13, v5
 ; GFX8-NEXT:    v_add_u32_e32 v13, vcc, s4, v13
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xff800000, v5
-; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v17
+; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v5
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v13, v17, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
@@ -14805,16 +14567,14 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
 ; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
 ; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xff800000, v13
 ; GFX8-NEXT:    v_mul_f32_e32 v4, v4, v12
-; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v18
+; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v13
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
 ; GFX8-NEXT:    v_bfe_u32 v12, v4, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v13, v17, v18, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v12, vcc, v12, v4
 ; GFX8-NEXT:    v_add_u32_e32 v12, vcc, s4, v12
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xff800000, v4
-; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v17
+; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v4
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v12, v17, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
@@ -14825,16 +14585,14 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xff800000, v12
 ; GFX8-NEXT:    v_mul_f32_e32 v3, v3, v11
-; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v18
+; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v12
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
 ; GFX8-NEXT:    v_bfe_u32 v11, v3, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v12, v17, v18, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v11, vcc, v11, v3
 ; GFX8-NEXT:    v_add_u32_e32 v11, vcc, s4, v11
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xff800000, v3
-; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v17
+; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v3
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v11, v17, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
@@ -14845,16 +14603,14 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xff800000, v11
 ; GFX8-NEXT:    v_mul_f32_e32 v2, v2, v10
-; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v18
+; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v11
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
 ; GFX8-NEXT:    v_bfe_u32 v10, v2, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v11, v17, v18, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v10, v2
 ; GFX8-NEXT:    v_add_u32_e32 v10, vcc, s4, v10
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xff800000, v2
-; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v17
+; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v2
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v10, v17, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
@@ -14865,16 +14621,14 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xff800000, v10
 ; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v9
-; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v18
+; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v10
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
 ; GFX8-NEXT:    v_bfe_u32 v9, v1, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v10, v17, v18, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v1
 ; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s4, v9
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xff800000, v1
-; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v17
+; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v1
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v9, v17, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v8
@@ -14885,16 +14639,14 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xff800000, v9
 ; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v8
-; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v18
+; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v9
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
 ; GFX8-NEXT:    v_bfe_u32 v8, v0, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v9, v17, v18, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v0
 ; GFX8-NEXT:    v_add_u32_e32 v8, vcc, s4, v8
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v17
+; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v8, v17, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
@@ -14921,146 +14673,130 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
 ; GFX9-NEXT:    v_mul_f32_e32 v16, v17, v16
-; GFX9-NEXT:    v_bfe_u32 v17, v16, 16, 1
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_and_b32_e32 v18, 0xff800000, v16
 ; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
 ; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX9-NEXT:    v_bfe_u32 v17, v16, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_mul_f32_e32 v7, v7, v15
 ; GFX9-NEXT:    v_add3_u32 v17, v17, v16, s4
-; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v18
+; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v16
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v16, v16
-; GFX9-NEXT:    v_mul_f32_e32 v7, v7, v15
-; GFX9-NEXT:    v_cndmask_b32_e32 v16, v17, v18, vcc
 ; GFX9-NEXT:    v_bfe_u32 v15, v7, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xff800000, v7
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v17, v18, vcc
 ; GFX9-NEXT:    v_add3_u32 v15, v15, v7, s4
-; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v17
+; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v7
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
 ; GFX9-NEXT:    v_cndmask_b32_e32 v7, v15, v17, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v6
 ; GFX9-NEXT:    v_mul_f32_e32 v15, v17, v15
-; GFX9-NEXT:    v_bfe_u32 v17, v15, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v18, 0xff800000, v15
 ; GFX9-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX9-NEXT:    v_bfe_u32 v17, v15, 16, 1
+; GFX9-NEXT:    v_mul_f32_e32 v6, v6, v14
 ; GFX9-NEXT:    v_add3_u32 v17, v17, v15, s4
-; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v18
+; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v15
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v15, v15
-; GFX9-NEXT:    v_mul_f32_e32 v6, v6, v14
-; GFX9-NEXT:    v_cndmask_b32_e32 v15, v17, v18, vcc
 ; GFX9-NEXT:    v_bfe_u32 v14, v6, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xff800000, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v15, v17, v18, vcc
 ; GFX9-NEXT:    v_add3_u32 v14, v14, v6, s4
-; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v17
+; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v6
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v14, v17, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v14, 16, v13
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v5
 ; GFX9-NEXT:    v_mul_f32_e32 v14, v17, v14
-; GFX9-NEXT:    v_bfe_u32 v17, v14, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v18, 0xff800000, v14
 ; GFX9-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
 ; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX9-NEXT:    v_bfe_u32 v17, v14, 16, 1
+; GFX9-NEXT:    v_mul_f32_e32 v5, v5, v13
 ; GFX9-NEXT:    v_add3_u32 v17, v17, v14, s4
-; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v18
+; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v14
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v14, v14
-; GFX9-NEXT:    v_mul_f32_e32 v5, v5, v13
-; GFX9-NEXT:    v_cndmask_b32_e32 v14, v17, v18, vcc
 ; GFX9-NEXT:    v_bfe_u32 v13, v5, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xff800000, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v14, v17, v18, vcc
 ; GFX9-NEXT:    v_add3_u32 v13, v13, v5, s4
-; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v17
+; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v5
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v13, v17, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v4
 ; GFX9-NEXT:    v_mul_f32_e32 v13, v17, v13
-; GFX9-NEXT:    v_bfe_u32 v17, v13, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v18, 0xff800000, v13
 ; GFX9-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX9-NEXT:    v_bfe_u32 v17, v13, 16, 1
+; GFX9-NEXT:    v_mul_f32_e32 v4, v4, v12
 ; GFX9-NEXT:    v_add3_u32 v17, v17, v13, s4
-; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v18
+; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v13
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
-; GFX9-NEXT:    v_mul_f32_e32 v4, v4, v12
-; GFX9-NEXT:    v_cndmask_b32_e32 v13, v17, v18, vcc
 ; GFX9-NEXT:    v_bfe_u32 v12, v4, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xff800000, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v13, v17, v18, vcc
 ; GFX9-NEXT:    v_add3_u32 v12, v12, v4, s4
-; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v17
+; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v4
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v12, v17, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v3
 ; GFX9-NEXT:    v_mul_f32_e32 v12, v17, v12
-; GFX9-NEXT:    v_bfe_u32 v17, v12, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v18, 0xff800000, v12
 ; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT:    v_bfe_u32 v17, v12, 16, 1
+; GFX9-NEXT:    v_mul_f32_e32 v3, v3, v11
 ; GFX9-NEXT:    v_add3_u32 v17, v17, v12, s4
-; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v18
+; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v12
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
-; GFX9-NEXT:    v_mul_f32_e32 v3, v3, v11
-; GFX9-NEXT:    v_cndmask_b32_e32 v12, v17, v18, vcc
 ; GFX9-NEXT:    v_bfe_u32 v11, v3, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xff800000, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v12, v17, v18, vcc
 ; GFX9-NEXT:    v_add3_u32 v11, v11, v3, s4
-; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v17
+; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v3
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v11, v17, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v2
 ; GFX9-NEXT:    v_mul_f32_e32 v11, v17, v11
-; GFX9-NEXT:    v_bfe_u32 v17, v11, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v18, 0xff800000, v11
 ; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT:    v_bfe_u32 v17, v11, 16, 1
+; GFX9-NEXT:    v_mul_f32_e32 v2, v2, v10
 ; GFX9-NEXT:    v_add3_u32 v17, v17, v11, s4
-; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v18
+; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v11
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
-; GFX9-NEXT:    v_mul_f32_e32 v2, v2, v10
-; GFX9-NEXT:    v_cndmask_b32_e32 v11, v17, v18, vcc
 ; GFX9-NEXT:    v_bfe_u32 v10, v2, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xff800000, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v11, v17, v18, vcc
 ; GFX9-NEXT:    v_add3_u32 v10, v10, v2, s4
-; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v17
+; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v2
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v10, v17, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v1
 ; GFX9-NEXT:    v_mul_f32_e32 v10, v17, v10
-; GFX9-NEXT:    v_bfe_u32 v17, v10, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v18, 0xff800000, v10
 ; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_bfe_u32 v17, v10, 16, 1
+; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v9
 ; GFX9-NEXT:    v_add3_u32 v17, v17, v10, s4
-; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v18
+; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v10
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
-; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v9
-; GFX9-NEXT:    v_cndmask_b32_e32 v10, v17, v18, vcc
 ; GFX9-NEXT:    v_bfe_u32 v9, v1, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xff800000, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v10, v17, v18, vcc
 ; GFX9-NEXT:    v_add3_u32 v9, v9, v1, s4
-; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v17
+; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v1
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v9, v17, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v8
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v9, v17, v9
-; GFX9-NEXT:    v_bfe_u32 v17, v9, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v18, 0xff800000, v9
 ; GFX9-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_bfe_u32 v17, v9, 16, 1
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v8
 ; GFX9-NEXT:    v_add3_u32 v17, v17, v9, s4
-; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v18
+; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v9
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v8
-; GFX9-NEXT:    v_cndmask_b32_e32 v9, v17, v18, vcc
 ; GFX9-NEXT:    v_bfe_u32 v8, v0, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xff800000, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v9, v17, v18, vcc
 ; GFX9-NEXT:    v_add3_u32 v8, v8, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v17
+; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v8, v17, vcc
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
@@ -15081,27 +14817,26 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
 ; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
 ; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v6
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
 ; GFX10-NEXT:    v_mul_f32_e32 v16, v17, v16
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v14
 ; GFX10-NEXT:    v_mul_f32_e32 v7, v7, v15
 ; GFX10-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
-; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
 ; GFX10-NEXT:    v_bfe_u32 v15, v16, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v20, v16, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v20, 0x400000, v16
 ; GFX10-NEXT:    v_bfe_u32 v19, v7, 16, 1
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
 ; GFX10-NEXT:    v_mul_f32_e32 v17, v18, v17
 ; GFX10-NEXT:    v_add3_u32 v15, v15, v16, 0x7fff
 ; GFX10-NEXT:    v_mul_f32_e32 v6, v6, v14
 ; GFX10-NEXT:    v_add3_u32 v18, v19, v7, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v19, v7, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v19, 0x400000, v7
 ; GFX10-NEXT:    v_bfe_u32 v21, v17, 16, 1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v15, v15, v20, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v5
-; GFX10-NEXT:    v_and_or_b32 v16, v17, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v16, 0x400000, v17
 ; GFX10-NEXT:    v_add3_u32 v14, v21, v17, 0x7fff
 ; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; GFX10-NEXT:    v_cndmask_b32_e32 v7, v18, v19, vcc_lo
@@ -15115,7 +14850,7 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX10-NEXT:    v_mul_f32_e32 v5, v5, v13
 ; GFX10-NEXT:    v_cndmask_b32_e32 v14, v14, v16, vcc_lo
 ; GFX10-NEXT:    v_add3_u32 v16, v18, v6, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v13, v6, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v13, 0x400000, v6
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v12
 ; GFX10-NEXT:    v_bfe_u32 v20, v17, 16, 1
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
@@ -15125,10 +14860,10 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX10-NEXT:    v_cndmask_b32_e32 v6, v16, v13, vcc_lo
 ; GFX10-NEXT:    v_mul_f32_e32 v13, v19, v18
 ; GFX10-NEXT:    v_add3_u32 v16, v20, v17, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v18, v17, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v18, 0x400000, v17
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
 ; GFX10-NEXT:    v_add3_u32 v19, v21, v5, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v20, v5, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v20, 0x400000, v5
 ; GFX10-NEXT:    v_bfe_u32 v21, v13, 16, 1
 ; GFX10-NEXT:    v_mul_f32_e32 v4, v4, v12
 ; GFX10-NEXT:    v_cndmask_b32_e32 v16, v16, v18, vcc_lo
@@ -15138,14 +14873,14 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX10-NEXT:    v_add3_u32 v17, v21, v13, 0x7fff
 ; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
 ; GFX10-NEXT:    v_cndmask_b32_e32 v5, v19, v20, vcc_lo
-; GFX10-NEXT:    v_and_or_b32 v19, v13, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v19, 0x400000, v13
 ; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX10-NEXT:    v_mul_f32_e32 v12, v18, v12
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
 ; GFX10-NEXT:    v_bfe_u32 v20, v4, 16, 1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v10
 ; GFX10-NEXT:    v_mul_f32_e32 v3, v3, v11
-; GFX10-NEXT:    v_and_or_b32 v22, v12, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v22, 0x400000, v12
 ; GFX10-NEXT:    v_cndmask_b32_e32 v13, v17, v19, vcc_lo
 ; GFX10-NEXT:    v_bfe_u32 v17, v12, 16, 1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v2
@@ -15157,12 +14892,12 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
 ; GFX10-NEXT:    v_add3_u32 v19, v20, v3, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v20, v3, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v20, 0x400000, v3
 ; GFX10-NEXT:    v_bfe_u32 v23, v18, 16, 1
 ; GFX10-NEXT:    v_mul_f32_e32 v2, v2, v10
 ; GFX10-NEXT:    v_cndmask_b32_e32 v12, v17, v22, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX10-NEXT:    v_and_or_b32 v17, v18, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v17, 0x400000, v18
 ; GFX10-NEXT:    v_add3_u32 v10, v23, v18, 0x7fff
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v1
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
@@ -15171,8 +14906,8 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v9
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
 ; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX10-NEXT:    v_and_or_b32 v18, v2, s4, 0x400000
-; GFX10-NEXT:    v_and_or_b32 v21, v4, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v18, 0x400000, v2
+; GFX10-NEXT:    v_or_b32_e32 v21, 0x400000, v4
 ; GFX10-NEXT:    v_perm_b32 v3, v3, v12, 0x7060302
 ; GFX10-NEXT:    v_cndmask_b32_e32 v10, v10, v17, vcc_lo
 ; GFX10-NEXT:    v_add3_u32 v17, v19, v2, 0x7fff
@@ -15184,17 +14919,17 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX10-NEXT:    v_bfe_u32 v23, v19, 16, 1
 ; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v9
 ; GFX10-NEXT:    v_mul_f32_e32 v9, v22, v20
-; GFX10-NEXT:    v_and_or_b32 v22, v19, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v22, 0x400000, v19
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v8
 ; GFX10-NEXT:    v_add3_u32 v20, v23, v19, 0x7fff
 ; GFX10-NEXT:    v_bfe_u32 v8, v1, 16, 1
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
 ; GFX10-NEXT:    v_bfe_u32 v23, v9, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v24, v9, s4, 0x400000
-; GFX10-NEXT:    v_and_or_b32 v25, v0, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v24, 0x400000, v9
+; GFX10-NEXT:    v_or_b32_e32 v25, 0x400000, v0
 ; GFX10-NEXT:    v_add3_u32 v8, v8, v1, 0x7fff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v19, v20, v22, vcc_lo
-; GFX10-NEXT:    v_and_or_b32 v22, v1, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v22, 0x400000, v1
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
 ; GFX10-NEXT:    v_bfe_u32 v20, v0, 16, 1
 ; GFX10-NEXT:    v_add3_u32 v23, v23, v9, 0x7fff
@@ -15223,12 +14958,11 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
 ; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX11-NEXT:    s_mov_b32 s0, 0xff800000
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_mul_f32 v16, v17, v16 :: v_dual_lshlrev_b32 v17, 16, v14
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_mul_f32 v16, v17, v16 :: v_dual_and_b32 v15, 0xffff0000, v15
+; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v14
 ; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
-; GFX11-NEXT:    v_and_or_b32 v20, v16, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v16
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_mul_f32_e32 v17, v18, v17
 ; GFX11-NEXT:    v_mul_f32_e32 v6, v6, v14
@@ -15241,13 +14975,13 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX11-NEXT:    v_mul_f32_e32 v7, v7, v15
 ; GFX11-NEXT:    v_bfe_u32 v15, v16, 16, 1
 ; GFX11-NEXT:    v_add3_u32 v15, v15, v16, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v16, v17, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v16, 0x400000, v17
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_dual_cndmask_b32 v15, v15, v20 :: v_dual_lshlrev_b32 v20, 16, v5
 ; GFX11-NEXT:    v_bfe_u32 v19, v7, 16, 1
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
 ; GFX11-NEXT:    v_add3_u32 v18, v19, v7, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v19, v7, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v19, 0x400000, v7
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v7, v18, v19, vcc_lo
 ; GFX11-NEXT:    v_bfe_u32 v18, v6, 16, 1
@@ -15269,32 +15003,32 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
 ; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; GFX11-NEXT:    v_mul_f32_e32 v5, v5, v13
-; GFX11-NEXT:    v_and_or_b32 v13, v6, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v13, 0x400000, v6
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_dual_cndmask_b32 v6, v16, v13 :: v_dual_mul_f32 v13, v19, v18
 ; GFX11-NEXT:    v_add3_u32 v16, v20, v17, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v18, v17, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v18, 0x400000, v17
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
 ; GFX11-NEXT:    v_perm_b32 v6, v6, v14, 0x7060302
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v16, v16, v18, vcc_lo
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v3
 ; GFX11-NEXT:    v_bfe_u32 v21, v5, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v20, v5, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v5
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX11-NEXT:    v_mul_f32_e32 v12, v18, v12
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_add3_u32 v19, v21, v5, 0x7fff
 ; GFX11-NEXT:    v_bfe_u32 v21, v13, 16, 1
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v10
-; GFX11-NEXT:    v_and_or_b32 v22, v12, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v22, 0x400000, v12
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v5, v19, v20, vcc_lo
 ; GFX11-NEXT:    v_add3_u32 v17, v21, v13, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v19, v13, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v19, 0x400000, v13
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
 ; GFX11-NEXT:    v_bfe_u32 v20, v4, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v21, v4, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v21, 0x400000, v4
 ; GFX11-NEXT:    v_perm_b32 v5, v5, v16, 0x7060302
 ; GFX11-NEXT:    v_cndmask_b32_e32 v13, v17, v19, vcc_lo
 ; GFX11-NEXT:    v_bfe_u32 v17, v12, 16, 1
@@ -15310,7 +15044,7 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
 ; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX11-NEXT:    v_bfe_u32 v23, v18, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v17, v18, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v17, 0x400000, v18
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_dual_mul_f32 v2, v2, v10 :: v_dual_and_b32 v1, 0xffff0000, v1
 ; GFX11-NEXT:    v_mul_f32_e32 v3, v3, v11
@@ -15320,13 +15054,13 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX11-NEXT:    v_bfe_u32 v20, v3, 16, 1
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX11-NEXT:    v_add3_u32 v19, v20, v3, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v20, v3, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v3, v19, v20, vcc_lo
 ; GFX11-NEXT:    v_bfe_u32 v19, v2, 16, 1
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v9
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT:    v_and_or_b32 v18, v2, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v18, 0x400000, v2
 ; GFX11-NEXT:    v_perm_b32 v3, v3, v12, 0x7060302
 ; GFX11-NEXT:    v_cndmask_b32_e32 v10, v10, v17, vcc_lo
 ; GFX11-NEXT:    v_add3_u32 v17, v19, v2, 0x7fff
@@ -15343,13 +15077,13 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX11-NEXT:    v_mul_f32_e32 v9, v22, v20
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_add3_u32 v20, v23, v19, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v22, v19, s0, 0x400000
-; GFX11-NEXT:    v_and_or_b32 v25, v0, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v22, 0x400000, v19
+; GFX11-NEXT:    v_or_b32_e32 v25, 0x400000, v0
 ; GFX11-NEXT:    v_bfe_u32 v8, v1, 16, 1
 ; GFX11-NEXT:    v_bfe_u32 v23, v9, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v24, v9, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v24, 0x400000, v9
 ; GFX11-NEXT:    v_cndmask_b32_e32 v19, v20, v22, vcc_lo
-; GFX11-NEXT:    v_and_or_b32 v22, v1, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v22, 0x400000, v1
 ; GFX11-NEXT:    v_add3_u32 v8, v8, v1, 0x7fff
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
 ; GFX11-NEXT:    v_bfe_u32 v20, v0, 16, 1
@@ -15916,16 +15650,14 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
 ; GFX8-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; GFX8-NEXT:    v_add_u32_e32 v32, vcc, s4, v32
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xff800000, v31
 ; GFX8-NEXT:    v_mul_f32_e32 v14, v14, v30
-; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v31
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v31, v31
 ; GFX8-NEXT:    v_bfe_u32 v30, v14, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v31, v32, v33, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v30, vcc, v30, v14
 ; GFX8-NEXT:    v_add_u32_e32 v30, vcc, s4, v30
-; GFX8-NEXT:    v_and_b32_e32 v32, 0xff800000, v14
-; GFX8-NEXT:    v_or_b32_e32 v32, 0x400000, v32
+; GFX8-NEXT:    v_or_b32_e32 v32, 0x400000, v14
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v14, v14
 ; GFX8-NEXT:    v_cndmask_b32_e32 v14, v30, v32, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v30, 16, v29
@@ -15947,29 +15679,25 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-NEXT:    v_mul_f32_e32 v30, v15, v30
 ; GFX8-NEXT:    v_bfe_u32 v15, v33, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v15, vcc, v15, v33
-; GFX8-NEXT:    v_and_b32_e32 v34, 0xff800000, v33
 ; GFX8-NEXT:    v_add_u32_e32 v15, vcc, s4, v15
-; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v33
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v33, v33
 ; GFX8-NEXT:    v_bfe_u32 v33, v30, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v15, v15, v34, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v33, vcc, v33, v30
-; GFX8-NEXT:    v_and_b32_e32 v34, 0xff800000, v30
 ; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v30
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v30, v30
 ; GFX8-NEXT:    v_cndmask_b32_e32 v30, v33, v34, vcc
 ; GFX8-NEXT:    v_bfe_u32 v33, v32, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v33, vcc, v33, v32
 ; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT:    v_and_b32_e32 v34, 0xff800000, v32
-; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v32
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
 ; GFX8-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v29, vcc, v29, v13
 ; GFX8-NEXT:    v_add_u32_e32 v29, vcc, s4, v29
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xff800000, v13
-; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v13
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
 ; GFX8-NEXT:    v_cndmask_b32_e32 v13, v29, v33, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v29, 16, v28
@@ -15980,16 +15708,14 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
 ; GFX8-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
 ; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT:    v_and_b32_e32 v34, 0xff800000, v29
 ; GFX8-NEXT:    v_mul_f32_e32 v12, v12, v28
-; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v29
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v29, v29
 ; GFX8-NEXT:    v_bfe_u32 v28, v12, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v29, v33, v34, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v28, vcc, v28, v12
 ; GFX8-NEXT:    v_add_u32_e32 v28, vcc, s4, v28
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xff800000, v12
-; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v12
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
 ; GFX8-NEXT:    v_cndmask_b32_e32 v12, v28, v33, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
@@ -16000,16 +15726,14 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
 ; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
 ; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT:    v_and_b32_e32 v34, 0xff800000, v28
 ; GFX8-NEXT:    v_mul_f32_e32 v11, v11, v27
-; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v28
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v28, v28
 ; GFX8-NEXT:    v_bfe_u32 v27, v11, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v28, v33, v34, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v27, vcc, v27, v11
 ; GFX8-NEXT:    v_add_u32_e32 v27, vcc, s4, v27
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xff800000, v11
-; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v11
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
 ; GFX8-NEXT:    v_cndmask_b32_e32 v11, v27, v33, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v27, 16, v26
@@ -16020,16 +15744,14 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
 ; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
 ; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT:    v_and_b32_e32 v34, 0xff800000, v27
 ; GFX8-NEXT:    v_mul_f32_e32 v10, v10, v26
-; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v27
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v27, v27
 ; GFX8-NEXT:    v_bfe_u32 v26, v10, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v27, v33, v34, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v26, vcc, v26, v10
 ; GFX8-NEXT:    v_add_u32_e32 v26, vcc, s4, v26
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xff800000, v10
-; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v10
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
 ; GFX8-NEXT:    v_cndmask_b32_e32 v10, v26, v33, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v26, 16, v25
@@ -16040,16 +15762,14 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
 ; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
 ; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT:    v_and_b32_e32 v34, 0xff800000, v26
 ; GFX8-NEXT:    v_mul_f32_e32 v9, v9, v25
-; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v26
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v26, v26
 ; GFX8-NEXT:    v_bfe_u32 v25, v9, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v26, v33, v34, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v25, vcc, v25, v9
 ; GFX8-NEXT:    v_add_u32_e32 v25, vcc, s4, v25
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xff800000, v9
-; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v9
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
 ; GFX8-NEXT:    v_cndmask_b32_e32 v9, v25, v33, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
@@ -16060,16 +15780,14 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
 ; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
 ; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT:    v_and_b32_e32 v34, 0xff800000, v25
 ; GFX8-NEXT:    v_mul_f32_e32 v8, v8, v24
-; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v25
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v25, v25
 ; GFX8-NEXT:    v_bfe_u32 v24, v8, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v25, v33, v34, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v24, vcc, v24, v8
 ; GFX8-NEXT:    v_add_u32_e32 v24, vcc, s4, v24
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xff800000, v8
-; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v8
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
 ; GFX8-NEXT:    v_cndmask_b32_e32 v8, v24, v33, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
@@ -16080,16 +15798,14 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
 ; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
 ; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT:    v_and_b32_e32 v34, 0xff800000, v24
 ; GFX8-NEXT:    v_mul_f32_e32 v7, v7, v23
-; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v24
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v24, v24
 ; GFX8-NEXT:    v_bfe_u32 v23, v7, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v24, v33, v34, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v23, vcc, v23, v7
 ; GFX8-NEXT:    v_add_u32_e32 v23, vcc, s4, v23
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xff800000, v7
-; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v7
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
 ; GFX8-NEXT:    v_cndmask_b32_e32 v7, v23, v33, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
@@ -16100,16 +15816,14 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
 ; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
 ; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT:    v_and_b32_e32 v34, 0xff800000, v23
 ; GFX8-NEXT:    v_mul_f32_e32 v6, v6, v22
-; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v23
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v23, v23
 ; GFX8-NEXT:    v_bfe_u32 v22, v6, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v23, v33, v34, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v22, vcc, v22, v6
 ; GFX8-NEXT:    v_add_u32_e32 v22, vcc, s4, v22
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xff800000, v6
-; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v6
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX8-NEXT:    v_cndmask_b32_e32 v6, v22, v33, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
@@ -16120,16 +15834,14 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
 ; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT:    v_and_b32_e32 v34, 0xff800000, v22
 ; GFX8-NEXT:    v_mul_f32_e32 v5, v5, v21
-; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v22
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v22, v22
 ; GFX8-NEXT:    v_bfe_u32 v21, v5, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v22, v33, v34, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v21, vcc, v21, v5
 ; GFX8-NEXT:    v_add_u32_e32 v21, vcc, s4, v21
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xff800000, v5
-; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v5
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v21, v33, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
@@ -16140,16 +15852,14 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
 ; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
 ; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT:    v_and_b32_e32 v34, 0xff800000, v21
 ; GFX8-NEXT:    v_mul_f32_e32 v4, v4, v20
-; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v21
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v21, v21
 ; GFX8-NEXT:    v_bfe_u32 v20, v4, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v21, v33, v34, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v20, vcc, v20, v4
 ; GFX8-NEXT:    v_add_u32_e32 v20, vcc, s4, v20
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xff800000, v4
-; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v4
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v20, v33, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
@@ -16160,16 +15870,14 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT:    v_and_b32_e32 v34, 0xff800000, v20
 ; GFX8-NEXT:    v_mul_f32_e32 v3, v3, v19
-; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v20
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v20, v20
 ; GFX8-NEXT:    v_bfe_u32 v19, v3, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v20, v33, v34, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v19, vcc, v19, v3
 ; GFX8-NEXT:    v_add_u32_e32 v19, vcc, s4, v19
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xff800000, v3
-; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v3
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v19, v33, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
@@ -16180,16 +15888,14 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT:    v_and_b32_e32 v34, 0xff800000, v19
 ; GFX8-NEXT:    v_mul_f32_e32 v2, v2, v18
-; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v19
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v19, v19
 ; GFX8-NEXT:    v_bfe_u32 v18, v2, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v19, v33, v34, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v18, vcc, v18, v2
 ; GFX8-NEXT:    v_add_u32_e32 v18, vcc, s4, v18
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xff800000, v2
-; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v2
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v18, v33, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
@@ -16200,16 +15906,14 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT:    v_and_b32_e32 v34, 0xff800000, v18
 ; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v17
-; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v18
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v18, v18
 ; GFX8-NEXT:    v_bfe_u32 v17, v1, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v18, v33, v34, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v1
 ; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xff800000, v1
-; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v1
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v17, v33, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v16
@@ -16220,16 +15924,14 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT:    v_and_b32_e32 v34, 0xff800000, v17
 ; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v16
-; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v17
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
 ; GFX8-NEXT:    v_bfe_u32 v16, v0, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v17, v33, v34, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v16, vcc, v16, v0
 ; GFX8-NEXT:    v_add_u32_e32 v16, vcc, s4, v16
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v16, v33, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -16270,292 +15972,260 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v31, 16, v30
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v14
 ; GFX9-NEXT:    v_mul_f32_e32 v31, v32, v31
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_bfe_u32 v32, v31, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xff800000, v31
 ; GFX9-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
 ; GFX9-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_bfe_u32 v32, v31, 16, 1
+; GFX9-NEXT:    v_mul_f32_e32 v14, v14, v30
 ; GFX9-NEXT:    v_add3_u32 v32, v32, v31, s4
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v31
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v31, v31
-; GFX9-NEXT:    v_mul_f32_e32 v14, v14, v30
-; GFX9-NEXT:    v_cndmask_b32_e32 v31, v32, v33, vcc
 ; GFX9-NEXT:    v_bfe_u32 v30, v14, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v32, 0xff800000, v14
+; GFX9-NEXT:    v_cndmask_b32_e32 v31, v32, v33, vcc
 ; GFX9-NEXT:    v_add3_u32 v30, v30, v14, s4
-; GFX9-NEXT:    v_or_b32_e32 v32, 0x400000, v32
+; GFX9-NEXT:    v_or_b32_e32 v32, 0x400000, v14
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v14, v14
 ; GFX9-NEXT:    v_cndmask_b32_e32 v14, v30, v32, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v30, 16, v29
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v13
+; GFX9-NEXT:    v_mul_f32_e32 v30, v32, v30
 ; GFX9-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
 ; GFX9-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; GFX9-NEXT:    v_mul_f32_e32 v13, v13, v29
-; GFX9-NEXT:    buffer_load_dword v29, off, s[0:3], s32
-; GFX9-NEXT:    v_mul_f32_e32 v30, v32, v30
 ; GFX9-NEXT:    v_bfe_u32 v32, v30, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xff800000, v30
+; GFX9-NEXT:    v_mul_f32_e32 v13, v13, v29
 ; GFX9-NEXT:    v_add3_u32 v32, v32, v30, s4
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v30
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v30, v30
+; GFX9-NEXT:    v_bfe_u32 v29, v13, 16, 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v30, v32, v33, vcc
-; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v15
+; GFX9-NEXT:    v_add3_u32 v29, v29, v13, s4
+; GFX9-NEXT:    v_or_b32_e32 v32, 0x400000, v13
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
+; GFX9-NEXT:    v_cndmask_b32_e32 v13, v29, v32, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v29, 16, v28
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v12
+; GFX9-NEXT:    v_mul_f32_e32 v32, v32, v29
+; GFX9-NEXT:    buffer_load_dword v29, off, s[0:3], s32
+; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v15
 ; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX9-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
+; GFX9-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX9-NEXT:    v_mul_f32_e32 v12, v12, v28
+; GFX9-NEXT:    v_bfe_u32 v28, v12, 16, 1
+; GFX9-NEXT:    v_add3_u32 v28, v28, v12, s4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v29
-; GFX9-NEXT:    v_mul_f32_e32 v32, v32, v33
+; GFX9-NEXT:    v_lshlrev_b32_e32 v34, 16, v29
+; GFX9-NEXT:    v_mul_f32_e32 v33, v33, v34
 ; GFX9-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
 ; GFX9-NEXT:    v_mul_f32_e32 v29, v15, v29
-; GFX9-NEXT:    v_bfe_u32 v15, v32, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xff800000, v32
-; GFX9-NEXT:    v_add3_u32 v15, v15, v32, s4
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v33
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
-; GFX9-NEXT:    v_cndmask_b32_e32 v15, v15, v33, vcc
-; GFX9-NEXT:    v_bfe_u32 v32, v29, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xff800000, v29
-; GFX9-NEXT:    v_add3_u32 v32, v32, v29, s4
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT:    v_bfe_u32 v15, v33, 16, 1
+; GFX9-NEXT:    v_add3_u32 v15, v15, v33, s4
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v33
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v33, v33
+; GFX9-NEXT:    v_bfe_u32 v33, v29, 16, 1
+; GFX9-NEXT:    v_cndmask_b32_e32 v15, v15, v34, vcc
+; GFX9-NEXT:    v_add3_u32 v33, v33, v29, s4
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v29
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v29, v29
-; GFX9-NEXT:    v_cndmask_b32_e32 v29, v32, v33, vcc
-; GFX9-NEXT:    v_bfe_u32 v32, v13, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xff800000, v13
-; GFX9-NEXT:    v_add3_u32 v32, v32, v13, s4
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v33
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
-; GFX9-NEXT:    v_cndmask_b32_e32 v13, v32, v33, vcc
-; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v28
-; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v12
-; GFX9-NEXT:    v_mul_f32_e32 v32, v33, v32
+; GFX9-NEXT:    v_cndmask_b32_e32 v29, v33, v34, vcc
 ; GFX9-NEXT:    v_bfe_u32 v33, v32, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v34, 0xff800000, v32
-; GFX9-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
-; GFX9-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
 ; GFX9-NEXT:    v_add3_u32 v33, v33, v32, s4
-; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v32
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
-; GFX9-NEXT:    v_mul_f32_e32 v12, v12, v28
 ; GFX9-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
-; GFX9-NEXT:    v_bfe_u32 v28, v12, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xff800000, v12
-; GFX9-NEXT:    v_add3_u32 v28, v28, v12, s4
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v12
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
 ; GFX9-NEXT:    v_cndmask_b32_e32 v12, v28, v33, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v11
 ; GFX9-NEXT:    v_mul_f32_e32 v28, v33, v28
-; GFX9-NEXT:    v_bfe_u32 v33, v28, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v34, 0xff800000, v28
 ; GFX9-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
 ; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX9-NEXT:    v_bfe_u32 v33, v28, 16, 1
+; GFX9-NEXT:    v_mul_f32_e32 v11, v11, v27
 ; GFX9-NEXT:    v_add3_u32 v33, v33, v28, s4
-; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v28
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v28, v28
-; GFX9-NEXT:    v_mul_f32_e32 v11, v11, v27
-; GFX9-NEXT:    v_cndmask_b32_e32 v28, v33, v34, vcc
 ; GFX9-NEXT:    v_bfe_u32 v27, v11, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xff800000, v11
+; GFX9-NEXT:    v_cndmask_b32_e32 v28, v33, v34, vcc
 ; GFX9-NEXT:    v_add3_u32 v27, v27, v11, s4
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v11
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
 ; GFX9-NEXT:    v_cndmask_b32_e32 v11, v27, v33, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v27, 16, v26
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v10
 ; GFX9-NEXT:    v_mul_f32_e32 v27, v33, v27
-; GFX9-NEXT:    v_bfe_u32 v33, v27, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v34, 0xff800000, v27
 ; GFX9-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
 ; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX9-NEXT:    v_bfe_u32 v33, v27, 16, 1
+; GFX9-NEXT:    v_mul_f32_e32 v10, v10, v26
 ; GFX9-NEXT:    v_add3_u32 v33, v33, v27, s4
-; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v27
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v27, v27
-; GFX9-NEXT:    v_mul_f32_e32 v10, v10, v26
-; GFX9-NEXT:    v_cndmask_b32_e32 v27, v33, v34, vcc
 ; GFX9-NEXT:    v_bfe_u32 v26, v10, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xff800000, v10
+; GFX9-NEXT:    v_cndmask_b32_e32 v27, v33, v34, vcc
 ; GFX9-NEXT:    v_add3_u32 v26, v26, v10, s4
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v10
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
 ; GFX9-NEXT:    v_cndmask_b32_e32 v10, v26, v33, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v26, 16, v25
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v9
 ; GFX9-NEXT:    v_mul_f32_e32 v26, v33, v26
-; GFX9-NEXT:    v_bfe_u32 v33, v26, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v34, 0xff800000, v26
 ; GFX9-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
 ; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX9-NEXT:    v_bfe_u32 v33, v26, 16, 1
+; GFX9-NEXT:    v_mul_f32_e32 v9, v9, v25
 ; GFX9-NEXT:    v_add3_u32 v33, v33, v26, s4
-; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v26
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v26, v26
-; GFX9-NEXT:    v_mul_f32_e32 v9, v9, v25
-; GFX9-NEXT:    v_cndmask_b32_e32 v26, v33, v34, vcc
 ; GFX9-NEXT:    v_bfe_u32 v25, v9, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xff800000, v9
+; GFX9-NEXT:    v_cndmask_b32_e32 v26, v33, v34, vcc
 ; GFX9-NEXT:    v_add3_u32 v25, v25, v9, s4
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v9
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
 ; GFX9-NEXT:    v_cndmask_b32_e32 v9, v25, v33, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v8
 ; GFX9-NEXT:    v_mul_f32_e32 v25, v33, v25
-; GFX9-NEXT:    v_bfe_u32 v33, v25, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v34, 0xff800000, v25
 ; GFX9-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
 ; GFX9-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX9-NEXT:    v_bfe_u32 v33, v25, 16, 1
+; GFX9-NEXT:    v_mul_f32_e32 v8, v8, v24
 ; GFX9-NEXT:    v_add3_u32 v33, v33, v25, s4
-; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v25
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v25, v25
-; GFX9-NEXT:    v_mul_f32_e32 v8, v8, v24
-; GFX9-NEXT:    v_cndmask_b32_e32 v25, v33, v34, vcc
 ; GFX9-NEXT:    v_bfe_u32 v24, v8, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xff800000, v8
+; GFX9-NEXT:    v_cndmask_b32_e32 v25, v33, v34, vcc
 ; GFX9-NEXT:    v_add3_u32 v24, v24, v8, s4
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v8
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
 ; GFX9-NEXT:    v_cndmask_b32_e32 v8, v24, v33, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v7
 ; GFX9-NEXT:    v_mul_f32_e32 v24, v33, v24
-; GFX9-NEXT:    v_bfe_u32 v33, v24, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v34, 0xff800000, v24
 ; GFX9-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
 ; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX9-NEXT:    v_bfe_u32 v33, v24, 16, 1
+; GFX9-NEXT:    v_mul_f32_e32 v7, v7, v23
 ; GFX9-NEXT:    v_add3_u32 v33, v33, v24, s4
-; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v24
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v24, v24
-; GFX9-NEXT:    v_mul_f32_e32 v7, v7, v23
-; GFX9-NEXT:    v_cndmask_b32_e32 v24, v33, v34, vcc
 ; GFX9-NEXT:    v_bfe_u32 v23, v7, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xff800000, v7
+; GFX9-NEXT:    v_cndmask_b32_e32 v24, v33, v34, vcc
 ; GFX9-NEXT:    v_add3_u32 v23, v23, v7, s4
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v7
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
 ; GFX9-NEXT:    v_cndmask_b32_e32 v7, v23, v33, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v6
 ; GFX9-NEXT:    v_mul_f32_e32 v23, v33, v23
-; GFX9-NEXT:    v_bfe_u32 v33, v23, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v34, 0xff800000, v23
 ; GFX9-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
 ; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX9-NEXT:    v_bfe_u32 v33, v23, 16, 1
+; GFX9-NEXT:    v_mul_f32_e32 v6, v6, v22
 ; GFX9-NEXT:    v_add3_u32 v33, v33, v23, s4
-; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v23
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v23, v23
-; GFX9-NEXT:    v_mul_f32_e32 v6, v6, v22
-; GFX9-NEXT:    v_cndmask_b32_e32 v23, v33, v34, vcc
 ; GFX9-NEXT:    v_bfe_u32 v22, v6, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xff800000, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v23, v33, v34, vcc
 ; GFX9-NEXT:    v_add3_u32 v22, v22, v6, s4
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v6
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v22, v33, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v5
 ; GFX9-NEXT:    v_mul_f32_e32 v22, v33, v22
-; GFX9-NEXT:    v_bfe_u32 v33, v22, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v34, 0xff800000, v22
 ; GFX9-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
 ; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX9-NEXT:    v_bfe_u32 v33, v22, 16, 1
+; GFX9-NEXT:    v_mul_f32_e32 v5, v5, v21
 ; GFX9-NEXT:    v_add3_u32 v33, v33, v22, s4
-; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v22
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v22, v22
-; GFX9-NEXT:    v_mul_f32_e32 v5, v5, v21
-; GFX9-NEXT:    v_cndmask_b32_e32 v22, v33, v34, vcc
 ; GFX9-NEXT:    v_bfe_u32 v21, v5, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xff800000, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v22, v33, v34, vcc
 ; GFX9-NEXT:    v_add3_u32 v21, v21, v5, s4
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v5
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v21, v33, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v4
 ; GFX9-NEXT:    v_mul_f32_e32 v21, v33, v21
-; GFX9-NEXT:    v_bfe_u32 v33, v21, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v34, 0xff800000, v21
 ; GFX9-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX9-NEXT:    v_bfe_u32 v33, v21, 16, 1
+; GFX9-NEXT:    v_mul_f32_e32 v4, v4, v20
 ; GFX9-NEXT:    v_add3_u32 v33, v33, v21, s4
-; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v21
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v21, v21
-; GFX9-NEXT:    v_mul_f32_e32 v4, v4, v20
-; GFX9-NEXT:    v_cndmask_b32_e32 v21, v33, v34, vcc
 ; GFX9-NEXT:    v_bfe_u32 v20, v4, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xff800000, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v21, v33, v34, vcc
 ; GFX9-NEXT:    v_add3_u32 v20, v20, v4, s4
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v4
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v20, v33, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v3
 ; GFX9-NEXT:    v_mul_f32_e32 v20, v33, v20
-; GFX9-NEXT:    v_bfe_u32 v33, v20, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v34, 0xff800000, v20
 ; GFX9-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT:    v_bfe_u32 v33, v20, 16, 1
+; GFX9-NEXT:    v_mul_f32_e32 v3, v3, v19
 ; GFX9-NEXT:    v_add3_u32 v33, v33, v20, s4
-; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v20
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v20, v20
-; GFX9-NEXT:    v_mul_f32_e32 v3, v3, v19
-; GFX9-NEXT:    v_cndmask_b32_e32 v20, v33, v34, vcc
 ; GFX9-NEXT:    v_bfe_u32 v19, v3, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xff800000, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v20, v33, v34, vcc
 ; GFX9-NEXT:    v_add3_u32 v19, v19, v3, s4
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v3
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v19, v33, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v2
 ; GFX9-NEXT:    v_mul_f32_e32 v19, v33, v19
-; GFX9-NEXT:    v_bfe_u32 v33, v19, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v34, 0xff800000, v19
 ; GFX9-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT:    v_bfe_u32 v33, v19, 16, 1
+; GFX9-NEXT:    v_mul_f32_e32 v2, v2, v18
 ; GFX9-NEXT:    v_add3_u32 v33, v33, v19, s4
-; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v19
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v19, v19
-; GFX9-NEXT:    v_mul_f32_e32 v2, v2, v18
-; GFX9-NEXT:    v_cndmask_b32_e32 v19, v33, v34, vcc
 ; GFX9-NEXT:    v_bfe_u32 v18, v2, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xff800000, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v19, v33, v34, vcc
 ; GFX9-NEXT:    v_add3_u32 v18, v18, v2, s4
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v2
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v18, v33, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v1
 ; GFX9-NEXT:    v_mul_f32_e32 v18, v33, v18
-; GFX9-NEXT:    v_bfe_u32 v33, v18, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v34, 0xff800000, v18
 ; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_bfe_u32 v33, v18, 16, 1
+; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v17
 ; GFX9-NEXT:    v_add3_u32 v33, v33, v18, s4
-; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v18
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v18, v18
-; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v17
-; GFX9-NEXT:    v_cndmask_b32_e32 v18, v33, v34, vcc
 ; GFX9-NEXT:    v_bfe_u32 v17, v1, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xff800000, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v18, v33, v34, vcc
 ; GFX9-NEXT:    v_add3_u32 v17, v17, v1, s4
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v1
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v17, v33, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v16
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v17, v33, v17
-; GFX9-NEXT:    v_bfe_u32 v33, v17, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v34, 0xff800000, v17
 ; GFX9-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_bfe_u32 v33, v17, 16, 1
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v16
 ; GFX9-NEXT:    v_add3_u32 v33, v33, v17, s4
-; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v17
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v16
-; GFX9-NEXT:    v_cndmask_b32_e32 v17, v33, v34, vcc
 ; GFX9-NEXT:    v_bfe_u32 v16, v0, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xff800000, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v17, v33, v34, vcc
 ; GFX9-NEXT:    v_add3_u32 v16, v16, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v16, v33, vcc
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
@@ -16580,7 +16250,7 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX10-LABEL: v_fmul_v32bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX10-NEXT:    buffer_load_dword v32, off, s[0:3], s32
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v39, 16, v27
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v48, 16, v11
 ; GFX10-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
@@ -16645,7 +16315,6 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX10-NEXT:    v_bfe_u32 v55, v11, 16, 1
 ; GFX10-NEXT:    v_bfe_u32 v65, v49, 16, 1
 ; GFX10-NEXT:    v_bfe_u32 v67, v10, 16, 1
-; GFX10-NEXT:    s_mov_b32 s23, 0xff800000
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v33, 16, v30
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v34, 16, v14
 ; GFX10-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
@@ -16661,10 +16330,10 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v17
 ; GFX10-NEXT:    v_mul_f32_e32 v17, v26, v50
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v16
-; GFX10-NEXT:    v_and_or_b32 v54, v39, s23, 0x400000
-; GFX10-NEXT:    v_and_or_b32 v64, v11, s23, 0x400000
-; GFX10-NEXT:    v_and_or_b32 v66, v49, s23, 0x400000
-; GFX10-NEXT:    v_and_or_b32 v68, v10, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v54, 0x400000, v39
+; GFX10-NEXT:    v_or_b32_e32 v64, 0x400000, v11
+; GFX10-NEXT:    v_or_b32_e32 v66, 0x400000, v49
+; GFX10-NEXT:    v_or_b32_e32 v68, 0x400000, v10
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s9, v39, v39
 ; GFX10-NEXT:    v_add3_u32 v39, v53, v39, 0x7fff
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s10, v11, v11
@@ -16702,28 +16371,28 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX10-NEXT:    v_bfe_u32 v27, v14, 16, 1
 ; GFX10-NEXT:    v_bfe_u32 v29, v35, 16, 1
 ; GFX10-NEXT:    v_bfe_u32 v34, v13, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v48, v37, s23, 0x400000
-; GFX10-NEXT:    v_and_or_b32 v52, v12, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v48, 0x400000, v37
+; GFX10-NEXT:    v_or_b32_e32 v52, 0x400000, v12
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s7, v37, v37
 ; GFX10-NEXT:    v_add3_u32 v37, v38, v37, 0x7fff
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s8, v12, v12
 ; GFX10-NEXT:    v_add3_u32 v12, v50, v12, 0x7fff
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s10, v18, v18
 ; GFX10-NEXT:    v_add3_u32 v54, v54, v18, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v18, v18, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v18, 0x400000, v18
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s11, v1, v1
 ; GFX10-NEXT:    v_add3_u32 v64, v64, v1, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v1, v1, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v1, 0x400000, v1
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s12, v17, v17
 ; GFX10-NEXT:    v_add3_u32 v66, v66, v17, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v17, v17, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v17, 0x400000, v17
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s22, v0, v0
 ; GFX10-NEXT:    v_add3_u32 v68, v68, v0, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v0, v0, s23, 0x400000
-; GFX10-NEXT:    v_and_or_b32 v26, v33, s23, 0x400000
-; GFX10-NEXT:    v_and_or_b32 v28, v14, s23, 0x400000
-; GFX10-NEXT:    v_and_or_b32 v30, v35, s23, 0x400000
-; GFX10-NEXT:    v_and_or_b32 v36, v13, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v0, 0x400000, v0
+; GFX10-NEXT:    v_or_b32_e32 v26, 0x400000, v33
+; GFX10-NEXT:    v_or_b32_e32 v28, 0x400000, v14
+; GFX10-NEXT:    v_or_b32_e32 v30, 0x400000, v35
+; GFX10-NEXT:    v_or_b32_e32 v36, 0x400000, v13
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
 ; GFX10-NEXT:    v_add3_u32 v16, v16, v33, 0x7fff
 ; GFX10-NEXT:    v_bfe_u32 v33, v51, 16, 1
@@ -16742,12 +16411,12 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX10-NEXT:    v_cndmask_b32_e64 v17, v66, v17, s12
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v68, v0, s22
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v64, v1, s11
-; GFX10-NEXT:    v_lshlrev_b32_e32 v32, 16, v15
+; GFX10-NEXT:    v_lshlrev_b32_e32 v31, 16, v15
 ; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX10-NEXT:    v_and_or_b32 v27, v51, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v27, 0x400000, v51
 ; GFX10-NEXT:    v_bfe_u32 v35, v9, 16, 1
 ; GFX10-NEXT:    v_bfe_u32 v38, v25, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v67, v24, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v67, 0x400000, v24
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s13, v51, v51
 ; GFX10-NEXT:    v_add3_u32 v33, v33, v51, 0x7fff
 ; GFX10-NEXT:    v_bfe_u32 v51, v7, 16, 1
@@ -16764,51 +16433,51 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX10-NEXT:    v_bfe_u32 v36, v3, 16, 1
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s8, v19, v19
 ; GFX10-NEXT:    v_add3_u32 v48, v48, v19, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v19, v19, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v19, 0x400000, v19
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s9, v2, v2
 ; GFX10-NEXT:    v_add3_u32 v52, v52, v2, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v2, v2, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v2
 ; GFX10-NEXT:    v_perm_b32 v0, v0, v17, 0x7060302
 ; GFX10-NEXT:    v_perm_b32 v1, v1, v18, 0x7060302
-; GFX10-NEXT:    v_and_or_b32 v34, v9, s23, 0x400000
-; GFX10-NEXT:    v_and_or_b32 v50, v25, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v34, 0x400000, v9
+; GFX10-NEXT:    v_or_b32_e32 v50, 0x400000, v25
 ; GFX10-NEXT:    v_bfe_u32 v53, v8, 16, 1
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s14, v9, v9
 ; GFX10-NEXT:    v_add3_u32 v9, v35, v9, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v35, v7, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v35, 0x400000, v7
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s15, v25, v25
 ; GFX10-NEXT:    v_add3_u32 v25, v38, v25, 0x7fff
 ; GFX10-NEXT:    v_bfe_u32 v38, v23, 16, 1
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s18, v7, v7
 ; GFX10-NEXT:    v_add3_u32 v7, v51, v7, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v51, v6, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v51, 0x400000, v6
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s20, v6, v6
 ; GFX10-NEXT:    v_add3_u32 v6, v65, v6, 0x7fff
 ; GFX10-NEXT:    v_bfe_u32 v65, v5, 16, 1
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v21, v21
 ; GFX10-NEXT:    v_add3_u32 v26, v26, v21, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v21, v21, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v21, 0x400000, v21
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s5, v4, v4
 ; GFX10-NEXT:    v_add3_u32 v28, v28, v4, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v4, v4, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v4, 0x400000, v4
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s6, v20, v20
 ; GFX10-NEXT:    v_add3_u32 v30, v30, v20, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v20, v20, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v20, 0x400000, v20
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s7, v3, v3
 ; GFX10-NEXT:    v_add3_u32 v36, v36, v3, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v3, v3, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v3, 0x400000, v3
 ; GFX10-NEXT:    v_cndmask_b32_e64 v19, v48, v19, s8
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v52, v2, s9
-; GFX10-NEXT:    v_and_or_b32 v55, v8, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v55, 0x400000, v8
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s16, v8, v8
 ; GFX10-NEXT:    v_add3_u32 v8, v53, v8, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v53, v23, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v53, 0x400000, v23
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s19, v23, v23
 ; GFX10-NEXT:    v_add3_u32 v23, v38, v23, 0x7fff
 ; GFX10-NEXT:    v_bfe_u32 v38, v22, 16, 1
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX10-NEXT:    v_add3_u32 v65, v65, v5, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v5, v5, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v21, v26, v21, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v4, v28, v4, s5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v20, v30, v20, s6
@@ -16816,7 +16485,7 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX10-NEXT:    v_perm_b32 v2, v2, v19, 0x7060302
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s21, v22, v22
 ; GFX10-NEXT:    v_add3_u32 v38, v38, v22, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v22, v22, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v22, 0x400000, v22
 ; GFX10-NEXT:    v_cndmask_b32_e32 v5, v65, v5, vcc_lo
 ; GFX10-NEXT:    v_perm_b32 v3, v3, v20, 0x7060302
 ; GFX10-NEXT:    v_perm_b32 v4, v4, v21, 0x7060302
@@ -16840,14 +16509,14 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX10-NEXT:    v_perm_b32 v13, v13, v29, 0x7060302
 ; GFX10-NEXT:    v_perm_b32 v14, v14, v16, 0x7060302
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v31
-; GFX10-NEXT:    v_and_b32_e32 v18, 0xffff0000, v31
-; GFX10-NEXT:    v_mul_f32_e32 v17, v32, v17
+; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v32
+; GFX10-NEXT:    v_and_b32_e32 v18, 0xffff0000, v32
+; GFX10-NEXT:    v_mul_f32_e32 v17, v31, v17
 ; GFX10-NEXT:    v_mul_f32_e32 v15, v15, v18
 ; GFX10-NEXT:    v_bfe_u32 v18, v17, 16, 1
 ; GFX10-NEXT:    v_bfe_u32 v19, v15, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v20, v17, s23, 0x400000
-; GFX10-NEXT:    v_and_or_b32 v21, v15, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v20, 0x400000, v17
+; GFX10-NEXT:    v_or_b32_e32 v21, 0x400000, v15
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v15, v15
 ; GFX10-NEXT:    v_add3_u32 v17, v18, v17, 0x7fff
@@ -16860,212 +16529,219 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX11-LABEL: v_fmul_v32bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    scratch_load_b32 v31, off, s32
+; GFX11-NEXT:    scratch_load_b32 v32, off, s32
+; GFX11-NEXT:    v_lshlrev_b32_e32 v67, 16, v21
+; GFX11-NEXT:    v_lshlrev_b32_e32 v68, 16, v5
+; GFX11-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v83, 16, v17
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v84, 16, v1
 ; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v49, 16, v26
+; GFX11-NEXT:    v_dual_mul_f32 v5, v5, v21 :: v_dual_and_b32 v26, 0xffff0000, v26
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v53, 16, v24
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
-; GFX11-NEXT:    s_mov_b32 s0, 0xff800000
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_mul_f32 v1, v1, v17 :: v_dual_lshlrev_b32 v64, 16, v7
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_mul_f32 v1, v1, v17 :: v_dual_and_b32 v24, 0xffff0000, v24
+; GFX11-NEXT:    v_lshlrev_b32_e32 v71, 16, v19
+; GFX11-NEXT:    v_bfe_u32 v103, v5, 16, 1
+; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v81, 16, v18
-; GFX11-NEXT:    v_lshlrev_b32_e32 v85, 16, v16
-; GFX11-NEXT:    v_lshlrev_b32_e32 v86, 16, v0
 ; GFX11-NEXT:    v_bfe_u32 v135, v1, 16, 1
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v55, 16, v23
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
-; GFX11-NEXT:    v_and_or_b32 v144, v1, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v112, 0x400000, v5
+; GFX11-NEXT:    v_or_b32_e32 v144, 0x400000, v1
+; GFX11-NEXT:    v_add3_u32 v103, v103, v5, 0x7fff
+; GFX11-NEXT:    v_lshlrev_b32_e32 v80, 16, v3
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX11-NEXT:    v_add3_u32 v135, v135, v1, 0x7fff
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v82, 16, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v54, 16, v8
-; GFX11-NEXT:    v_dual_mul_f32 v17, v86, v85 :: v_dual_and_b32 v8, 0xffff0000, v8
-; GFX11-NEXT:    v_dual_mul_f32 v7, v7, v23 :: v_dual_lshlrev_b32 v36, 16, v13
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_mul_f32 v8, v8, v24 :: v_dual_lshlrev_b32 v39, 16, v27
-; GFX11-NEXT:    v_dual_mul_f32 v0, v0, v16 :: v_dual_lshlrev_b32 v49, 16, v26
-; GFX11-NEXT:    v_mul_f32_e32 v24, v64, v55
-; GFX11-NEXT:    v_bfe_u32 v87, v7, 16, 1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v52, 16, v9
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_mul_f32 v3, v3, v19 :: v_dual_lshlrev_b32 v54, 16, v8
+; GFX11-NEXT:    v_lshlrev_b32_e32 v85, 16, v16
+; GFX11-NEXT:    v_dual_mul_f32 v19, v82, v81 :: v_dual_lshlrev_b32 v64, 16, v7
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v65, 16, v22
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v66, 16, v6
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
-; GFX11-NEXT:    v_bfe_u32 v85, v24, 16, 1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v67, 16, v21
-; GFX11-NEXT:    v_lshlrev_b32_e32 v68, 16, v5
-; GFX11-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_bfe_u32 v129, v19, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v130, 0x400000, v19
+; GFX11-NEXT:    v_lshlrev_b32_e32 v48, 16, v11
+; GFX11-NEXT:    v_bfe_u32 v119, v3, 16, 1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v51, 16, v25
+; GFX11-NEXT:    v_add3_u32 v129, v129, v19, 0x7fff
+; GFX11-NEXT:    v_lshlrev_b32_e32 v86, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_mul_f32 v17, v86, v85 :: v_dual_and_b32 v2, 0xffff0000, v2
+; GFX11-NEXT:    v_dual_mul_f32 v8, v8, v24 :: v_dual_lshlrev_b32 v39, 16, v27
+; GFX11-NEXT:    v_or_b32_e32 v128, 0x400000, v3
+; GFX11-NEXT:    v_add3_u32 v119, v119, v3, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_bfe_u32 v145, v17, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v146, 0x400000, v17
+; GFX11-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GFX11-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v70, 16, v4
-; GFX11-NEXT:    v_and_or_b32 v86, v24, s0, 0x400000
-; GFX11-NEXT:    v_and_or_b32 v96, v7, s0, 0x400000
+; GFX11-NEXT:    v_add3_u32 v145, v145, v17, 0x7fff
+; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GFX11-NEXT:    v_lshlrev_b32_e32 v55, 16, v23
+; GFX11-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; GFX11-NEXT:    v_lshlrev_b32_e32 v50, 16, v10
+; GFX11-NEXT:    v_mul_f32_e32 v2, v2, v18
+; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v16
+; GFX11-NEXT:    v_dual_mul_f32 v24, v64, v55 :: v_dual_lshlrev_b32 v37, 16, v28
+; GFX11-NEXT:    v_mul_f32_e32 v7, v7, v23
+; GFX11-NEXT:    v_dual_mul_f32 v23, v66, v65 :: v_dual_mul_f32 v18, v84, v83
+; GFX11-NEXT:    v_dual_mul_f32 v9, v9, v25 :: v_dual_and_b32 v28, 0xffff0000, v28
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_bfe_u32 v85, v24, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v97, v23, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v86, 0x400000, v24
+; GFX11-NEXT:    v_or_b32_e32 v98, 0x400000, v23
+; GFX11-NEXT:    v_bfe_u32 v87, v7, 16, 1
 ; GFX11-NEXT:    v_add3_u32 v85, v85, v24, 0x7fff
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v69, 16, v20
-; GFX11-NEXT:    v_add3_u32 v87, v87, v7, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-NEXT:    v_dual_mul_f32 v23, v66, v65 :: v_dual_lshlrev_b32 v48, 16, v11
-; GFX11-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
-; GFX11-NEXT:    v_dual_mul_f32 v5, v5, v21 :: v_dual_lshlrev_b32 v50, 16, v10
-; GFX11-NEXT:    v_dual_mul_f32 v21, v70, v69 :: v_dual_and_b32 v26, 0xffff0000, v26
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT:    v_mul_f32_e32 v6, v6, v22
-; GFX11-NEXT:    v_lshlrev_b32_e32 v52, 16, v9
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX11-NEXT:    v_bfe_u32 v97, v23, 16, 1
-; GFX11-NEXT:    v_mul_f32_e32 v2, v2, v18
-; GFX11-NEXT:    v_mul_f32_e32 v18, v84, v83
-; GFX11-NEXT:    v_bfe_u32 v83, v8, 16, 1
-; GFX11-NEXT:    v_bfe_u32 v99, v6, 16, 1
-; GFX11-NEXT:    v_bfe_u32 v103, v5, 16, 1
-; GFX11-NEXT:    v_bfe_u32 v113, v21, 16, 1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v71, 16, v19
-; GFX11-NEXT:    v_and_or_b32 v84, v8, s0, 0x400000
-; GFX11-NEXT:    v_and_or_b32 v98, v23, s0, 0x400000
-; GFX11-NEXT:    v_and_or_b32 v100, v6, s0, 0x400000
-; GFX11-NEXT:    v_and_or_b32 v112, v5, s0, 0x400000
-; GFX11-NEXT:    v_and_or_b32 v114, v21, s0, 0x400000
-; GFX11-NEXT:    v_add3_u32 v83, v83, v8, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
-; GFX11-NEXT:    v_add3_u32 v97, v97, v23, 0x7fff
 ; GFX11-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
-; GFX11-NEXT:    v_add3_u32 v99, v99, v6, 0x7fff
-; GFX11-NEXT:    v_add3_u32 v103, v103, v5, 0x7fff
-; GFX11-NEXT:    v_lshlrev_b32_e32 v80, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT:    v_add3_u32 v113, v113, v21, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v97, v97, v23, 0x7fff
 ; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v38, 16, v12
 ; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
-; GFX11-NEXT:    v_dual_mul_f32 v3, v3, v19 :: v_dual_and_b32 v10, 0xffff0000, v10
-; GFX11-NEXT:    v_dual_mul_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v51, 16, v25
-; GFX11-NEXT:    v_lshlrev_b32_e32 v37, 16, v28
-; GFX11-NEXT:    v_dual_mul_f32 v4, v4, v20 :: v_dual_and_b32 v25, 0xffff0000, v25
+; GFX11-NEXT:    v_or_b32_e32 v96, 0x400000, v7
+; GFX11-NEXT:    v_add3_u32 v87, v87, v7, 0x7fff
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-NEXT:    v_mul_f32_e32 v4, v4, v20
 ; GFX11-NEXT:    v_mul_f32_e32 v20, v80, v71
-; GFX11-NEXT:    v_dual_mul_f32 v19, v82, v81 :: v_dual_and_b32 v28, 0xffff0000, v28
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_mul_f32 v9, v9, v25 :: v_dual_and_b32 v12, 0xffff0000, v12
-; GFX11-NEXT:    v_mul_f32_e32 v25, v54, v53
+; GFX11-NEXT:    v_bfe_u32 v71, v9, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v80, 0x400000, v9
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v35, 16, v29
-; GFX11-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
-; GFX11-NEXT:    v_dual_mul_f32 v10, v10, v26 :: v_dual_and_b32 v13, 0xffff0000, v13
-; GFX11-NEXT:    v_dual_mul_f32 v12, v12, v28 :: v_dual_lshlrev_b32 v33, 16, v30
-; GFX11-NEXT:    v_mul_f32_e32 v28, v48, v39
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_mul_f32 v13, v13, v29 :: v_dual_lshlrev_b32 v34, 16, v14
-; GFX11-NEXT:    v_dual_mul_f32 v11, v11, v27 :: v_dual_and_b32 v14, 0xffff0000, v14
-; GFX11-NEXT:    v_dual_mul_f32 v27, v50, v49 :: v_dual_mul_f32 v26, v52, v51
-; GFX11-NEXT:    v_dual_mul_f32 v29, v38, v37 :: v_dual_and_b32 v30, 0xffff0000, v30
-; GFX11-NEXT:    v_lshlrev_b32_e32 v32, 16, v15
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX11-NEXT:    v_bfe_u32 v39, v13, 16, 1
+; GFX11-NEXT:    v_dual_mul_f32 v21, v70, v69 :: v_dual_and_b32 v10, 0xffff0000, v10
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_add3_u32 v71, v71, v9, 0x7fff
+; GFX11-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; GFX11-NEXT:    v_dual_mul_f32 v10, v10, v26 :: v_dual_and_b32 v29, 0xffff0000, v29
+; GFX11-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
+; GFX11-NEXT:    v_mul_f32_e32 v26, v52, v51
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_mul_f32_e32 v6, v6, v22
+; GFX11-NEXT:    v_lshlrev_b32_e32 v36, 16, v13
+; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX11-NEXT:    v_dual_mul_f32 v11, v11, v27 :: v_dual_lshlrev_b32 v34, 16, v14
+; GFX11-NEXT:    v_dual_mul_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v33, 16, v30
+; GFX11-NEXT:    v_dual_mul_f32 v27, v50, v49 :: v_dual_lshlrev_b32 v38, 16, v12
+; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX11-NEXT:    v_dual_mul_f32 v25, v54, v53 :: v_dual_and_b32 v12, 0xffff0000, v12
+; GFX11-NEXT:    v_dual_mul_f32 v13, v13, v29 :: v_dual_and_b32 v30, 0xffff0000, v30
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_mul_f32_e32 v29, v38, v37
+; GFX11-NEXT:    v_lshlrev_b32_e32 v31, 16, v15
+; GFX11-NEXT:    v_dual_mul_f32 v12, v12, v28 :: v_dual_and_b32 v15, 0xffff0000, v15
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_mul_f32_e32 v14, v14, v30
+; GFX11-NEXT:    v_mul_f32_e32 v28, v48, v39
 ; GFX11-NEXT:    v_dual_mul_f32 v30, v36, v35 :: v_dual_mul_f32 v33, v34, v33
-; GFX11-NEXT:    v_and_or_b32 v48, v13, s0, 0x400000
-; GFX11-NEXT:    v_bfe_u32 v49, v29, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v39, v13, 16, 1
 ; GFX11-NEXT:    v_bfe_u32 v35, v14, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v36, v14, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v14
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_bfe_u32 v37, v30, 16, 1
 ; GFX11-NEXT:    v_bfe_u32 v16, v33, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v34, v33, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v34, 0x400000, v33
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX11-NEXT:    v_bfe_u32 v37, v30, 16, 1
 ; GFX11-NEXT:    v_add3_u32 v35, v35, v14, 0x7fff
+; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v30
 ; GFX11-NEXT:    v_add3_u32 v16, v16, v33, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v38, v30, s0, 0x400000
-; GFX11-NEXT:    v_add3_u32 v39, v39, v13, 0x7fff
 ; GFX11-NEXT:    v_add3_u32 v37, v37, v30, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v50, v29, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v48, 0x400000, v13
+; GFX11-NEXT:    v_bfe_u32 v49, v29, 16, 1
+; GFX11-NEXT:    v_add3_u32 v39, v39, v13, 0x7fff
 ; GFX11-NEXT:    v_cndmask_b32_e32 v16, v16, v34, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-NEXT:    v_or_b32_e32 v50, 0x400000, v29
 ; GFX11-NEXT:    v_bfe_u32 v51, v12, 16, 1
 ; GFX11-NEXT:    v_add3_u32 v49, v49, v29, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v52, v12, s0, 0x400000
-; GFX11-NEXT:    v_bfe_u32 v53, v28, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v52, 0x400000, v12
 ; GFX11-NEXT:    v_cndmask_b32_e32 v14, v35, v36, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX11-NEXT:    v_bfe_u32 v53, v28, 16, 1
 ; GFX11-NEXT:    v_add3_u32 v51, v51, v12, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v54, v28, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v54, 0x400000, v28
 ; GFX11-NEXT:    v_bfe_u32 v55, v11, 16, 1
-; GFX11-NEXT:    v_add3_u32 v53, v53, v28, 0x7fff
 ; GFX11-NEXT:    v_cndmask_b32_e32 v30, v37, v38, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-NEXT:    v_and_or_b32 v64, v11, s0, 0x400000
+; GFX11-NEXT:    v_add3_u32 v53, v53, v28, 0x7fff
+; GFX11-NEXT:    v_or_b32_e32 v64, 0x400000, v11
 ; GFX11-NEXT:    v_bfe_u32 v65, v27, 16, 1
 ; GFX11-NEXT:    v_add3_u32 v55, v55, v11, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v66, v27, s0, 0x400000
 ; GFX11-NEXT:    v_cndmask_b32_e32 v13, v39, v48, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX11-NEXT:    v_or_b32_e32 v66, 0x400000, v27
 ; GFX11-NEXT:    v_bfe_u32 v67, v10, 16, 1
 ; GFX11-NEXT:    v_add3_u32 v65, v65, v27, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v68, v10, s0, 0x400000
-; GFX11-NEXT:    v_bfe_u32 v69, v26, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v68, 0x400000, v10
 ; GFX11-NEXT:    v_cndmask_b32_e32 v29, v49, v50, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-NEXT:    v_bfe_u32 v69, v26, 16, 1
 ; GFX11-NEXT:    v_add3_u32 v67, v67, v10, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v70, v26, s0, 0x400000
-; GFX11-NEXT:    v_bfe_u32 v71, v9, 16, 1
-; GFX11-NEXT:    v_add3_u32 v69, v69, v26, 0x7fff
+; GFX11-NEXT:    v_or_b32_e32 v70, 0x400000, v26
+; GFX11-NEXT:    v_bfe_u32 v81, v25, 16, 1
 ; GFX11-NEXT:    v_cndmask_b32_e32 v12, v51, v52, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
-; GFX11-NEXT:    v_and_or_b32 v80, v9, s0, 0x400000
-; GFX11-NEXT:    v_bfe_u32 v81, v25, 16, 1
-; GFX11-NEXT:    v_add3_u32 v71, v71, v9, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v82, v25, s0, 0x400000
+; GFX11-NEXT:    v_add3_u32 v69, v69, v26, 0x7fff
+; GFX11-NEXT:    v_or_b32_e32 v82, 0x400000, v25
+; GFX11-NEXT:    v_bfe_u32 v83, v8, 16, 1
+; GFX11-NEXT:    v_add3_u32 v81, v81, v25, 0x7fff
 ; GFX11-NEXT:    v_cndmask_b32_e32 v28, v53, v54, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-NEXT:    v_add3_u32 v81, v81, v25, 0x7fff
-; GFX11-NEXT:    v_bfe_u32 v101, v22, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v102, v22, s0, 0x400000
-; GFX11-NEXT:    v_bfe_u32 v115, v4, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v84, 0x400000, v8
+; GFX11-NEXT:    v_add3_u32 v83, v83, v8, 0x7fff
+; GFX11-NEXT:    v_bfe_u32 v99, v6, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v100, 0x400000, v6
 ; GFX11-NEXT:    v_cndmask_b32_e32 v11, v55, v64, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v27, v27
-; GFX11-NEXT:    v_add3_u32 v101, v101, v22, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v116, v4, s0, 0x400000
-; GFX11-NEXT:    v_bfe_u32 v117, v20, 16, 1
-; GFX11-NEXT:    v_add3_u32 v115, v115, v4, 0x7fff
+; GFX11-NEXT:    v_bfe_u32 v101, v22, 16, 1
+; GFX11-NEXT:    v_add3_u32 v99, v99, v6, 0x7fff
+; GFX11-NEXT:    v_or_b32_e32 v102, 0x400000, v22
+; GFX11-NEXT:    v_bfe_u32 v113, v21, 16, 1
 ; GFX11-NEXT:    v_cndmask_b32_e32 v27, v65, v66, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT:    v_and_or_b32 v118, v20, s0, 0x400000
-; GFX11-NEXT:    v_bfe_u32 v129, v19, 16, 1
-; GFX11-NEXT:    v_add3_u32 v117, v117, v20, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v130, v19, s0, 0x400000
+; GFX11-NEXT:    v_add3_u32 v101, v101, v22, 0x7fff
+; GFX11-NEXT:    v_or_b32_e32 v114, 0x400000, v21
+; GFX11-NEXT:    v_bfe_u32 v115, v4, 16, 1
+; GFX11-NEXT:    v_add3_u32 v113, v113, v21, 0x7fff
 ; GFX11-NEXT:    v_cndmask_b32_e32 v10, v67, v68, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX11-NEXT:    v_bfe_u32 v133, v18, 16, 1
-; GFX11-NEXT:    v_add3_u32 v129, v129, v19, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v134, v18, s0, 0x400000
-; GFX11-NEXT:    v_bfe_u32 v145, v17, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v116, 0x400000, v4
+; GFX11-NEXT:    v_bfe_u32 v117, v20, 16, 1
+; GFX11-NEXT:    v_add3_u32 v115, v115, v4, 0x7fff
+; GFX11-NEXT:    v_or_b32_e32 v118, 0x400000, v20
 ; GFX11-NEXT:    v_cndmask_b32_e32 v26, v69, v70, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT:    v_add3_u32 v133, v133, v18, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v146, v17, s0, 0x400000
+; GFX11-NEXT:    v_add3_u32 v117, v117, v20, 0x7fff
+; GFX11-NEXT:    v_bfe_u32 v133, v18, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v134, 0x400000, v18
 ; GFX11-NEXT:    v_bfe_u32 v147, v0, 16, 1
-; GFX11-NEXT:    v_add3_u32 v145, v145, v17, 0x7fff
 ; GFX11-NEXT:    v_cndmask_b32_e32 v9, v71, v80, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX11-NEXT:    v_bfe_u32 v131, v2, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v33, v0, s0, 0x400000
+; GFX11-NEXT:    v_add3_u32 v133, v133, v18, 0x7fff
+; GFX11-NEXT:    v_or_b32_e32 v33, 0x400000, v0
 ; GFX11-NEXT:    v_add3_u32 v147, v147, v0, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v132, v2, s0, 0x400000
+; GFX11-NEXT:    v_bfe_u32 v131, v2, 16, 1
 ; GFX11-NEXT:    v_cndmask_b32_e32 v25, v81, v82, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT:    v_add3_u32 v131, v131, v2, 0x7fff
-; GFX11-NEXT:    v_bfe_u32 v119, v3, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v128, v3, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v132, 0x400000, v2
 ; GFX11-NEXT:    v_perm_b32 v9, v9, v26, 0x7060302
+; GFX11-NEXT:    v_add3_u32 v131, v131, v2, 0x7fff
+; GFX11-NEXT:    v_perm_b32 v10, v10, v27, 0x7060302
 ; GFX11-NEXT:    v_cndmask_b32_e32 v8, v83, v84, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX11-NEXT:    v_add3_u32 v119, v119, v3, 0x7fff
-; GFX11-NEXT:    v_perm_b32 v10, v10, v27, 0x7060302
 ; GFX11-NEXT:    v_perm_b32 v11, v11, v28, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v12, v12, v29, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v13, v13, v30, 0x7060302
 ; GFX11-NEXT:    v_perm_b32 v8, v8, v25, 0x7060302
 ; GFX11-NEXT:    v_cndmask_b32_e32 v24, v85, v86, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT:    v_perm_b32 v12, v12, v29, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v13, v13, v30, 0x7060302
 ; GFX11-NEXT:    v_perm_b32 v14, v14, v16, 0x7060302
 ; GFX11-NEXT:    v_cndmask_b32_e32 v7, v87, v96, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
@@ -17104,22 +16780,21 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX11-NEXT:    v_perm_b32 v0, v0, v17, 0x7060302
 ; GFX11-NEXT:    v_cndmask_b32_e32 v2, v131, v132, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v2, v2, v19, 0x7060302
 ; GFX11-NEXT:    v_cndmask_b32_e32 v3, v119, v128, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_perm_b32 v3, v3, v20, 0x7060302
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v31
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xffff0000, v31
-; GFX11-NEXT:    v_perm_b32 v2, v2, v19, 0x7060302
-; GFX11-NEXT:    v_mul_f32_e32 v17, v32, v17
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v32
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_mul_f32 v17, v31, v17 :: v_dual_and_b32 v18, 0xffff0000, v32
 ; GFX11-NEXT:    v_mul_f32_e32 v15, v15, v18
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_bfe_u32 v18, v17, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_bfe_u32 v19, v15, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v20, v17, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v17
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT:    v_and_or_b32 v21, v15, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v21, 0x400000, v15
 ; GFX11-NEXT:    v_add3_u32 v18, v18, v17, 0x7fff
 ; GFX11-NEXT:    v_add3_u32 v19, v19, v15, 0x7fff
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
@@ -17194,9 +16869,8 @@ define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) {
 ; GFX8-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
 ; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xff800000, v0
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -17220,9 +16894,8 @@ define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) {
 ; GFX9-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX9-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
 ; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xff800000, v0
+; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v2
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -17235,7 +16908,6 @@ define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_div_scale_f32 v2, s4, v1, v1, v0
 ; GFX10-NEXT:    v_div_scale_f32 v5, vcc_lo, v0, v1, v0
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
 ; GFX10-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX10-NEXT:    v_fma_f32 v4, -v2, v3, 1.0
 ; GFX10-NEXT:    v_fmac_f32_e32 v3, v4, v3
@@ -17246,7 +16918,7 @@ define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) {
 ; GFX10-NEXT:    v_div_fmas_f32 v2, v2, v3, v4
 ; GFX10-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
 ; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v2, v0, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
@@ -17258,7 +16930,6 @@ define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_mov_b32 s0, 0xff800000
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_div_scale_f32 v2, null, v1, v1, v0
 ; GFX11-NEXT:    v_rcp_f32_e32 v3, v2
@@ -17277,7 +16948,7 @@ define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
 ; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v2, v0, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
@@ -17637,8 +17308,7 @@ define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) {
 ; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -17652,9 +17322,8 @@ define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) {
 ; GFX9-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xff800000, v0
 ; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -17665,10 +17334,9 @@ define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
 ; GFX10-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v2, v0, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
@@ -17680,11 +17348,10 @@ define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:    s_mov_b32 s0, 0xff800000
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v2, v0, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
@@ -17750,16 +17417,14 @@ define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xff800000, v2
 ; GFX8-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v2
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v3
+; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -17772,20 +17437,18 @@ define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX9-NEXT:    v_min_f32_e32 v2, v3, v2
-; GFX9-NEXT:    v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xff800000, v2
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_add3_u32 v3, v3, v2, s4
-; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v4
+; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v2
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
 ; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xff800000, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
 ; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v3
+; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
@@ -17799,14 +17462,13 @@ define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
 ; GFX10-NEXT:    v_min_f32_e32 v2, v3, v2
 ; GFX10-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_bfe_u32 v1, v2, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v4, v2, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v4, 0x400000, v2
 ; GFX10-NEXT:    v_bfe_u32 v3, v0, 16, 1
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX10-NEXT:    v_and_or_b32 v5, v0, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v0
 ; GFX10-NEXT:    v_add3_u32 v1, v1, v2, 0x7fff
 ; GFX10-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
@@ -17822,16 +17484,15 @@ define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    s_mov_b32 s0, 0xff800000
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_min_f32_e32 v2, v3, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_bfe_u32 v3, v0, 16, 1
 ; GFX11-NEXT:    v_bfe_u32 v1, v2, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v4, v2, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v4, 0x400000, v2
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT:    v_and_or_b32 v5, v0, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v0
 ; GFX11-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
 ; GFX11-NEXT:    v_add3_u32 v1, v1, v2, 0x7fff
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
@@ -17913,8 +17574,7 @@ define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
 ; GFX8-NEXT:    v_bfe_u32 v3, v1, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v1
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xff800000, v1
-; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v1
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
@@ -17926,16 +17586,14 @@ define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, s4, v4
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xff800000, v3
 ; GFX8-NEXT:    v_min_f32_e32 v0, v0, v2
-; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v3
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -17951,27 +17609,24 @@ define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
 ; GFX9-NEXT:    v_min_f32_e32 v1, v1, v3
 ; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xff800000, v1
 ; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v4
+; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v1
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
 ; GFX9-NEXT:    v_min_f32_e32 v3, v4, v3
-; GFX9-NEXT:    v_bfe_u32 v4, v3, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xff800000, v3
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_bfe_u32 v4, v3, 16, 1
+; GFX9-NEXT:    v_min_f32_e32 v0, v0, v2
 ; GFX9-NEXT:    v_add3_u32 v4, v4, v3, s4
-; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v3
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT:    v_min_f32_e32 v0, v0, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
 ; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xff800000, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
 ; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v4
+; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
@@ -17989,18 +17644,17 @@ define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_min_f32_e32 v4, v5, v4
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
 ; GFX10-NEXT:    v_min_f32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_min_f32_e32 v1, v1, v3
 ; GFX10-NEXT:    v_bfe_u32 v2, v4, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v7, v4, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v7, 0x400000, v4
 ; GFX10-NEXT:    v_bfe_u32 v5, v0, 16, 1
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
 ; GFX10-NEXT:    v_bfe_u32 v3, v1, 16, 1
 ; GFX10-NEXT:    v_add3_u32 v2, v2, v4, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v8, v0, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v0
 ; GFX10-NEXT:    v_add3_u32 v5, v5, v0, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v6, v1, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v6, 0x400000, v1
 ; GFX10-NEXT:    v_add3_u32 v3, v3, v1, 0x7fff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
@@ -18100,17 +17754,15 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x7fff, v5
-; GFX8-NEXT:    v_and_b32_e32 v6, 0xff800000, v4
 ; GFX8-NEXT:    v_min_f32_e32 v1, v1, v3
-; GFX8-NEXT:    v_or_b32_e32 v6, 0x400000, v6
+; GFX8-NEXT:    v_or_b32_e32 v6, 0x400000, v4
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX8-NEXT:    v_bfe_u32 v3, v1, 16, 1
 ; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v5, v6, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v1
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, s4, v3
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xff800000, v1
-; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v1
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
@@ -18121,16 +17773,14 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s4, v5
-; GFX8-NEXT:    v_and_b32_e32 v6, 0xff800000, v3
 ; GFX8-NEXT:    v_min_f32_e32 v0, v0, v2
-; GFX8-NEXT:    v_or_b32_e32 v6, 0x400000, v6
+; GFX8-NEXT:    v_or_b32_e32 v6, 0x400000, v3
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v5, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
@@ -18145,38 +17795,34 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX9-NEXT:    v_min_f32_e32 v4, v5, v4
-; GFX9-NEXT:    v_bfe_u32 v5, v4, 16, 1
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_and_b32_e32 v6, 0xff800000, v4
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_bfe_u32 v5, v4, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_min_f32_e32 v1, v1, v3
 ; GFX9-NEXT:    v_add3_u32 v5, v5, v4, s4
-; GFX9-NEXT:    v_or_b32_e32 v6, 0x400000, v6
+; GFX9-NEXT:    v_or_b32_e32 v6, 0x400000, v4
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
-; GFX9-NEXT:    v_min_f32_e32 v1, v1, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v6, vcc
 ; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xff800000, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v6, vcc
 ; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v1
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX9-NEXT:    v_min_f32_e32 v3, v5, v3
-; GFX9-NEXT:    v_bfe_u32 v5, v3, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v6, 0xff800000, v3
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_bfe_u32 v5, v3, 16, 1
+; GFX9-NEXT:    v_min_f32_e32 v0, v0, v2
 ; GFX9-NEXT:    v_add3_u32 v5, v5, v3, s4
-; GFX9-NEXT:    v_or_b32_e32 v6, 0x400000, v6
+; GFX9-NEXT:    v_or_b32_e32 v6, 0x400000, v3
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT:    v_min_f32_e32 v0, v0, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
 ; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xff800000, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
 ; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v5, vcc
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
@@ -18197,31 +17843,30 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX10-NEXT:    v_min_f32_e32 v1, v1, v3
-; GFX10-NEXT:    v_min_f32_e32 v5, v7, v6
-; GFX10-NEXT:    v_bfe_u32 v3, v4, 16, 1
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
+; GFX10-NEXT:    v_min_f32_e32 v3, v7, v6
+; GFX10-NEXT:    v_bfe_u32 v5, v4, 16, 1
+; GFX10-NEXT:    v_or_b32_e32 v7, 0x400000, v4
 ; GFX10-NEXT:    v_min_f32_e32 v0, v0, v2
-; GFX10-NEXT:    v_and_or_b32 v6, v4, s4, 0x400000
-; GFX10-NEXT:    v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT:    v_add3_u32 v3, v3, v4, 0x7fff
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX10-NEXT:    v_bfe_u32 v8, v0, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v6, v3, 16, 1
+; GFX10-NEXT:    v_add3_u32 v5, v5, v4, 0x7fff
 ; GFX10-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX10-NEXT:    v_add3_u32 v4, v7, v5, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v9, v1, s4, 0x400000
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc_lo
-; GFX10-NEXT:    v_and_or_b32 v6, v5, s4, 0x400000
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT:    v_bfe_u32 v8, v0, 16, 1
+; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v1
+; GFX10-NEXT:    v_add3_u32 v6, v6, v3, 0x7fff
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v5, v7, vcc_lo
+; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v3
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX10-NEXT:    v_add3_u32 v7, v8, v0, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v8, v0, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v0
 ; GFX10-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v6, v5, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v7, v8, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT:    v_perm_b32 v0, v0, v4, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v0, v0, v3, 0x7060302
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v2, v9, vcc_lo
-; GFX10-NEXT:    v_perm_b32 v1, v1, v3, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_minnum_v4bf16:
@@ -18229,45 +17874,42 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    s_mov_b32 s0, 0xff800000
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_min_f32 v0, v0, v2 :: v_dual_and_b32 v3, 0xffff0000, v3
-; GFX11-NEXT:    v_min_f32_e32 v4, v5, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_min_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX11-NEXT:    v_bfe_u32 v8, v0, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_min_f32_e32 v1, v1, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v3, v4, 16, 1
-; GFX11-NEXT:    v_min_f32_e32 v5, v7, v6
-; GFX11-NEXT:    v_and_or_b32 v6, v4, s0, 0x400000
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-NEXT:    v_dual_min_f32 v3, v7, v6 :: v_dual_min_f32 v4, v5, v4
 ; GFX11-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX11-NEXT:    v_add3_u32 v3, v3, v4, 0x7fff
-; GFX11-NEXT:    v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v9, v1, s0, 0x400000
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_bfe_u32 v6, v3, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v5, v4, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v7, 0x400000, v4
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
 ; GFX11-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add3_u32 v4, v7, v5, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v6, v5, s0, 0x400000
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT:    v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v5, v5, v4, 0x7fff
+; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v4, v5, v7, vcc_lo
+; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v3
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX11-NEXT:    v_add3_u32 v7, v8, v0, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v8, v0, s0, 0x400000
-; GFX11-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc_lo
+; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v0
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, v6, v5, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v7, v8, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT:    v_perm_b32 v0, v0, v3, 0x7060302
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v2, v9, vcc_lo
-; GFX11-NEXT:    v_perm_b32 v0, v0, v4, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v1, v1, v3, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b)
   ret <4 x bfloat> %op
@@ -18423,17 +18065,15 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 0x7fff, v9
-; GFX8-NEXT:    v_and_b32_e32 v10, 0xff800000, v8
 ; GFX8-NEXT:    v_min_f32_e32 v3, v3, v7
-; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v10
+; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v8
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
 ; GFX8-NEXT:    v_bfe_u32 v7, v3, 16, 1
 ; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX8-NEXT:    v_cndmask_b32_e32 v8, v9, v10, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v3
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, s4, v7
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xff800000, v3
-; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v9
+; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v3
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v7, v9, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
@@ -18444,16 +18084,14 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s4, v9
-; GFX8-NEXT:    v_and_b32_e32 v10, 0xff800000, v7
 ; GFX8-NEXT:    v_min_f32_e32 v2, v2, v6
-; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v10
+; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v7
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
 ; GFX8-NEXT:    v_bfe_u32 v6, v2, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v7, v9, v10, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v2
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, s4, v6
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xff800000, v2
-; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v9
+; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v2
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v6, v9, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
@@ -18464,16 +18102,14 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s4, v9
-; GFX8-NEXT:    v_and_b32_e32 v10, 0xff800000, v6
 ; GFX8-NEXT:    v_min_f32_e32 v1, v1, v5
-; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v10
+; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v6
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX8-NEXT:    v_bfe_u32 v5, v1, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v1
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s4, v5
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xff800000, v1
-; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v9
+; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v1
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v9, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
@@ -18484,16 +18120,14 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s4, v9
-; GFX8-NEXT:    v_and_b32_e32 v10, 0xff800000, v5
 ; GFX8-NEXT:    v_min_f32_e32 v0, v0, v4
-; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v10
+; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v5
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX8-NEXT:    v_bfe_u32 v4, v0, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v0
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7fff, v4
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v9
+; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v9, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
@@ -18512,74 +18146,66 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
 ; GFX9-NEXT:    v_min_f32_e32 v8, v9, v8
-; GFX9-NEXT:    v_bfe_u32 v9, v8, 16, 1
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_and_b32_e32 v10, 0xff800000, v8
 ; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT:    v_bfe_u32 v9, v8, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_min_f32_e32 v3, v3, v7
 ; GFX9-NEXT:    v_add3_u32 v9, v9, v8, s4
-; GFX9-NEXT:    v_or_b32_e32 v10, 0x400000, v10
+; GFX9-NEXT:    v_or_b32_e32 v10, 0x400000, v8
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
-; GFX9-NEXT:    v_min_f32_e32 v3, v3, v7
-; GFX9-NEXT:    v_cndmask_b32_e32 v8, v9, v10, vcc
 ; GFX9-NEXT:    v_bfe_u32 v7, v3, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v9, 0xff800000, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v9, v10, vcc
 ; GFX9-NEXT:    v_add3_u32 v7, v7, v3, s4
-; GFX9-NEXT:    v_or_b32_e32 v9, 0x400000, v9
+; GFX9-NEXT:    v_or_b32_e32 v9, 0x400000, v3
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v9, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
 ; GFX9-NEXT:    v_min_f32_e32 v7, v9, v7
-; GFX9-NEXT:    v_bfe_u32 v9, v7, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v10, 0xff800000, v7
 ; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT:    v_bfe_u32 v9, v7, 16, 1
+; GFX9-NEXT:    v_min_f32_e32 v2, v2, v6
 ; GFX9-NEXT:    v_add3_u32 v9, v9, v7, s4
-; GFX9-NEXT:    v_or_b32_e32 v10, 0x400000, v10
+; GFX9-NEXT:    v_or_b32_e32 v10, 0x400000, v7
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
-; GFX9-NEXT:    v_min_f32_e32 v2, v2, v6
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, v9, v10, vcc
 ; GFX9-NEXT:    v_bfe_u32 v6, v2, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v9, 0xff800000, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v9, v10, vcc
 ; GFX9-NEXT:    v_add3_u32 v6, v6, v2, s4
-; GFX9-NEXT:    v_or_b32_e32 v9, 0x400000, v9
+; GFX9-NEXT:    v_or_b32_e32 v9, 0x400000, v2
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v6, v9, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
 ; GFX9-NEXT:    v_min_f32_e32 v6, v9, v6
-; GFX9-NEXT:    v_bfe_u32 v9, v6, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v10, 0xff800000, v6
 ; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_bfe_u32 v9, v6, 16, 1
+; GFX9-NEXT:    v_min_f32_e32 v1, v1, v5
 ; GFX9-NEXT:    v_add3_u32 v9, v9, v6, s4
-; GFX9-NEXT:    v_or_b32_e32 v10, 0x400000, v10
+; GFX9-NEXT:    v_or_b32_e32 v10, 0x400000, v6
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
-; GFX9-NEXT:    v_min_f32_e32 v1, v1, v5
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
 ; GFX9-NEXT:    v_bfe_u32 v5, v1, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v9, 0xff800000, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
 ; GFX9-NEXT:    v_add3_u32 v5, v5, v1, s4
-; GFX9-NEXT:    v_or_b32_e32 v9, 0x400000, v9
+; GFX9-NEXT:    v_or_b32_e32 v9, 0x400000, v1
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v9, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
 ; GFX9-NEXT:    v_min_f32_e32 v5, v9, v5
-; GFX9-NEXT:    v_bfe_u32 v9, v5, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v10, 0xff800000, v5
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_bfe_u32 v9, v5, 16, 1
+; GFX9-NEXT:    v_min_f32_e32 v0, v0, v4
 ; GFX9-NEXT:    v_add3_u32 v9, v9, v5, s4
-; GFX9-NEXT:    v_or_b32_e32 v10, 0x400000, v10
+; GFX9-NEXT:    v_or_b32_e32 v10, 0x400000, v5
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
-; GFX9-NEXT:    v_min_f32_e32 v0, v0, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
 ; GFX9-NEXT:    v_bfe_u32 v4, v0, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v9, 0xff800000, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
 ; GFX9-NEXT:    v_add3_u32 v4, v4, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v9, 0x400000, v9
+; GFX9-NEXT:    v_or_b32_e32 v9, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v9, vcc
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
@@ -18596,62 +18222,61 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
 ; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
 ; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX10-NEXT:    v_min_f32_e32 v8, v9, v8
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
-; GFX10-NEXT:    v_min_f32_e32 v3, v3, v7
 ; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
-; GFX10-NEXT:    v_bfe_u32 v10, v8, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v7, v8, s4, 0x400000
-; GFX10-NEXT:    v_min_f32_e32 v9, v11, v9
-; GFX10-NEXT:    v_bfe_u32 v11, v3, 16, 1
+; GFX10-NEXT:    v_min_f32_e32 v3, v3, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v0
+; GFX10-NEXT:    v_bfe_u32 v11, v8, 16, 1
+; GFX10-NEXT:    v_min_f32_e32 v7, v10, v9
+; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v8
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX10-NEXT:    v_add3_u32 v10, v10, v8, 0x7fff
 ; GFX10-NEXT:    v_min_f32_e32 v2, v2, v6
-; GFX10-NEXT:    v_bfe_u32 v8, v9, 16, 1
+; GFX10-NEXT:    v_add3_u32 v10, v11, v8, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v11, v3, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v12, v7, 16, 1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
-; GFX10-NEXT:    v_and_or_b32 v12, v9, s4, 0x400000
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v10, v7, vcc_lo
-; GFX10-NEXT:    v_add3_u32 v10, v11, v3, 0x7fff
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
 ; GFX10-NEXT:    v_bfe_u32 v13, v2, 16, 1
-; GFX10-NEXT:    v_add3_u32 v8, v8, v9, 0x7fff
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v10, v9, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
+; GFX10-NEXT:    v_add3_u32 v9, v11, v3, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v11, v12, v7, 0x7fff
+; GFX10-NEXT:    v_or_b32_e32 v12, 0x400000, v7
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX10-NEXT:    v_min_f32_e32 v6, v10, v6
+; GFX10-NEXT:    v_add3_u32 v10, v13, v2, 0x7fff
 ; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT:    v_min_f32_e32 v6, v11, v6
-; GFX10-NEXT:    v_add3_u32 v9, v13, v2, 0x7fff
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v8, v12, vcc_lo
-; GFX10-NEXT:    v_and_or_b32 v11, v2, s4, 0x400000
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v11, v12, vcc_lo
+; GFX10-NEXT:    v_or_b32_e32 v11, 0x400000, v2
 ; GFX10-NEXT:    v_bfe_u32 v12, v6, 16, 1
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
 ; GFX10-NEXT:    v_min_f32_e32 v1, v1, v5
 ; GFX10-NEXT:    v_min_f32_e32 v5, v15, v13
-; GFX10-NEXT:    v_and_or_b32 v14, v3, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v14, 0x400000, v3
 ; GFX10-NEXT:    v_min_f32_e32 v0, v0, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v9, v11, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v10, v11, vcc_lo
 ; GFX10-NEXT:    v_add3_u32 v4, v12, v6, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v9, v6, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v10, 0x400000, v6
 ; GFX10-NEXT:    v_bfe_u32 v11, v1, 16, 1
 ; GFX10-NEXT:    v_bfe_u32 v12, v5, 16, 1
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX10-NEXT:    v_bfe_u32 v13, v0, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v15, v1, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v15, 0x400000, v1
 ; GFX10-NEXT:    v_add3_u32 v6, v11, v1, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v11, v5, s4, 0x400000
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v9, vcc_lo
-; GFX10-NEXT:    v_add3_u32 v9, v12, v5, 0x7fff
+; GFX10-NEXT:    v_or_b32_e32 v11, 0x400000, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v10, vcc_lo
+; GFX10-NEXT:    v_add3_u32 v10, v12, v5, 0x7fff
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX10-NEXT:    v_add3_u32 v12, v13, v0, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v13, v0, s4, 0x400000
-; GFX10-NEXT:    v_perm_b32 v2, v2, v8, 0x7060302
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v9, v11, vcc_lo
+; GFX10-NEXT:    v_or_b32_e32 v13, 0x400000, v0
+; GFX10-NEXT:    v_perm_b32 v2, v2, v7, 0x7060302
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v10, v11, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v12, v13, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
@@ -18659,81 +18284,80 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v6, v15, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX10-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v10, v14, vcc_lo
-; GFX10-NEXT:    v_perm_b32 v3, v3, v7, 0x7060302
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v9, v14, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v3, v3, v8, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_minnum_v8bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v15, 16, v0
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX11-NEXT:    s_mov_b32 s0, 0xff800000
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_min_f32 v8, v9, v8 :: v_dual_lshlrev_b32 v9, 16, v6
-; GFX11-NEXT:    v_bfe_u32 v10, v8, 16, 1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v15, 16, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_min_f32 v8, v9, v8 :: v_dual_and_b32 v7, 0xffff0000, v7
+; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_min_f32_e32 v9, v11, v9
-; GFX11-NEXT:    v_add3_u32 v10, v10, v8, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_or_b32 v12, v9, s0, 0x400000
-; GFX11-NEXT:    v_min_f32_e32 v2, v2, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_bfe_u32 v13, v2, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v11, v8, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_min_f32_e32 v3, v3, v7
-; GFX11-NEXT:    v_and_or_b32 v7, v8, s0, 0x400000
-; GFX11-NEXT:    v_bfe_u32 v8, v9, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v7, v10, v7, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v8, v8, v9, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT:    v_add3_u32 v9, v13, v2, 0x7fff
-; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-NEXT:    v_min_f32_e32 v7, v10, v9
+; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v8
+; GFX11-NEXT:    v_add3_u32 v10, v11, v8, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_bfe_u32 v11, v3, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v8, v8, v12, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT:    v_and_or_b32 v14, v3, s0, 0x400000
-; GFX11-NEXT:    v_min_f32_e32 v0, v0, v4
-; GFX11-NEXT:    v_add3_u32 v10, v11, v3, 0x7fff
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_min_f32 v6, v11, v6 :: v_dual_and_b32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_and_or_b32 v11, v2, s0, 0x400000
+; GFX11-NEXT:    v_bfe_u32 v12, v7, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v14, 0x400000, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v8, v10, v9, vcc_lo
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-NEXT:    v_add3_u32 v9, v11, v3, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v11, v12, v7, 0x7fff
+; GFX11-NEXT:    v_or_b32_e32 v12, 0x400000, v7
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_cndmask_b32 v7, v11, v12 :: v_dual_min_f32 v2, v2, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-NEXT:    v_bfe_u32 v13, v2, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_min_f32_e32 v6, v10, v6
+; GFX11-NEXT:    v_or_b32_e32 v11, 0x400000, v2
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT:    v_add3_u32 v10, v13, v2, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_bfe_u32 v12, v6, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v9, v11, vcc_lo
-; GFX11-NEXT:    v_and_or_b32 v9, v6, s0, 0x400000
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v10, v11, vcc_lo
+; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v6
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT:    v_min_f32_e32 v1, v1, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_perm_b32 v2, v2, v7, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_min_f32_e32 v0, v0, v4
 ; GFX11-NEXT:    v_add3_u32 v4, v12, v6, 0x7fff
-; GFX11-NEXT:    v_perm_b32 v2, v2, v8, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v11, v1, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v4, v4, v9, vcc_lo
+; GFX11-NEXT:    v_dual_min_f32 v1, v1, v5 :: v_dual_cndmask_b32 v4, v4, v10
 ; GFX11-NEXT:    v_min_f32_e32 v5, v15, v13
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_bfe_u32 v11, v1, 16, 1
 ; GFX11-NEXT:    v_bfe_u32 v13, v0, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v15, v1, s0, 0x400000
-; GFX11-NEXT:    v_add3_u32 v6, v11, v1, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_or_b32_e32 v15, 0x400000, v1
 ; GFX11-NEXT:    v_bfe_u32 v12, v5, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v11, v5, s0, 0x400000
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_add3_u32 v6, v11, v1, 0x7fff
+; GFX11-NEXT:    v_or_b32_e32 v11, 0x400000, v5
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT:    v_add3_u32 v9, v12, v5, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v10, v12, v5, 0x7fff
 ; GFX11-NEXT:    v_add3_u32 v12, v13, v0, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v13, v0, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v13, 0x400000, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v5, v9, v11, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e32 v5, v10, v11, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v12, v13, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
@@ -18742,9 +18366,9 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v6, v15, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX11-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v10, v14, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, v9, v14, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v3, v3, v7, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v3, v3, v8, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call <8 x bfloat> @llvm.minnum.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b)
   ret <8 x bfloat> %op
@@ -19033,16 +18657,14 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
 ; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
 ; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xff800000, v16
 ; GFX8-NEXT:    v_min_f32_e32 v7, v7, v15
-; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v18
+; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v16
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v16, v16
 ; GFX8-NEXT:    v_bfe_u32 v15, v7, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v16, v17, v18, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v15, vcc, v15, v7
 ; GFX8-NEXT:    v_add_u32_e32 v15, vcc, s4, v15
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xff800000, v7
-; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v17
+; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v7
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
 ; GFX8-NEXT:    v_cndmask_b32_e32 v7, v15, v17, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
@@ -19053,16 +18675,14 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
 ; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xff800000, v15
 ; GFX8-NEXT:    v_min_f32_e32 v6, v6, v14
-; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v18
+; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v15
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v15, v15
 ; GFX8-NEXT:    v_bfe_u32 v14, v6, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v15, v17, v18, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v14, vcc, v14, v6
 ; GFX8-NEXT:    v_add_u32_e32 v14, vcc, s4, v14
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xff800000, v6
-; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v17
+; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v6
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX8-NEXT:    v_cndmask_b32_e32 v6, v14, v17, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v13
@@ -19073,16 +18693,14 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
 ; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xff800000, v14
 ; GFX8-NEXT:    v_min_f32_e32 v5, v5, v13
-; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v18
+; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v14
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v14, v14
 ; GFX8-NEXT:    v_bfe_u32 v13, v5, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v14, v17, v18, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v13, vcc, v13, v5
 ; GFX8-NEXT:    v_add_u32_e32 v13, vcc, s4, v13
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xff800000, v5
-; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v17
+; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v5
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v13, v17, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
@@ -19093,16 +18711,14 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
 ; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
 ; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xff800000, v13
 ; GFX8-NEXT:    v_min_f32_e32 v4, v4, v12
-; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v18
+; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v13
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
 ; GFX8-NEXT:    v_bfe_u32 v12, v4, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v13, v17, v18, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v12, vcc, v12, v4
 ; GFX8-NEXT:    v_add_u32_e32 v12, vcc, s4, v12
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xff800000, v4
-; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v17
+; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v4
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v12, v17, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
@@ -19113,16 +18729,14 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xff800000, v12
 ; GFX8-NEXT:    v_min_f32_e32 v3, v3, v11
-; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v18
+; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v12
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
 ; GFX8-NEXT:    v_bfe_u32 v11, v3, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v12, v17, v18, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v11, vcc, v11, v3
 ; GFX8-NEXT:    v_add_u32_e32 v11, vcc, s4, v11
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xff800000, v3
-; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v17
+; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v3
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v11, v17, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
@@ -19133,16 +18747,14 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xff800000, v11
 ; GFX8-NEXT:    v_min_f32_e32 v2, v2, v10
-; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v18
+; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v11
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
 ; GFX8-NEXT:    v_bfe_u32 v10, v2, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v11, v17, v18, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v10, v2
 ; GFX8-NEXT:    v_add_u32_e32 v10, vcc, s4, v10
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xff800000, v2
-; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v17
+; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v2
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v10, v17, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
@@ -19153,16 +18765,14 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xff800000, v10
 ; GFX8-NEXT:    v_min_f32_e32 v1, v1, v9
-; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v18
+; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v10
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
 ; GFX8-NEXT:    v_bfe_u32 v9, v1, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v10, v17, v18, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v1
 ; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s4, v9
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xff800000, v1
-; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v17
+; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v1
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v9, v17, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v8
@@ -19173,16 +18783,14 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xff800000, v9
 ; GFX8-NEXT:    v_min_f32_e32 v0, v0, v8
-; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v18
+; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v9
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
 ; GFX8-NEXT:    v_bfe_u32 v8, v0, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v9, v17, v18, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v0
 ; GFX8-NEXT:    v_add_u32_e32 v8, vcc, s4, v8
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v17
+; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v8, v17, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
@@ -19209,146 +18817,130 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
 ; GFX9-NEXT:    v_min_f32_e32 v16, v17, v16
-; GFX9-NEXT:    v_bfe_u32 v17, v16, 16, 1
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_and_b32_e32 v18, 0xff800000, v16
 ; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
 ; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX9-NEXT:    v_bfe_u32 v17, v16, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_min_f32_e32 v7, v7, v15
 ; GFX9-NEXT:    v_add3_u32 v17, v17, v16, s4
-; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v18
+; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v16
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v16, v16
-; GFX9-NEXT:    v_min_f32_e32 v7, v7, v15
-; GFX9-NEXT:    v_cndmask_b32_e32 v16, v17, v18, vcc
 ; GFX9-NEXT:    v_bfe_u32 v15, v7, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xff800000, v7
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v17, v18, vcc
 ; GFX9-NEXT:    v_add3_u32 v15, v15, v7, s4
-; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v17
+; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v7
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
 ; GFX9-NEXT:    v_cndmask_b32_e32 v7, v15, v17, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v6
 ; GFX9-NEXT:    v_min_f32_e32 v15, v17, v15
-; GFX9-NEXT:    v_bfe_u32 v17, v15, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v18, 0xff800000, v15
 ; GFX9-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX9-NEXT:    v_bfe_u32 v17, v15, 16, 1
+; GFX9-NEXT:    v_min_f32_e32 v6, v6, v14
 ; GFX9-NEXT:    v_add3_u32 v17, v17, v15, s4
-; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v18
+; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v15
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v15, v15
-; GFX9-NEXT:    v_min_f32_e32 v6, v6, v14
-; GFX9-NEXT:    v_cndmask_b32_e32 v15, v17, v18, vcc
 ; GFX9-NEXT:    v_bfe_u32 v14, v6, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xff800000, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v15, v17, v18, vcc
 ; GFX9-NEXT:    v_add3_u32 v14, v14, v6, s4
-; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v17
+; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v6
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v14, v17, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v14, 16, v13
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v5
 ; GFX9-NEXT:    v_min_f32_e32 v14, v17, v14
-; GFX9-NEXT:    v_bfe_u32 v17, v14, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v18, 0xff800000, v14
 ; GFX9-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
 ; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX9-NEXT:    v_bfe_u32 v17, v14, 16, 1
+; GFX9-NEXT:    v_min_f32_e32 v5, v5, v13
 ; GFX9-NEXT:    v_add3_u32 v17, v17, v14, s4
-; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v18
+; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v14
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v14, v14
-; GFX9-NEXT:    v_min_f32_e32 v5, v5, v13
-; GFX9-NEXT:    v_cndmask_b32_e32 v14, v17, v18, vcc
 ; GFX9-NEXT:    v_bfe_u32 v13, v5, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xff800000, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v14, v17, v18, vcc
 ; GFX9-NEXT:    v_add3_u32 v13, v13, v5, s4
-; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v17
+; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v5
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v13, v17, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v4
 ; GFX9-NEXT:    v_min_f32_e32 v13, v17, v13
-; GFX9-NEXT:    v_bfe_u32 v17, v13, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v18, 0xff800000, v13
 ; GFX9-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX9-NEXT:    v_bfe_u32 v17, v13, 16, 1
+; GFX9-NEXT:    v_min_f32_e32 v4, v4, v12
 ; GFX9-NEXT:    v_add3_u32 v17, v17, v13, s4
-; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v18
+; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v13
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
-; GFX9-NEXT:    v_min_f32_e32 v4, v4, v12
-; GFX9-NEXT:    v_cndmask_b32_e32 v13, v17, v18, vcc
 ; GFX9-NEXT:    v_bfe_u32 v12, v4, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xff800000, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v13, v17, v18, vcc
 ; GFX9-NEXT:    v_add3_u32 v12, v12, v4, s4
-; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v17
+; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v4
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v12, v17, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v3
 ; GFX9-NEXT:    v_min_f32_e32 v12, v17, v12
-; GFX9-NEXT:    v_bfe_u32 v17, v12, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v18, 0xff800000, v12
 ; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT:    v_bfe_u32 v17, v12, 16, 1
+; GFX9-NEXT:    v_min_f32_e32 v3, v3, v11
 ; GFX9-NEXT:    v_add3_u32 v17, v17, v12, s4
-; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v18
+; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v12
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
-; GFX9-NEXT:    v_min_f32_e32 v3, v3, v11
-; GFX9-NEXT:    v_cndmask_b32_e32 v12, v17, v18, vcc
 ; GFX9-NEXT:    v_bfe_u32 v11, v3, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xff800000, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v12, v17, v18, vcc
 ; GFX9-NEXT:    v_add3_u32 v11, v11, v3, s4
-; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v17
+; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v3
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v11, v17, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v2
 ; GFX9-NEXT:    v_min_f32_e32 v11, v17, v11
-; GFX9-NEXT:    v_bfe_u32 v17, v11, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v18, 0xff800000, v11
 ; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT:    v_bfe_u32 v17, v11, 16, 1
+; GFX9-NEXT:    v_min_f32_e32 v2, v2, v10
 ; GFX9-NEXT:    v_add3_u32 v17, v17, v11, s4
-; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v18
+; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v11
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
-; GFX9-NEXT:    v_min_f32_e32 v2, v2, v10
-; GFX9-NEXT:    v_cndmask_b32_e32 v11, v17, v18, vcc
 ; GFX9-NEXT:    v_bfe_u32 v10, v2, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xff800000, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v11, v17, v18, vcc
 ; GFX9-NEXT:    v_add3_u32 v10, v10, v2, s4
-; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v17
+; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v2
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v10, v17, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v1
 ; GFX9-NEXT:    v_min_f32_e32 v10, v17, v10
-; GFX9-NEXT:    v_bfe_u32 v17, v10, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v18, 0xff800000, v10
 ; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_bfe_u32 v17, v10, 16, 1
+; GFX9-NEXT:    v_min_f32_e32 v1, v1, v9
 ; GFX9-NEXT:    v_add3_u32 v17, v17, v10, s4
-; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v18
+; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v10
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
-; GFX9-NEXT:    v_min_f32_e32 v1, v1, v9
-; GFX9-NEXT:    v_cndmask_b32_e32 v10, v17, v18, vcc
 ; GFX9-NEXT:    v_bfe_u32 v9, v1, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xff800000, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v10, v17, v18, vcc
 ; GFX9-NEXT:    v_add3_u32 v9, v9, v1, s4
-; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v17
+; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v1
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v9, v17, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v8
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
 ; GFX9-NEXT:    v_min_f32_e32 v9, v17, v9
-; GFX9-NEXT:    v_bfe_u32 v17, v9, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v18, 0xff800000, v9
 ; GFX9-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_bfe_u32 v17, v9, 16, 1
+; GFX9-NEXT:    v_min_f32_e32 v0, v0, v8
 ; GFX9-NEXT:    v_add3_u32 v17, v17, v9, s4
-; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v18
+; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v9
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
-; GFX9-NEXT:    v_min_f32_e32 v0, v0, v8
-; GFX9-NEXT:    v_cndmask_b32_e32 v9, v17, v18, vcc
 ; GFX9-NEXT:    v_bfe_u32 v8, v0, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xff800000, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v9, v17, v18, vcc
 ; GFX9-NEXT:    v_add3_u32 v8, v8, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v17
+; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v8, v17, vcc
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
@@ -19369,27 +18961,26 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
 ; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
 ; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v6
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
 ; GFX10-NEXT:    v_min_f32_e32 v16, v17, v16
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v14
 ; GFX10-NEXT:    v_min_f32_e32 v7, v7, v15
 ; GFX10-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
-; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
 ; GFX10-NEXT:    v_bfe_u32 v15, v16, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v20, v16, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v20, 0x400000, v16
 ; GFX10-NEXT:    v_bfe_u32 v19, v7, 16, 1
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
 ; GFX10-NEXT:    v_min_f32_e32 v17, v18, v17
 ; GFX10-NEXT:    v_add3_u32 v15, v15, v16, 0x7fff
 ; GFX10-NEXT:    v_min_f32_e32 v6, v6, v14
 ; GFX10-NEXT:    v_add3_u32 v18, v19, v7, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v19, v7, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v19, 0x400000, v7
 ; GFX10-NEXT:    v_bfe_u32 v21, v17, 16, 1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v15, v15, v20, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v5
-; GFX10-NEXT:    v_and_or_b32 v16, v17, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v16, 0x400000, v17
 ; GFX10-NEXT:    v_add3_u32 v14, v21, v17, 0x7fff
 ; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; GFX10-NEXT:    v_cndmask_b32_e32 v7, v18, v19, vcc_lo
@@ -19403,7 +18994,7 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX10-NEXT:    v_min_f32_e32 v5, v5, v13
 ; GFX10-NEXT:    v_cndmask_b32_e32 v14, v14, v16, vcc_lo
 ; GFX10-NEXT:    v_add3_u32 v16, v18, v6, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v13, v6, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v13, 0x400000, v6
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v12
 ; GFX10-NEXT:    v_bfe_u32 v20, v17, 16, 1
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
@@ -19413,10 +19004,10 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX10-NEXT:    v_cndmask_b32_e32 v6, v16, v13, vcc_lo
 ; GFX10-NEXT:    v_min_f32_e32 v13, v19, v18
 ; GFX10-NEXT:    v_add3_u32 v16, v20, v17, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v18, v17, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v18, 0x400000, v17
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
 ; GFX10-NEXT:    v_add3_u32 v19, v21, v5, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v20, v5, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v20, 0x400000, v5
 ; GFX10-NEXT:    v_bfe_u32 v21, v13, 16, 1
 ; GFX10-NEXT:    v_min_f32_e32 v4, v4, v12
 ; GFX10-NEXT:    v_cndmask_b32_e32 v16, v16, v18, vcc_lo
@@ -19426,14 +19017,14 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX10-NEXT:    v_add3_u32 v17, v21, v13, 0x7fff
 ; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
 ; GFX10-NEXT:    v_cndmask_b32_e32 v5, v19, v20, vcc_lo
-; GFX10-NEXT:    v_and_or_b32 v19, v13, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v19, 0x400000, v13
 ; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX10-NEXT:    v_min_f32_e32 v12, v18, v12
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
 ; GFX10-NEXT:    v_bfe_u32 v20, v4, 16, 1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v10
 ; GFX10-NEXT:    v_min_f32_e32 v3, v3, v11
-; GFX10-NEXT:    v_and_or_b32 v22, v12, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v22, 0x400000, v12
 ; GFX10-NEXT:    v_cndmask_b32_e32 v13, v17, v19, vcc_lo
 ; GFX10-NEXT:    v_bfe_u32 v17, v12, 16, 1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v2
@@ -19445,12 +19036,12 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
 ; GFX10-NEXT:    v_add3_u32 v19, v20, v3, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v20, v3, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v20, 0x400000, v3
 ; GFX10-NEXT:    v_bfe_u32 v23, v18, 16, 1
 ; GFX10-NEXT:    v_min_f32_e32 v2, v2, v10
 ; GFX10-NEXT:    v_cndmask_b32_e32 v12, v17, v22, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX10-NEXT:    v_and_or_b32 v17, v18, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v17, 0x400000, v18
 ; GFX10-NEXT:    v_add3_u32 v10, v23, v18, 0x7fff
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v1
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
@@ -19459,8 +19050,8 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v9
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
 ; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX10-NEXT:    v_and_or_b32 v18, v2, s4, 0x400000
-; GFX10-NEXT:    v_and_or_b32 v21, v4, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v18, 0x400000, v2
+; GFX10-NEXT:    v_or_b32_e32 v21, 0x400000, v4
 ; GFX10-NEXT:    v_perm_b32 v3, v3, v12, 0x7060302
 ; GFX10-NEXT:    v_cndmask_b32_e32 v10, v10, v17, vcc_lo
 ; GFX10-NEXT:    v_add3_u32 v17, v19, v2, 0x7fff
@@ -19472,17 +19063,17 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX10-NEXT:    v_bfe_u32 v23, v19, 16, 1
 ; GFX10-NEXT:    v_min_f32_e32 v1, v1, v9
 ; GFX10-NEXT:    v_min_f32_e32 v9, v22, v20
-; GFX10-NEXT:    v_and_or_b32 v22, v19, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v22, 0x400000, v19
 ; GFX10-NEXT:    v_min_f32_e32 v0, v0, v8
 ; GFX10-NEXT:    v_add3_u32 v20, v23, v19, 0x7fff
 ; GFX10-NEXT:    v_bfe_u32 v8, v1, 16, 1
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
 ; GFX10-NEXT:    v_bfe_u32 v23, v9, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v24, v9, s4, 0x400000
-; GFX10-NEXT:    v_and_or_b32 v25, v0, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v24, 0x400000, v9
+; GFX10-NEXT:    v_or_b32_e32 v25, 0x400000, v0
 ; GFX10-NEXT:    v_add3_u32 v8, v8, v1, 0x7fff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v19, v20, v22, vcc_lo
-; GFX10-NEXT:    v_and_or_b32 v22, v1, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v22, 0x400000, v1
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
 ; GFX10-NEXT:    v_bfe_u32 v20, v0, 16, 1
 ; GFX10-NEXT:    v_add3_u32 v23, v23, v9, 0x7fff
@@ -19511,12 +19102,11 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
 ; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX11-NEXT:    s_mov_b32 s0, 0xff800000
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_min_f32 v16, v17, v16 :: v_dual_lshlrev_b32 v17, 16, v14
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_min_f32 v16, v17, v16 :: v_dual_and_b32 v15, 0xffff0000, v15
+; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v14
 ; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
-; GFX11-NEXT:    v_and_or_b32 v20, v16, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v16
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_min_f32_e32 v17, v18, v17
 ; GFX11-NEXT:    v_min_f32_e32 v6, v6, v14
@@ -19529,13 +19119,13 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX11-NEXT:    v_min_f32_e32 v7, v7, v15
 ; GFX11-NEXT:    v_bfe_u32 v15, v16, 16, 1
 ; GFX11-NEXT:    v_add3_u32 v15, v15, v16, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v16, v17, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v16, 0x400000, v17
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_dual_cndmask_b32 v15, v15, v20 :: v_dual_lshlrev_b32 v20, 16, v5
 ; GFX11-NEXT:    v_bfe_u32 v19, v7, 16, 1
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
 ; GFX11-NEXT:    v_add3_u32 v18, v19, v7, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v19, v7, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v19, 0x400000, v7
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v7, v18, v19, vcc_lo
 ; GFX11-NEXT:    v_bfe_u32 v18, v6, 16, 1
@@ -19557,32 +19147,32 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
 ; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; GFX11-NEXT:    v_min_f32_e32 v5, v5, v13
-; GFX11-NEXT:    v_and_or_b32 v13, v6, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v13, 0x400000, v6
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_dual_cndmask_b32 v6, v16, v13 :: v_dual_min_f32 v13, v19, v18
 ; GFX11-NEXT:    v_add3_u32 v16, v20, v17, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v18, v17, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v18, 0x400000, v17
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
 ; GFX11-NEXT:    v_perm_b32 v6, v6, v14, 0x7060302
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v16, v16, v18, vcc_lo
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v3
 ; GFX11-NEXT:    v_bfe_u32 v21, v5, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v20, v5, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v5
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX11-NEXT:    v_min_f32_e32 v12, v18, v12
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_add3_u32 v19, v21, v5, 0x7fff
 ; GFX11-NEXT:    v_bfe_u32 v21, v13, 16, 1
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v10
-; GFX11-NEXT:    v_and_or_b32 v22, v12, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v22, 0x400000, v12
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v5, v19, v20, vcc_lo
 ; GFX11-NEXT:    v_add3_u32 v17, v21, v13, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v19, v13, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v19, 0x400000, v13
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
 ; GFX11-NEXT:    v_bfe_u32 v20, v4, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v21, v4, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v21, 0x400000, v4
 ; GFX11-NEXT:    v_perm_b32 v5, v5, v16, 0x7060302
 ; GFX11-NEXT:    v_cndmask_b32_e32 v13, v17, v19, vcc_lo
 ; GFX11-NEXT:    v_bfe_u32 v17, v12, 16, 1
@@ -19598,7 +19188,7 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
 ; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX11-NEXT:    v_bfe_u32 v23, v18, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v17, v18, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v17, 0x400000, v18
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_dual_min_f32 v2, v2, v10 :: v_dual_and_b32 v1, 0xffff0000, v1
 ; GFX11-NEXT:    v_min_f32_e32 v3, v3, v11
@@ -19608,13 +19198,13 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX11-NEXT:    v_bfe_u32 v20, v3, 16, 1
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX11-NEXT:    v_add3_u32 v19, v20, v3, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v20, v3, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v3, v19, v20, vcc_lo
 ; GFX11-NEXT:    v_bfe_u32 v19, v2, 16, 1
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v9
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT:    v_and_or_b32 v18, v2, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v18, 0x400000, v2
 ; GFX11-NEXT:    v_perm_b32 v3, v3, v12, 0x7060302
 ; GFX11-NEXT:    v_cndmask_b32_e32 v10, v10, v17, vcc_lo
 ; GFX11-NEXT:    v_add3_u32 v17, v19, v2, 0x7fff
@@ -19631,13 +19221,13 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX11-NEXT:    v_min_f32_e32 v9, v22, v20
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_add3_u32 v20, v23, v19, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v22, v19, s0, 0x400000
-; GFX11-NEXT:    v_and_or_b32 v25, v0, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v22, 0x400000, v19
+; GFX11-NEXT:    v_or_b32_e32 v25, 0x400000, v0
 ; GFX11-NEXT:    v_bfe_u32 v8, v1, 16, 1
 ; GFX11-NEXT:    v_bfe_u32 v23, v9, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v24, v9, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v24, 0x400000, v9
 ; GFX11-NEXT:    v_cndmask_b32_e32 v19, v20, v22, vcc_lo
-; GFX11-NEXT:    v_and_or_b32 v22, v1, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v22, 0x400000, v1
 ; GFX11-NEXT:    v_add3_u32 v8, v8, v1, 0x7fff
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
 ; GFX11-NEXT:    v_bfe_u32 v20, v0, 16, 1
@@ -20332,16 +19922,14 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
 ; GFX8-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; GFX8-NEXT:    v_add_u32_e32 v32, vcc, s4, v32
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xff800000, v31
 ; GFX8-NEXT:    v_min_f32_e32 v14, v14, v30
-; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v31
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v31, v31
 ; GFX8-NEXT:    v_bfe_u32 v30, v14, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v31, v32, v33, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v30, vcc, v30, v14
 ; GFX8-NEXT:    v_add_u32_e32 v30, vcc, s4, v30
-; GFX8-NEXT:    v_and_b32_e32 v32, 0xff800000, v14
-; GFX8-NEXT:    v_or_b32_e32 v32, 0x400000, v32
+; GFX8-NEXT:    v_or_b32_e32 v32, 0x400000, v14
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v14, v14
 ; GFX8-NEXT:    v_cndmask_b32_e32 v14, v30, v32, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v30, 16, v29
@@ -20363,29 +19951,25 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-NEXT:    v_min_f32_e32 v30, v15, v30
 ; GFX8-NEXT:    v_bfe_u32 v15, v33, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v15, vcc, v15, v33
-; GFX8-NEXT:    v_and_b32_e32 v34, 0xff800000, v33
 ; GFX8-NEXT:    v_add_u32_e32 v15, vcc, s4, v15
-; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v33
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v33, v33
 ; GFX8-NEXT:    v_bfe_u32 v33, v30, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v15, v15, v34, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v33, vcc, v33, v30
-; GFX8-NEXT:    v_and_b32_e32 v34, 0xff800000, v30
 ; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v30
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v30, v30
 ; GFX8-NEXT:    v_cndmask_b32_e32 v30, v33, v34, vcc
 ; GFX8-NEXT:    v_bfe_u32 v33, v32, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v33, vcc, v33, v32
 ; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT:    v_and_b32_e32 v34, 0xff800000, v32
-; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v32
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
 ; GFX8-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v29, vcc, v29, v13
 ; GFX8-NEXT:    v_add_u32_e32 v29, vcc, s4, v29
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xff800000, v13
-; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v13
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
 ; GFX8-NEXT:    v_cndmask_b32_e32 v13, v29, v33, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v29, 16, v28
@@ -20396,16 +19980,14 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
 ; GFX8-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
 ; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT:    v_and_b32_e32 v34, 0xff800000, v29
 ; GFX8-NEXT:    v_min_f32_e32 v12, v12, v28
-; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v29
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v29, v29
 ; GFX8-NEXT:    v_bfe_u32 v28, v12, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v29, v33, v34, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v28, vcc, v28, v12
 ; GFX8-NEXT:    v_add_u32_e32 v28, vcc, s4, v28
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xff800000, v12
-; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v12
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
 ; GFX8-NEXT:    v_cndmask_b32_e32 v12, v28, v33, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
@@ -20416,16 +19998,14 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
 ; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
 ; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT:    v_and_b32_e32 v34, 0xff800000, v28
 ; GFX8-NEXT:    v_min_f32_e32 v11, v11, v27
-; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v28
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v28, v28
 ; GFX8-NEXT:    v_bfe_u32 v27, v11, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v28, v33, v34, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v27, vcc, v27, v11
 ; GFX8-NEXT:    v_add_u32_e32 v27, vcc, s4, v27
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xff800000, v11
-; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v11
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
 ; GFX8-NEXT:    v_cndmask_b32_e32 v11, v27, v33, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v27, 16, v26
@@ -20436,16 +20016,14 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
 ; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
 ; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT:    v_and_b32_e32 v34, 0xff800000, v27
 ; GFX8-NEXT:    v_min_f32_e32 v10, v10, v26
-; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v27
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v27, v27
 ; GFX8-NEXT:    v_bfe_u32 v26, v10, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v27, v33, v34, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v26, vcc, v26, v10
 ; GFX8-NEXT:    v_add_u32_e32 v26, vcc, s4, v26
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xff800000, v10
-; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v10
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
 ; GFX8-NEXT:    v_cndmask_b32_e32 v10, v26, v33, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v26, 16, v25
@@ -20456,16 +20034,14 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
 ; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
 ; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT:    v_and_b32_e32 v34, 0xff800000, v26
 ; GFX8-NEXT:    v_min_f32_e32 v9, v9, v25
-; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v26
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v26, v26
 ; GFX8-NEXT:    v_bfe_u32 v25, v9, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v26, v33, v34, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v25, vcc, v25, v9
 ; GFX8-NEXT:    v_add_u32_e32 v25, vcc, s4, v25
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xff800000, v9
-; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v9
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
 ; GFX8-NEXT:    v_cndmask_b32_e32 v9, v25, v33, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
@@ -20476,16 +20052,14 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
 ; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
 ; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT:    v_and_b32_e32 v34, 0xff800000, v25
 ; GFX8-NEXT:    v_min_f32_e32 v8, v8, v24
-; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v25
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v25, v25
 ; GFX8-NEXT:    v_bfe_u32 v24, v8, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v25, v33, v34, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v24, vcc, v24, v8
 ; GFX8-NEXT:    v_add_u32_e32 v24, vcc, s4, v24
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xff800000, v8
-; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v8
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
 ; GFX8-NEXT:    v_cndmask_b32_e32 v8, v24, v33, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
@@ -20496,16 +20070,14 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
 ; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
 ; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT:    v_and_b32_e32 v34, 0xff800000, v24
 ; GFX8-NEXT:    v_min_f32_e32 v7, v7, v23
-; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v24
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v24, v24
 ; GFX8-NEXT:    v_bfe_u32 v23, v7, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v24, v33, v34, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v23, vcc, v23, v7
 ; GFX8-NEXT:    v_add_u32_e32 v23, vcc, s4, v23
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xff800000, v7
-; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v7
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
 ; GFX8-NEXT:    v_cndmask_b32_e32 v7, v23, v33, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
@@ -20516,16 +20088,14 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
 ; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
 ; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT:    v_and_b32_e32 v34, 0xff800000, v23
 ; GFX8-NEXT:    v_min_f32_e32 v6, v6, v22
-; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v23
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v23, v23
 ; GFX8-NEXT:    v_bfe_u32 v22, v6, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v23, v33, v34, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v22, vcc, v22, v6
 ; GFX8-NEXT:    v_add_u32_e32 v22, vcc, s4, v22
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xff800000, v6
-; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v6
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX8-NEXT:    v_cndmask_b32_e32 v6, v22, v33, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
@@ -20536,16 +20106,14 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
 ; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT:    v_and_b32_e32 v34, 0xff800000, v22
 ; GFX8-NEXT:    v_min_f32_e32 v5, v5, v21
-; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v22
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v22, v22
 ; GFX8-NEXT:    v_bfe_u32 v21, v5, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v22, v33, v34, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v21, vcc, v21, v5
 ; GFX8-NEXT:    v_add_u32_e32 v21, vcc, s4, v21
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xff800000, v5
-; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v5
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v21, v33, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
@@ -20556,16 +20124,14 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
 ; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
 ; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT:    v_and_b32_e32 v34, 0xff800000, v21
 ; GFX8-NEXT:    v_min_f32_e32 v4, v4, v20
-; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v21
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v21, v21
 ; GFX8-NEXT:    v_bfe_u32 v20, v4, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v21, v33, v34, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v20, vcc, v20, v4
 ; GFX8-NEXT:    v_add_u32_e32 v20, vcc, s4, v20
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xff800000, v4
-; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v4
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v20, v33, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
@@ -20576,16 +20142,14 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT:    v_and_b32_e32 v34, 0xff800000, v20
 ; GFX8-NEXT:    v_min_f32_e32 v3, v3, v19
-; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v20
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v20, v20
 ; GFX8-NEXT:    v_bfe_u32 v19, v3, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v20, v33, v34, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v19, vcc, v19, v3
 ; GFX8-NEXT:    v_add_u32_e32 v19, vcc, s4, v19
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xff800000, v3
-; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v3
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v19, v33, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
@@ -20596,16 +20160,14 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT:    v_and_b32_e32 v34, 0xff800000, v19
 ; GFX8-NEXT:    v_min_f32_e32 v2, v2, v18
-; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v19
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v19, v19
 ; GFX8-NEXT:    v_bfe_u32 v18, v2, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v19, v33, v34, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v18, vcc, v18, v2
 ; GFX8-NEXT:    v_add_u32_e32 v18, vcc, s4, v18
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xff800000, v2
-; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v2
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v18, v33, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
@@ -20616,16 +20178,14 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT:    v_and_b32_e32 v34, 0xff800000, v18
 ; GFX8-NEXT:    v_min_f32_e32 v1, v1, v17
-; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v18
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v18, v18
 ; GFX8-NEXT:    v_bfe_u32 v17, v1, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v18, v33, v34, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v1
 ; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xff800000, v1
-; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v1
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v17, v33, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v16
@@ -20636,16 +20196,14 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT:    v_and_b32_e32 v34, 0xff800000, v17
 ; GFX8-NEXT:    v_min_f32_e32 v0, v0, v16
-; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v17
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
 ; GFX8-NEXT:    v_bfe_u32 v16, v0, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v17, v33, v34, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v16, vcc, v16, v0
 ; GFX8-NEXT:    v_add_u32_e32 v16, vcc, s4, v16
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v16, v33, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -20686,292 +20244,260 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v31, 16, v30
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v14
 ; GFX9-NEXT:    v_min_f32_e32 v31, v32, v31
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_bfe_u32 v32, v31, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xff800000, v31
 ; GFX9-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
 ; GFX9-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_bfe_u32 v32, v31, 16, 1
+; GFX9-NEXT:    v_min_f32_e32 v14, v14, v30
 ; GFX9-NEXT:    v_add3_u32 v32, v32, v31, s4
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v31
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v31, v31
-; GFX9-NEXT:    v_min_f32_e32 v14, v14, v30
-; GFX9-NEXT:    v_cndmask_b32_e32 v31, v32, v33, vcc
 ; GFX9-NEXT:    v_bfe_u32 v30, v14, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v32, 0xff800000, v14
+; GFX9-NEXT:    v_cndmask_b32_e32 v31, v32, v33, vcc
 ; GFX9-NEXT:    v_add3_u32 v30, v30, v14, s4
-; GFX9-NEXT:    v_or_b32_e32 v32, 0x400000, v32
+; GFX9-NEXT:    v_or_b32_e32 v32, 0x400000, v14
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v14, v14
 ; GFX9-NEXT:    v_cndmask_b32_e32 v14, v30, v32, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v30, 16, v29
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v13
+; GFX9-NEXT:    v_min_f32_e32 v30, v32, v30
 ; GFX9-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
 ; GFX9-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; GFX9-NEXT:    v_min_f32_e32 v13, v13, v29
-; GFX9-NEXT:    buffer_load_dword v29, off, s[0:3], s32
-; GFX9-NEXT:    v_min_f32_e32 v30, v32, v30
 ; GFX9-NEXT:    v_bfe_u32 v32, v30, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xff800000, v30
+; GFX9-NEXT:    v_min_f32_e32 v13, v13, v29
 ; GFX9-NEXT:    v_add3_u32 v32, v32, v30, s4
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v30
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v30, v30
+; GFX9-NEXT:    v_bfe_u32 v29, v13, 16, 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v30, v32, v33, vcc
-; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v15
+; GFX9-NEXT:    v_add3_u32 v29, v29, v13, s4
+; GFX9-NEXT:    v_or_b32_e32 v32, 0x400000, v13
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
+; GFX9-NEXT:    v_cndmask_b32_e32 v13, v29, v32, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v29, 16, v28
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v12
+; GFX9-NEXT:    v_min_f32_e32 v32, v32, v29
+; GFX9-NEXT:    buffer_load_dword v29, off, s[0:3], s32
+; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v15
 ; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX9-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
+; GFX9-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX9-NEXT:    v_min_f32_e32 v12, v12, v28
+; GFX9-NEXT:    v_bfe_u32 v28, v12, 16, 1
+; GFX9-NEXT:    v_add3_u32 v28, v28, v12, s4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v29
-; GFX9-NEXT:    v_min_f32_e32 v32, v32, v33
+; GFX9-NEXT:    v_lshlrev_b32_e32 v34, 16, v29
+; GFX9-NEXT:    v_min_f32_e32 v33, v33, v34
 ; GFX9-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
 ; GFX9-NEXT:    v_min_f32_e32 v29, v15, v29
-; GFX9-NEXT:    v_bfe_u32 v15, v32, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xff800000, v32
-; GFX9-NEXT:    v_add3_u32 v15, v15, v32, s4
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v33
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
-; GFX9-NEXT:    v_cndmask_b32_e32 v15, v15, v33, vcc
-; GFX9-NEXT:    v_bfe_u32 v32, v29, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xff800000, v29
-; GFX9-NEXT:    v_add3_u32 v32, v32, v29, s4
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT:    v_bfe_u32 v15, v33, 16, 1
+; GFX9-NEXT:    v_add3_u32 v15, v15, v33, s4
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v33
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v33, v33
+; GFX9-NEXT:    v_bfe_u32 v33, v29, 16, 1
+; GFX9-NEXT:    v_cndmask_b32_e32 v15, v15, v34, vcc
+; GFX9-NEXT:    v_add3_u32 v33, v33, v29, s4
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v29
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v29, v29
-; GFX9-NEXT:    v_cndmask_b32_e32 v29, v32, v33, vcc
-; GFX9-NEXT:    v_bfe_u32 v32, v13, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xff800000, v13
-; GFX9-NEXT:    v_add3_u32 v32, v32, v13, s4
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v33
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
-; GFX9-NEXT:    v_cndmask_b32_e32 v13, v32, v33, vcc
-; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v28
-; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v12
-; GFX9-NEXT:    v_min_f32_e32 v32, v33, v32
+; GFX9-NEXT:    v_cndmask_b32_e32 v29, v33, v34, vcc
 ; GFX9-NEXT:    v_bfe_u32 v33, v32, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v34, 0xff800000, v32
-; GFX9-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
-; GFX9-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
 ; GFX9-NEXT:    v_add3_u32 v33, v33, v32, s4
-; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v32
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
-; GFX9-NEXT:    v_min_f32_e32 v12, v12, v28
 ; GFX9-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
-; GFX9-NEXT:    v_bfe_u32 v28, v12, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xff800000, v12
-; GFX9-NEXT:    v_add3_u32 v28, v28, v12, s4
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v12
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
 ; GFX9-NEXT:    v_cndmask_b32_e32 v12, v28, v33, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v11
 ; GFX9-NEXT:    v_min_f32_e32 v28, v33, v28
-; GFX9-NEXT:    v_bfe_u32 v33, v28, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v34, 0xff800000, v28
 ; GFX9-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
 ; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX9-NEXT:    v_bfe_u32 v33, v28, 16, 1
+; GFX9-NEXT:    v_min_f32_e32 v11, v11, v27
 ; GFX9-NEXT:    v_add3_u32 v33, v33, v28, s4
-; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v28
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v28, v28
-; GFX9-NEXT:    v_min_f32_e32 v11, v11, v27
-; GFX9-NEXT:    v_cndmask_b32_e32 v28, v33, v34, vcc
 ; GFX9-NEXT:    v_bfe_u32 v27, v11, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xff800000, v11
+; GFX9-NEXT:    v_cndmask_b32_e32 v28, v33, v34, vcc
 ; GFX9-NEXT:    v_add3_u32 v27, v27, v11, s4
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v11
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
 ; GFX9-NEXT:    v_cndmask_b32_e32 v11, v27, v33, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v27, 16, v26
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v10
 ; GFX9-NEXT:    v_min_f32_e32 v27, v33, v27
-; GFX9-NEXT:    v_bfe_u32 v33, v27, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v34, 0xff800000, v27
 ; GFX9-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
 ; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX9-NEXT:    v_bfe_u32 v33, v27, 16, 1
+; GFX9-NEXT:    v_min_f32_e32 v10, v10, v26
 ; GFX9-NEXT:    v_add3_u32 v33, v33, v27, s4
-; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v27
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v27, v27
-; GFX9-NEXT:    v_min_f32_e32 v10, v10, v26
-; GFX9-NEXT:    v_cndmask_b32_e32 v27, v33, v34, vcc
 ; GFX9-NEXT:    v_bfe_u32 v26, v10, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xff800000, v10
+; GFX9-NEXT:    v_cndmask_b32_e32 v27, v33, v34, vcc
 ; GFX9-NEXT:    v_add3_u32 v26, v26, v10, s4
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v10
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
 ; GFX9-NEXT:    v_cndmask_b32_e32 v10, v26, v33, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v26, 16, v25
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v9
 ; GFX9-NEXT:    v_min_f32_e32 v26, v33, v26
-; GFX9-NEXT:    v_bfe_u32 v33, v26, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v34, 0xff800000, v26
 ; GFX9-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
 ; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX9-NEXT:    v_bfe_u32 v33, v26, 16, 1
+; GFX9-NEXT:    v_min_f32_e32 v9, v9, v25
 ; GFX9-NEXT:    v_add3_u32 v33, v33, v26, s4
-; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v26
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v26, v26
-; GFX9-NEXT:    v_min_f32_e32 v9, v9, v25
-; GFX9-NEXT:    v_cndmask_b32_e32 v26, v33, v34, vcc
 ; GFX9-NEXT:    v_bfe_u32 v25, v9, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xff800000, v9
+; GFX9-NEXT:    v_cndmask_b32_e32 v26, v33, v34, vcc
 ; GFX9-NEXT:    v_add3_u32 v25, v25, v9, s4
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v9
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
 ; GFX9-NEXT:    v_cndmask_b32_e32 v9, v25, v33, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v8
 ; GFX9-NEXT:    v_min_f32_e32 v25, v33, v25
-; GFX9-NEXT:    v_bfe_u32 v33, v25, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v34, 0xff800000, v25
 ; GFX9-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
 ; GFX9-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX9-NEXT:    v_bfe_u32 v33, v25, 16, 1
+; GFX9-NEXT:    v_min_f32_e32 v8, v8, v24
 ; GFX9-NEXT:    v_add3_u32 v33, v33, v25, s4
-; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v25
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v25, v25
-; GFX9-NEXT:    v_min_f32_e32 v8, v8, v24
-; GFX9-NEXT:    v_cndmask_b32_e32 v25, v33, v34, vcc
 ; GFX9-NEXT:    v_bfe_u32 v24, v8, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xff800000, v8
+; GFX9-NEXT:    v_cndmask_b32_e32 v25, v33, v34, vcc
 ; GFX9-NEXT:    v_add3_u32 v24, v24, v8, s4
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v8
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
 ; GFX9-NEXT:    v_cndmask_b32_e32 v8, v24, v33, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v7
 ; GFX9-NEXT:    v_min_f32_e32 v24, v33, v24
-; GFX9-NEXT:    v_bfe_u32 v33, v24, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v34, 0xff800000, v24
 ; GFX9-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
 ; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX9-NEXT:    v_bfe_u32 v33, v24, 16, 1
+; GFX9-NEXT:    v_min_f32_e32 v7, v7, v23
 ; GFX9-NEXT:    v_add3_u32 v33, v33, v24, s4
-; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v24
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v24, v24
-; GFX9-NEXT:    v_min_f32_e32 v7, v7, v23
-; GFX9-NEXT:    v_cndmask_b32_e32 v24, v33, v34, vcc
 ; GFX9-NEXT:    v_bfe_u32 v23, v7, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xff800000, v7
+; GFX9-NEXT:    v_cndmask_b32_e32 v24, v33, v34, vcc
 ; GFX9-NEXT:    v_add3_u32 v23, v23, v7, s4
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v7
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
 ; GFX9-NEXT:    v_cndmask_b32_e32 v7, v23, v33, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v6
 ; GFX9-NEXT:    v_min_f32_e32 v23, v33, v23
-; GFX9-NEXT:    v_bfe_u32 v33, v23, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v34, 0xff800000, v23
 ; GFX9-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
 ; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX9-NEXT:    v_bfe_u32 v33, v23, 16, 1
+; GFX9-NEXT:    v_min_f32_e32 v6, v6, v22
 ; GFX9-NEXT:    v_add3_u32 v33, v33, v23, s4
-; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v23
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v23, v23
-; GFX9-NEXT:    v_min_f32_e32 v6, v6, v22
-; GFX9-NEXT:    v_cndmask_b32_e32 v23, v33, v34, vcc
 ; GFX9-NEXT:    v_bfe_u32 v22, v6, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xff800000, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v23, v33, v34, vcc
 ; GFX9-NEXT:    v_add3_u32 v22, v22, v6, s4
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v6
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v22, v33, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v5
 ; GFX9-NEXT:    v_min_f32_e32 v22, v33, v22
-; GFX9-NEXT:    v_bfe_u32 v33, v22, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v34, 0xff800000, v22
 ; GFX9-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
 ; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX9-NEXT:    v_bfe_u32 v33, v22, 16, 1
+; GFX9-NEXT:    v_min_f32_e32 v5, v5, v21
 ; GFX9-NEXT:    v_add3_u32 v33, v33, v22, s4
-; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v22
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v22, v22
-; GFX9-NEXT:    v_min_f32_e32 v5, v5, v21
-; GFX9-NEXT:    v_cndmask_b32_e32 v22, v33, v34, vcc
 ; GFX9-NEXT:    v_bfe_u32 v21, v5, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xff800000, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v22, v33, v34, vcc
 ; GFX9-NEXT:    v_add3_u32 v21, v21, v5, s4
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v5
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v21, v33, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v4
 ; GFX9-NEXT:    v_min_f32_e32 v21, v33, v21
-; GFX9-NEXT:    v_bfe_u32 v33, v21, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v34, 0xff800000, v21
 ; GFX9-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX9-NEXT:    v_bfe_u32 v33, v21, 16, 1
+; GFX9-NEXT:    v_min_f32_e32 v4, v4, v20
 ; GFX9-NEXT:    v_add3_u32 v33, v33, v21, s4
-; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v21
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v21, v21
-; GFX9-NEXT:    v_min_f32_e32 v4, v4, v20
-; GFX9-NEXT:    v_cndmask_b32_e32 v21, v33, v34, vcc
 ; GFX9-NEXT:    v_bfe_u32 v20, v4, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xff800000, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v21, v33, v34, vcc
 ; GFX9-NEXT:    v_add3_u32 v20, v20, v4, s4
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v4
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v20, v33, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v3
 ; GFX9-NEXT:    v_min_f32_e32 v20, v33, v20
-; GFX9-NEXT:    v_bfe_u32 v33, v20, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v34, 0xff800000, v20
 ; GFX9-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT:    v_bfe_u32 v33, v20, 16, 1
+; GFX9-NEXT:    v_min_f32_e32 v3, v3, v19
 ; GFX9-NEXT:    v_add3_u32 v33, v33, v20, s4
-; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v20
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v20, v20
-; GFX9-NEXT:    v_min_f32_e32 v3, v3, v19
-; GFX9-NEXT:    v_cndmask_b32_e32 v20, v33, v34, vcc
 ; GFX9-NEXT:    v_bfe_u32 v19, v3, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xff800000, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v20, v33, v34, vcc
 ; GFX9-NEXT:    v_add3_u32 v19, v19, v3, s4
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v3
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v19, v33, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v2
 ; GFX9-NEXT:    v_min_f32_e32 v19, v33, v19
-; GFX9-NEXT:    v_bfe_u32 v33, v19, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v34, 0xff800000, v19
 ; GFX9-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT:    v_bfe_u32 v33, v19, 16, 1
+; GFX9-NEXT:    v_min_f32_e32 v2, v2, v18
 ; GFX9-NEXT:    v_add3_u32 v33, v33, v19, s4
-; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v19
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v19, v19
-; GFX9-NEXT:    v_min_f32_e32 v2, v2, v18
-; GFX9-NEXT:    v_cndmask_b32_e32 v19, v33, v34, vcc
 ; GFX9-NEXT:    v_bfe_u32 v18, v2, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xff800000, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v19, v33, v34, vcc
 ; GFX9-NEXT:    v_add3_u32 v18, v18, v2, s4
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v2
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v18, v33, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v1
 ; GFX9-NEXT:    v_min_f32_e32 v18, v33, v18
-; GFX9-NEXT:    v_bfe_u32 v33, v18, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v34, 0xff800000, v18
 ; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_bfe_u32 v33, v18, 16, 1
+; GFX9-NEXT:    v_min_f32_e32 v1, v1, v17
 ; GFX9-NEXT:    v_add3_u32 v33, v33, v18, s4
-; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v18
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v18, v18
-; GFX9-NEXT:    v_min_f32_e32 v1, v1, v17
-; GFX9-NEXT:    v_cndmask_b32_e32 v18, v33, v34, vcc
 ; GFX9-NEXT:    v_bfe_u32 v17, v1, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xff800000, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v18, v33, v34, vcc
 ; GFX9-NEXT:    v_add3_u32 v17, v17, v1, s4
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v1
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v17, v33, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v16
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v0
 ; GFX9-NEXT:    v_min_f32_e32 v17, v33, v17
-; GFX9-NEXT:    v_bfe_u32 v33, v17, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v34, 0xff800000, v17
 ; GFX9-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_bfe_u32 v33, v17, 16, 1
+; GFX9-NEXT:    v_min_f32_e32 v0, v0, v16
 ; GFX9-NEXT:    v_add3_u32 v33, v33, v17, s4
-; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v17
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
-; GFX9-NEXT:    v_min_f32_e32 v0, v0, v16
-; GFX9-NEXT:    v_cndmask_b32_e32 v17, v33, v34, vcc
 ; GFX9-NEXT:    v_bfe_u32 v16, v0, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xff800000, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v17, v33, v34, vcc
 ; GFX9-NEXT:    v_add3_u32 v16, v16, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v16, v33, vcc
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
@@ -20996,7 +20522,7 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX10-LABEL: v_minnum_v32bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX10-NEXT:    buffer_load_dword v32, off, s[0:3], s32
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v39, 16, v27
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v48, 16, v11
 ; GFX10-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
@@ -21061,7 +20587,6 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX10-NEXT:    v_bfe_u32 v55, v11, 16, 1
 ; GFX10-NEXT:    v_bfe_u32 v65, v49, 16, 1
 ; GFX10-NEXT:    v_bfe_u32 v67, v10, 16, 1
-; GFX10-NEXT:    s_mov_b32 s23, 0xff800000
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v33, 16, v30
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v34, 16, v14
 ; GFX10-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
@@ -21077,10 +20602,10 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX10-NEXT:    v_min_f32_e32 v1, v1, v17
 ; GFX10-NEXT:    v_min_f32_e32 v17, v26, v50
 ; GFX10-NEXT:    v_min_f32_e32 v0, v0, v16
-; GFX10-NEXT:    v_and_or_b32 v54, v39, s23, 0x400000
-; GFX10-NEXT:    v_and_or_b32 v64, v11, s23, 0x400000
-; GFX10-NEXT:    v_and_or_b32 v66, v49, s23, 0x400000
-; GFX10-NEXT:    v_and_or_b32 v68, v10, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v54, 0x400000, v39
+; GFX10-NEXT:    v_or_b32_e32 v64, 0x400000, v11
+; GFX10-NEXT:    v_or_b32_e32 v66, 0x400000, v49
+; GFX10-NEXT:    v_or_b32_e32 v68, 0x400000, v10
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s9, v39, v39
 ; GFX10-NEXT:    v_add3_u32 v39, v53, v39, 0x7fff
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s10, v11, v11
@@ -21118,28 +20643,28 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX10-NEXT:    v_bfe_u32 v27, v14, 16, 1
 ; GFX10-NEXT:    v_bfe_u32 v29, v35, 16, 1
 ; GFX10-NEXT:    v_bfe_u32 v34, v13, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v48, v37, s23, 0x400000
-; GFX10-NEXT:    v_and_or_b32 v52, v12, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v48, 0x400000, v37
+; GFX10-NEXT:    v_or_b32_e32 v52, 0x400000, v12
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s7, v37, v37
 ; GFX10-NEXT:    v_add3_u32 v37, v38, v37, 0x7fff
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s8, v12, v12
 ; GFX10-NEXT:    v_add3_u32 v12, v50, v12, 0x7fff
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s10, v18, v18
 ; GFX10-NEXT:    v_add3_u32 v54, v54, v18, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v18, v18, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v18, 0x400000, v18
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s11, v1, v1
 ; GFX10-NEXT:    v_add3_u32 v64, v64, v1, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v1, v1, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v1, 0x400000, v1
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s12, v17, v17
 ; GFX10-NEXT:    v_add3_u32 v66, v66, v17, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v17, v17, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v17, 0x400000, v17
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s22, v0, v0
 ; GFX10-NEXT:    v_add3_u32 v68, v68, v0, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v0, v0, s23, 0x400000
-; GFX10-NEXT:    v_and_or_b32 v26, v33, s23, 0x400000
-; GFX10-NEXT:    v_and_or_b32 v28, v14, s23, 0x400000
-; GFX10-NEXT:    v_and_or_b32 v30, v35, s23, 0x400000
-; GFX10-NEXT:    v_and_or_b32 v36, v13, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v0, 0x400000, v0
+; GFX10-NEXT:    v_or_b32_e32 v26, 0x400000, v33
+; GFX10-NEXT:    v_or_b32_e32 v28, 0x400000, v14
+; GFX10-NEXT:    v_or_b32_e32 v30, 0x400000, v35
+; GFX10-NEXT:    v_or_b32_e32 v36, 0x400000, v13
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
 ; GFX10-NEXT:    v_add3_u32 v16, v16, v33, 0x7fff
 ; GFX10-NEXT:    v_bfe_u32 v33, v51, 16, 1
@@ -21158,12 +20683,12 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX10-NEXT:    v_cndmask_b32_e64 v17, v66, v17, s12
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v68, v0, s22
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v64, v1, s11
-; GFX10-NEXT:    v_lshlrev_b32_e32 v32, 16, v15
+; GFX10-NEXT:    v_lshlrev_b32_e32 v31, 16, v15
 ; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX10-NEXT:    v_and_or_b32 v27, v51, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v27, 0x400000, v51
 ; GFX10-NEXT:    v_bfe_u32 v35, v9, 16, 1
 ; GFX10-NEXT:    v_bfe_u32 v38, v25, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v67, v24, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v67, 0x400000, v24
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s13, v51, v51
 ; GFX10-NEXT:    v_add3_u32 v33, v33, v51, 0x7fff
 ; GFX10-NEXT:    v_bfe_u32 v51, v7, 16, 1
@@ -21180,51 +20705,51 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX10-NEXT:    v_bfe_u32 v36, v3, 16, 1
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s8, v19, v19
 ; GFX10-NEXT:    v_add3_u32 v48, v48, v19, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v19, v19, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v19, 0x400000, v19
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s9, v2, v2
 ; GFX10-NEXT:    v_add3_u32 v52, v52, v2, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v2, v2, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v2
 ; GFX10-NEXT:    v_perm_b32 v0, v0, v17, 0x7060302
 ; GFX10-NEXT:    v_perm_b32 v1, v1, v18, 0x7060302
-; GFX10-NEXT:    v_and_or_b32 v34, v9, s23, 0x400000
-; GFX10-NEXT:    v_and_or_b32 v50, v25, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v34, 0x400000, v9
+; GFX10-NEXT:    v_or_b32_e32 v50, 0x400000, v25
 ; GFX10-NEXT:    v_bfe_u32 v53, v8, 16, 1
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s14, v9, v9
 ; GFX10-NEXT:    v_add3_u32 v9, v35, v9, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v35, v7, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v35, 0x400000, v7
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s15, v25, v25
 ; GFX10-NEXT:    v_add3_u32 v25, v38, v25, 0x7fff
 ; GFX10-NEXT:    v_bfe_u32 v38, v23, 16, 1
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s18, v7, v7
 ; GFX10-NEXT:    v_add3_u32 v7, v51, v7, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v51, v6, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v51, 0x400000, v6
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s20, v6, v6
 ; GFX10-NEXT:    v_add3_u32 v6, v65, v6, 0x7fff
 ; GFX10-NEXT:    v_bfe_u32 v65, v5, 16, 1
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v21, v21
 ; GFX10-NEXT:    v_add3_u32 v26, v26, v21, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v21, v21, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v21, 0x400000, v21
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s5, v4, v4
 ; GFX10-NEXT:    v_add3_u32 v28, v28, v4, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v4, v4, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v4, 0x400000, v4
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s6, v20, v20
 ; GFX10-NEXT:    v_add3_u32 v30, v30, v20, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v20, v20, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v20, 0x400000, v20
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s7, v3, v3
 ; GFX10-NEXT:    v_add3_u32 v36, v36, v3, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v3, v3, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v3, 0x400000, v3
 ; GFX10-NEXT:    v_cndmask_b32_e64 v19, v48, v19, s8
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v52, v2, s9
-; GFX10-NEXT:    v_and_or_b32 v55, v8, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v55, 0x400000, v8
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s16, v8, v8
 ; GFX10-NEXT:    v_add3_u32 v8, v53, v8, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v53, v23, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v53, 0x400000, v23
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s19, v23, v23
 ; GFX10-NEXT:    v_add3_u32 v23, v38, v23, 0x7fff
 ; GFX10-NEXT:    v_bfe_u32 v38, v22, 16, 1
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX10-NEXT:    v_add3_u32 v65, v65, v5, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v5, v5, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v21, v26, v21, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v4, v28, v4, s5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v20, v30, v20, s6
@@ -21232,7 +20757,7 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX10-NEXT:    v_perm_b32 v2, v2, v19, 0x7060302
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s21, v22, v22
 ; GFX10-NEXT:    v_add3_u32 v38, v38, v22, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v22, v22, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v22, 0x400000, v22
 ; GFX10-NEXT:    v_cndmask_b32_e32 v5, v65, v5, vcc_lo
 ; GFX10-NEXT:    v_perm_b32 v3, v3, v20, 0x7060302
 ; GFX10-NEXT:    v_perm_b32 v4, v4, v21, 0x7060302
@@ -21256,14 +20781,14 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX10-NEXT:    v_perm_b32 v13, v13, v29, 0x7060302
 ; GFX10-NEXT:    v_perm_b32 v14, v14, v16, 0x7060302
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v31
-; GFX10-NEXT:    v_and_b32_e32 v18, 0xffff0000, v31
-; GFX10-NEXT:    v_min_f32_e32 v17, v32, v17
+; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v32
+; GFX10-NEXT:    v_and_b32_e32 v18, 0xffff0000, v32
+; GFX10-NEXT:    v_min_f32_e32 v17, v31, v17
 ; GFX10-NEXT:    v_min_f32_e32 v15, v15, v18
 ; GFX10-NEXT:    v_bfe_u32 v18, v17, 16, 1
 ; GFX10-NEXT:    v_bfe_u32 v19, v15, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v20, v17, s23, 0x400000
-; GFX10-NEXT:    v_and_or_b32 v21, v15, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v20, 0x400000, v17
+; GFX10-NEXT:    v_or_b32_e32 v21, 0x400000, v15
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v15, v15
 ; GFX10-NEXT:    v_add3_u32 v17, v18, v17, 0x7fff
@@ -21276,212 +20801,219 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX11-LABEL: v_minnum_v32bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    scratch_load_b32 v31, off, s32
+; GFX11-NEXT:    scratch_load_b32 v32, off, s32
+; GFX11-NEXT:    v_lshlrev_b32_e32 v67, 16, v21
+; GFX11-NEXT:    v_lshlrev_b32_e32 v68, 16, v5
+; GFX11-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v83, 16, v17
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v84, 16, v1
 ; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v49, 16, v26
+; GFX11-NEXT:    v_dual_min_f32 v5, v5, v21 :: v_dual_and_b32 v26, 0xffff0000, v26
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v53, 16, v24
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
-; GFX11-NEXT:    s_mov_b32 s0, 0xff800000
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_min_f32 v1, v1, v17 :: v_dual_lshlrev_b32 v64, 16, v7
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_min_f32 v1, v1, v17 :: v_dual_and_b32 v24, 0xffff0000, v24
+; GFX11-NEXT:    v_lshlrev_b32_e32 v71, 16, v19
+; GFX11-NEXT:    v_bfe_u32 v103, v5, 16, 1
+; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v81, 16, v18
-; GFX11-NEXT:    v_lshlrev_b32_e32 v85, 16, v16
-; GFX11-NEXT:    v_lshlrev_b32_e32 v86, 16, v0
 ; GFX11-NEXT:    v_bfe_u32 v135, v1, 16, 1
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v55, 16, v23
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
-; GFX11-NEXT:    v_and_or_b32 v144, v1, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v112, 0x400000, v5
+; GFX11-NEXT:    v_or_b32_e32 v144, 0x400000, v1
+; GFX11-NEXT:    v_add3_u32 v103, v103, v5, 0x7fff
+; GFX11-NEXT:    v_lshlrev_b32_e32 v80, 16, v3
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX11-NEXT:    v_add3_u32 v135, v135, v1, 0x7fff
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v82, 16, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v54, 16, v8
-; GFX11-NEXT:    v_dual_min_f32 v17, v86, v85 :: v_dual_and_b32 v8, 0xffff0000, v8
-; GFX11-NEXT:    v_dual_min_f32 v7, v7, v23 :: v_dual_lshlrev_b32 v36, 16, v13
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_min_f32 v8, v8, v24 :: v_dual_lshlrev_b32 v39, 16, v27
-; GFX11-NEXT:    v_dual_min_f32 v0, v0, v16 :: v_dual_lshlrev_b32 v49, 16, v26
-; GFX11-NEXT:    v_min_f32_e32 v24, v64, v55
-; GFX11-NEXT:    v_bfe_u32 v87, v7, 16, 1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v52, 16, v9
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_min_f32 v3, v3, v19 :: v_dual_lshlrev_b32 v54, 16, v8
+; GFX11-NEXT:    v_lshlrev_b32_e32 v85, 16, v16
+; GFX11-NEXT:    v_dual_min_f32 v19, v82, v81 :: v_dual_lshlrev_b32 v64, 16, v7
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v65, 16, v22
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v66, 16, v6
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
-; GFX11-NEXT:    v_bfe_u32 v85, v24, 16, 1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v67, 16, v21
-; GFX11-NEXT:    v_lshlrev_b32_e32 v68, 16, v5
-; GFX11-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_bfe_u32 v129, v19, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v130, 0x400000, v19
+; GFX11-NEXT:    v_lshlrev_b32_e32 v48, 16, v11
+; GFX11-NEXT:    v_bfe_u32 v119, v3, 16, 1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v51, 16, v25
+; GFX11-NEXT:    v_add3_u32 v129, v129, v19, 0x7fff
+; GFX11-NEXT:    v_lshlrev_b32_e32 v86, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_min_f32 v17, v86, v85 :: v_dual_and_b32 v2, 0xffff0000, v2
+; GFX11-NEXT:    v_dual_min_f32 v8, v8, v24 :: v_dual_lshlrev_b32 v39, 16, v27
+; GFX11-NEXT:    v_or_b32_e32 v128, 0x400000, v3
+; GFX11-NEXT:    v_add3_u32 v119, v119, v3, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_bfe_u32 v145, v17, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v146, 0x400000, v17
+; GFX11-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GFX11-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v70, 16, v4
-; GFX11-NEXT:    v_and_or_b32 v86, v24, s0, 0x400000
-; GFX11-NEXT:    v_and_or_b32 v96, v7, s0, 0x400000
+; GFX11-NEXT:    v_add3_u32 v145, v145, v17, 0x7fff
+; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GFX11-NEXT:    v_lshlrev_b32_e32 v55, 16, v23
+; GFX11-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; GFX11-NEXT:    v_lshlrev_b32_e32 v50, 16, v10
+; GFX11-NEXT:    v_min_f32_e32 v2, v2, v18
+; GFX11-NEXT:    v_min_f32_e32 v0, v0, v16
+; GFX11-NEXT:    v_dual_min_f32 v24, v64, v55 :: v_dual_lshlrev_b32 v37, 16, v28
+; GFX11-NEXT:    v_min_f32_e32 v7, v7, v23
+; GFX11-NEXT:    v_dual_min_f32 v23, v66, v65 :: v_dual_min_f32 v18, v84, v83
+; GFX11-NEXT:    v_dual_min_f32 v9, v9, v25 :: v_dual_and_b32 v28, 0xffff0000, v28
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_bfe_u32 v85, v24, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v97, v23, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v86, 0x400000, v24
+; GFX11-NEXT:    v_or_b32_e32 v98, 0x400000, v23
+; GFX11-NEXT:    v_bfe_u32 v87, v7, 16, 1
 ; GFX11-NEXT:    v_add3_u32 v85, v85, v24, 0x7fff
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v69, 16, v20
-; GFX11-NEXT:    v_add3_u32 v87, v87, v7, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-NEXT:    v_dual_min_f32 v23, v66, v65 :: v_dual_lshlrev_b32 v48, 16, v11
-; GFX11-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
-; GFX11-NEXT:    v_dual_min_f32 v5, v5, v21 :: v_dual_lshlrev_b32 v50, 16, v10
-; GFX11-NEXT:    v_dual_min_f32 v21, v70, v69 :: v_dual_and_b32 v26, 0xffff0000, v26
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT:    v_min_f32_e32 v6, v6, v22
-; GFX11-NEXT:    v_lshlrev_b32_e32 v52, 16, v9
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX11-NEXT:    v_bfe_u32 v97, v23, 16, 1
-; GFX11-NEXT:    v_min_f32_e32 v2, v2, v18
-; GFX11-NEXT:    v_min_f32_e32 v18, v84, v83
-; GFX11-NEXT:    v_bfe_u32 v83, v8, 16, 1
-; GFX11-NEXT:    v_bfe_u32 v99, v6, 16, 1
-; GFX11-NEXT:    v_bfe_u32 v103, v5, 16, 1
-; GFX11-NEXT:    v_bfe_u32 v113, v21, 16, 1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v71, 16, v19
-; GFX11-NEXT:    v_and_or_b32 v84, v8, s0, 0x400000
-; GFX11-NEXT:    v_and_or_b32 v98, v23, s0, 0x400000
-; GFX11-NEXT:    v_and_or_b32 v100, v6, s0, 0x400000
-; GFX11-NEXT:    v_and_or_b32 v112, v5, s0, 0x400000
-; GFX11-NEXT:    v_and_or_b32 v114, v21, s0, 0x400000
-; GFX11-NEXT:    v_add3_u32 v83, v83, v8, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
-; GFX11-NEXT:    v_add3_u32 v97, v97, v23, 0x7fff
 ; GFX11-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
-; GFX11-NEXT:    v_add3_u32 v99, v99, v6, 0x7fff
-; GFX11-NEXT:    v_add3_u32 v103, v103, v5, 0x7fff
-; GFX11-NEXT:    v_lshlrev_b32_e32 v80, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT:    v_add3_u32 v113, v113, v21, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v97, v97, v23, 0x7fff
 ; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v38, 16, v12
 ; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
-; GFX11-NEXT:    v_dual_min_f32 v3, v3, v19 :: v_dual_and_b32 v10, 0xffff0000, v10
-; GFX11-NEXT:    v_dual_min_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v51, 16, v25
-; GFX11-NEXT:    v_lshlrev_b32_e32 v37, 16, v28
-; GFX11-NEXT:    v_dual_min_f32 v4, v4, v20 :: v_dual_and_b32 v25, 0xffff0000, v25
+; GFX11-NEXT:    v_or_b32_e32 v96, 0x400000, v7
+; GFX11-NEXT:    v_add3_u32 v87, v87, v7, 0x7fff
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-NEXT:    v_min_f32_e32 v4, v4, v20
 ; GFX11-NEXT:    v_min_f32_e32 v20, v80, v71
-; GFX11-NEXT:    v_dual_min_f32 v19, v82, v81 :: v_dual_and_b32 v28, 0xffff0000, v28
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_min_f32 v9, v9, v25 :: v_dual_and_b32 v12, 0xffff0000, v12
-; GFX11-NEXT:    v_min_f32_e32 v25, v54, v53
+; GFX11-NEXT:    v_bfe_u32 v71, v9, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v80, 0x400000, v9
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v35, 16, v29
-; GFX11-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
-; GFX11-NEXT:    v_dual_min_f32 v10, v10, v26 :: v_dual_and_b32 v13, 0xffff0000, v13
-; GFX11-NEXT:    v_dual_min_f32 v12, v12, v28 :: v_dual_lshlrev_b32 v33, 16, v30
-; GFX11-NEXT:    v_min_f32_e32 v28, v48, v39
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_min_f32 v13, v13, v29 :: v_dual_lshlrev_b32 v34, 16, v14
-; GFX11-NEXT:    v_dual_min_f32 v11, v11, v27 :: v_dual_and_b32 v14, 0xffff0000, v14
-; GFX11-NEXT:    v_dual_min_f32 v27, v50, v49 :: v_dual_min_f32 v26, v52, v51
-; GFX11-NEXT:    v_dual_min_f32 v29, v38, v37 :: v_dual_and_b32 v30, 0xffff0000, v30
-; GFX11-NEXT:    v_lshlrev_b32_e32 v32, 16, v15
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX11-NEXT:    v_bfe_u32 v39, v13, 16, 1
+; GFX11-NEXT:    v_dual_min_f32 v21, v70, v69 :: v_dual_and_b32 v10, 0xffff0000, v10
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_add3_u32 v71, v71, v9, 0x7fff
+; GFX11-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; GFX11-NEXT:    v_dual_min_f32 v10, v10, v26 :: v_dual_and_b32 v29, 0xffff0000, v29
+; GFX11-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
+; GFX11-NEXT:    v_min_f32_e32 v26, v52, v51
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_min_f32_e32 v6, v6, v22
+; GFX11-NEXT:    v_lshlrev_b32_e32 v36, 16, v13
+; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX11-NEXT:    v_dual_min_f32 v11, v11, v27 :: v_dual_lshlrev_b32 v34, 16, v14
+; GFX11-NEXT:    v_dual_min_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v33, 16, v30
+; GFX11-NEXT:    v_dual_min_f32 v27, v50, v49 :: v_dual_lshlrev_b32 v38, 16, v12
+; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX11-NEXT:    v_dual_min_f32 v25, v54, v53 :: v_dual_and_b32 v12, 0xffff0000, v12
+; GFX11-NEXT:    v_dual_min_f32 v13, v13, v29 :: v_dual_and_b32 v30, 0xffff0000, v30
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_min_f32_e32 v29, v38, v37
+; GFX11-NEXT:    v_lshlrev_b32_e32 v31, 16, v15
+; GFX11-NEXT:    v_dual_min_f32 v12, v12, v28 :: v_dual_and_b32 v15, 0xffff0000, v15
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_min_f32_e32 v14, v14, v30
+; GFX11-NEXT:    v_min_f32_e32 v28, v48, v39
 ; GFX11-NEXT:    v_dual_min_f32 v30, v36, v35 :: v_dual_min_f32 v33, v34, v33
-; GFX11-NEXT:    v_and_or_b32 v48, v13, s0, 0x400000
-; GFX11-NEXT:    v_bfe_u32 v49, v29, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v39, v13, 16, 1
 ; GFX11-NEXT:    v_bfe_u32 v35, v14, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v36, v14, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v14
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_bfe_u32 v37, v30, 16, 1
 ; GFX11-NEXT:    v_bfe_u32 v16, v33, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v34, v33, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v34, 0x400000, v33
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX11-NEXT:    v_bfe_u32 v37, v30, 16, 1
 ; GFX11-NEXT:    v_add3_u32 v35, v35, v14, 0x7fff
+; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v30
 ; GFX11-NEXT:    v_add3_u32 v16, v16, v33, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v38, v30, s0, 0x400000
-; GFX11-NEXT:    v_add3_u32 v39, v39, v13, 0x7fff
 ; GFX11-NEXT:    v_add3_u32 v37, v37, v30, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v50, v29, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v48, 0x400000, v13
+; GFX11-NEXT:    v_bfe_u32 v49, v29, 16, 1
+; GFX11-NEXT:    v_add3_u32 v39, v39, v13, 0x7fff
 ; GFX11-NEXT:    v_cndmask_b32_e32 v16, v16, v34, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-NEXT:    v_or_b32_e32 v50, 0x400000, v29
 ; GFX11-NEXT:    v_bfe_u32 v51, v12, 16, 1
 ; GFX11-NEXT:    v_add3_u32 v49, v49, v29, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v52, v12, s0, 0x400000
-; GFX11-NEXT:    v_bfe_u32 v53, v28, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v52, 0x400000, v12
 ; GFX11-NEXT:    v_cndmask_b32_e32 v14, v35, v36, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX11-NEXT:    v_bfe_u32 v53, v28, 16, 1
 ; GFX11-NEXT:    v_add3_u32 v51, v51, v12, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v54, v28, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v54, 0x400000, v28
 ; GFX11-NEXT:    v_bfe_u32 v55, v11, 16, 1
-; GFX11-NEXT:    v_add3_u32 v53, v53, v28, 0x7fff
 ; GFX11-NEXT:    v_cndmask_b32_e32 v30, v37, v38, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-NEXT:    v_and_or_b32 v64, v11, s0, 0x400000
+; GFX11-NEXT:    v_add3_u32 v53, v53, v28, 0x7fff
+; GFX11-NEXT:    v_or_b32_e32 v64, 0x400000, v11
 ; GFX11-NEXT:    v_bfe_u32 v65, v27, 16, 1
 ; GFX11-NEXT:    v_add3_u32 v55, v55, v11, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v66, v27, s0, 0x400000
 ; GFX11-NEXT:    v_cndmask_b32_e32 v13, v39, v48, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX11-NEXT:    v_or_b32_e32 v66, 0x400000, v27
 ; GFX11-NEXT:    v_bfe_u32 v67, v10, 16, 1
 ; GFX11-NEXT:    v_add3_u32 v65, v65, v27, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v68, v10, s0, 0x400000
-; GFX11-NEXT:    v_bfe_u32 v69, v26, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v68, 0x400000, v10
 ; GFX11-NEXT:    v_cndmask_b32_e32 v29, v49, v50, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-NEXT:    v_bfe_u32 v69, v26, 16, 1
 ; GFX11-NEXT:    v_add3_u32 v67, v67, v10, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v70, v26, s0, 0x400000
-; GFX11-NEXT:    v_bfe_u32 v71, v9, 16, 1
-; GFX11-NEXT:    v_add3_u32 v69, v69, v26, 0x7fff
+; GFX11-NEXT:    v_or_b32_e32 v70, 0x400000, v26
+; GFX11-NEXT:    v_bfe_u32 v81, v25, 16, 1
 ; GFX11-NEXT:    v_cndmask_b32_e32 v12, v51, v52, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
-; GFX11-NEXT:    v_and_or_b32 v80, v9, s0, 0x400000
-; GFX11-NEXT:    v_bfe_u32 v81, v25, 16, 1
-; GFX11-NEXT:    v_add3_u32 v71, v71, v9, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v82, v25, s0, 0x400000
+; GFX11-NEXT:    v_add3_u32 v69, v69, v26, 0x7fff
+; GFX11-NEXT:    v_or_b32_e32 v82, 0x400000, v25
+; GFX11-NEXT:    v_bfe_u32 v83, v8, 16, 1
+; GFX11-NEXT:    v_add3_u32 v81, v81, v25, 0x7fff
 ; GFX11-NEXT:    v_cndmask_b32_e32 v28, v53, v54, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-NEXT:    v_add3_u32 v81, v81, v25, 0x7fff
-; GFX11-NEXT:    v_bfe_u32 v101, v22, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v102, v22, s0, 0x400000
-; GFX11-NEXT:    v_bfe_u32 v115, v4, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v84, 0x400000, v8
+; GFX11-NEXT:    v_add3_u32 v83, v83, v8, 0x7fff
+; GFX11-NEXT:    v_bfe_u32 v99, v6, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v100, 0x400000, v6
 ; GFX11-NEXT:    v_cndmask_b32_e32 v11, v55, v64, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v27, v27
-; GFX11-NEXT:    v_add3_u32 v101, v101, v22, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v116, v4, s0, 0x400000
-; GFX11-NEXT:    v_bfe_u32 v117, v20, 16, 1
-; GFX11-NEXT:    v_add3_u32 v115, v115, v4, 0x7fff
+; GFX11-NEXT:    v_bfe_u32 v101, v22, 16, 1
+; GFX11-NEXT:    v_add3_u32 v99, v99, v6, 0x7fff
+; GFX11-NEXT:    v_or_b32_e32 v102, 0x400000, v22
+; GFX11-NEXT:    v_bfe_u32 v113, v21, 16, 1
 ; GFX11-NEXT:    v_cndmask_b32_e32 v27, v65, v66, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT:    v_and_or_b32 v118, v20, s0, 0x400000
-; GFX11-NEXT:    v_bfe_u32 v129, v19, 16, 1
-; GFX11-NEXT:    v_add3_u32 v117, v117, v20, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v130, v19, s0, 0x400000
+; GFX11-NEXT:    v_add3_u32 v101, v101, v22, 0x7fff
+; GFX11-NEXT:    v_or_b32_e32 v114, 0x400000, v21
+; GFX11-NEXT:    v_bfe_u32 v115, v4, 16, 1
+; GFX11-NEXT:    v_add3_u32 v113, v113, v21, 0x7fff
 ; GFX11-NEXT:    v_cndmask_b32_e32 v10, v67, v68, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX11-NEXT:    v_bfe_u32 v133, v18, 16, 1
-; GFX11-NEXT:    v_add3_u32 v129, v129, v19, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v134, v18, s0, 0x400000
-; GFX11-NEXT:    v_bfe_u32 v145, v17, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v116, 0x400000, v4
+; GFX11-NEXT:    v_bfe_u32 v117, v20, 16, 1
+; GFX11-NEXT:    v_add3_u32 v115, v115, v4, 0x7fff
+; GFX11-NEXT:    v_or_b32_e32 v118, 0x400000, v20
 ; GFX11-NEXT:    v_cndmask_b32_e32 v26, v69, v70, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT:    v_add3_u32 v133, v133, v18, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v146, v17, s0, 0x400000
+; GFX11-NEXT:    v_add3_u32 v117, v117, v20, 0x7fff
+; GFX11-NEXT:    v_bfe_u32 v133, v18, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v134, 0x400000, v18
 ; GFX11-NEXT:    v_bfe_u32 v147, v0, 16, 1
-; GFX11-NEXT:    v_add3_u32 v145, v145, v17, 0x7fff
 ; GFX11-NEXT:    v_cndmask_b32_e32 v9, v71, v80, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX11-NEXT:    v_bfe_u32 v131, v2, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v33, v0, s0, 0x400000
+; GFX11-NEXT:    v_add3_u32 v133, v133, v18, 0x7fff
+; GFX11-NEXT:    v_or_b32_e32 v33, 0x400000, v0
 ; GFX11-NEXT:    v_add3_u32 v147, v147, v0, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v132, v2, s0, 0x400000
+; GFX11-NEXT:    v_bfe_u32 v131, v2, 16, 1
 ; GFX11-NEXT:    v_cndmask_b32_e32 v25, v81, v82, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT:    v_add3_u32 v131, v131, v2, 0x7fff
-; GFX11-NEXT:    v_bfe_u32 v119, v3, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v128, v3, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v132, 0x400000, v2
 ; GFX11-NEXT:    v_perm_b32 v9, v9, v26, 0x7060302
+; GFX11-NEXT:    v_add3_u32 v131, v131, v2, 0x7fff
+; GFX11-NEXT:    v_perm_b32 v10, v10, v27, 0x7060302
 ; GFX11-NEXT:    v_cndmask_b32_e32 v8, v83, v84, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX11-NEXT:    v_add3_u32 v119, v119, v3, 0x7fff
-; GFX11-NEXT:    v_perm_b32 v10, v10, v27, 0x7060302
 ; GFX11-NEXT:    v_perm_b32 v11, v11, v28, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v12, v12, v29, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v13, v13, v30, 0x7060302
 ; GFX11-NEXT:    v_perm_b32 v8, v8, v25, 0x7060302
 ; GFX11-NEXT:    v_cndmask_b32_e32 v24, v85, v86, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT:    v_perm_b32 v12, v12, v29, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v13, v13, v30, 0x7060302
 ; GFX11-NEXT:    v_perm_b32 v14, v14, v16, 0x7060302
 ; GFX11-NEXT:    v_cndmask_b32_e32 v7, v87, v96, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
@@ -21520,22 +21052,21 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX11-NEXT:    v_perm_b32 v0, v0, v17, 0x7060302
 ; GFX11-NEXT:    v_cndmask_b32_e32 v2, v131, v132, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v2, v2, v19, 0x7060302
 ; GFX11-NEXT:    v_cndmask_b32_e32 v3, v119, v128, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_perm_b32 v3, v3, v20, 0x7060302
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v31
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xffff0000, v31
-; GFX11-NEXT:    v_perm_b32 v2, v2, v19, 0x7060302
-; GFX11-NEXT:    v_min_f32_e32 v17, v32, v17
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v32
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_min_f32 v17, v31, v17 :: v_dual_and_b32 v18, 0xffff0000, v32
 ; GFX11-NEXT:    v_min_f32_e32 v15, v15, v18
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_bfe_u32 v18, v17, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_bfe_u32 v19, v15, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v20, v17, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v17
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT:    v_and_or_b32 v21, v15, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v21, 0x400000, v15
 ; GFX11-NEXT:    v_add3_u32 v18, v18, v17, 0x7fff
 ; GFX11-NEXT:    v_add3_u32 v19, v19, v15, 0x7fff
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
@@ -21594,8 +21125,7 @@ define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) {
 ; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -21609,9 +21139,8 @@ define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) {
 ; GFX9-NEXT:    v_max_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xff800000, v0
 ; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -21622,10 +21151,9 @@ define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
 ; GFX10-NEXT:    v_max_f32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v2, v0, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
@@ -21637,11 +21165,10 @@ define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:    s_mov_b32 s0, 0xff800000
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_max_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v2, v0, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
@@ -21707,16 +21234,14 @@ define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xff800000, v2
 ; GFX8-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v2
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v3
+; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -21729,20 +21254,18 @@ define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX9-NEXT:    v_max_f32_e32 v2, v3, v2
-; GFX9-NEXT:    v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xff800000, v2
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_max_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_add3_u32 v3, v3, v2, s4
-; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v4
+; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v2
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
 ; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xff800000, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
 ; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v3
+; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
@@ -21756,14 +21279,13 @@ define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
 ; GFX10-NEXT:    v_max_f32_e32 v2, v3, v2
 ; GFX10-NEXT:    v_max_f32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_bfe_u32 v1, v2, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v4, v2, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v4, 0x400000, v2
 ; GFX10-NEXT:    v_bfe_u32 v3, v0, 16, 1
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX10-NEXT:    v_and_or_b32 v5, v0, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v0
 ; GFX10-NEXT:    v_add3_u32 v1, v1, v2, 0x7fff
 ; GFX10-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
@@ -21779,16 +21301,15 @@ define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    s_mov_b32 s0, 0xff800000
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_max_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_max_f32_e32 v2, v3, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_bfe_u32 v3, v0, 16, 1
 ; GFX11-NEXT:    v_bfe_u32 v1, v2, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v4, v2, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v4, 0x400000, v2
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT:    v_and_or_b32 v5, v0, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v0
 ; GFX11-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
 ; GFX11-NEXT:    v_add3_u32 v1, v1, v2, 0x7fff
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
@@ -21870,8 +21391,7 @@ define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
 ; GFX8-NEXT:    v_bfe_u32 v3, v1, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v1
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xff800000, v1
-; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v1
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
@@ -21883,16 +21403,14 @@ define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, s4, v4
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xff800000, v3
 ; GFX8-NEXT:    v_max_f32_e32 v0, v0, v2
-; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v3
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -21908,27 +21426,24 @@ define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
 ; GFX9-NEXT:    v_max_f32_e32 v1, v1, v3
 ; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xff800000, v1
 ; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v4
+; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v1
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
 ; GFX9-NEXT:    v_max_f32_e32 v3, v4, v3
-; GFX9-NEXT:    v_bfe_u32 v4, v3, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xff800000, v3
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_bfe_u32 v4, v3, 16, 1
+; GFX9-NEXT:    v_max_f32_e32 v0, v0, v2
 ; GFX9-NEXT:    v_add3_u32 v4, v4, v3, s4
-; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v3
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT:    v_max_f32_e32 v0, v0, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
 ; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xff800000, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
 ; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v4
+; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
@@ -21946,18 +21461,17 @@ define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_max_f32_e32 v4, v5, v4
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
 ; GFX10-NEXT:    v_max_f32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_max_f32_e32 v1, v1, v3
 ; GFX10-NEXT:    v_bfe_u32 v2, v4, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v7, v4, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v7, 0x400000, v4
 ; GFX10-NEXT:    v_bfe_u32 v5, v0, 16, 1
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
 ; GFX10-NEXT:    v_bfe_u32 v3, v1, 16, 1
 ; GFX10-NEXT:    v_add3_u32 v2, v2, v4, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v8, v0, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v0
 ; GFX10-NEXT:    v_add3_u32 v5, v5, v0, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v6, v1, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v6, 0x400000, v1
 ; GFX10-NEXT:    v_add3_u32 v3, v3, v1, 0x7fff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
@@ -22057,17 +21571,15 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x7fff, v5
-; GFX8-NEXT:    v_and_b32_e32 v6, 0xff800000, v4
 ; GFX8-NEXT:    v_max_f32_e32 v1, v1, v3
-; GFX8-NEXT:    v_or_b32_e32 v6, 0x400000, v6
+; GFX8-NEXT:    v_or_b32_e32 v6, 0x400000, v4
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX8-NEXT:    v_bfe_u32 v3, v1, 16, 1
 ; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v5, v6, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v1
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, s4, v3
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xff800000, v1
-; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v1
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
@@ -22078,16 +21590,14 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s4, v5
-; GFX8-NEXT:    v_and_b32_e32 v6, 0xff800000, v3
 ; GFX8-NEXT:    v_max_f32_e32 v0, v0, v2
-; GFX8-NEXT:    v_or_b32_e32 v6, 0x400000, v6
+; GFX8-NEXT:    v_or_b32_e32 v6, 0x400000, v3
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v5, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
@@ -22102,38 +21612,34 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX9-NEXT:    v_max_f32_e32 v4, v5, v4
-; GFX9-NEXT:    v_bfe_u32 v5, v4, 16, 1
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_and_b32_e32 v6, 0xff800000, v4
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_bfe_u32 v5, v4, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_max_f32_e32 v1, v1, v3
 ; GFX9-NEXT:    v_add3_u32 v5, v5, v4, s4
-; GFX9-NEXT:    v_or_b32_e32 v6, 0x400000, v6
+; GFX9-NEXT:    v_or_b32_e32 v6, 0x400000, v4
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
-; GFX9-NEXT:    v_max_f32_e32 v1, v1, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v6, vcc
 ; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xff800000, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v6, vcc
 ; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v1
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX9-NEXT:    v_max_f32_e32 v3, v5, v3
-; GFX9-NEXT:    v_bfe_u32 v5, v3, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v6, 0xff800000, v3
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_bfe_u32 v5, v3, 16, 1
+; GFX9-NEXT:    v_max_f32_e32 v0, v0, v2
 ; GFX9-NEXT:    v_add3_u32 v5, v5, v3, s4
-; GFX9-NEXT:    v_or_b32_e32 v6, 0x400000, v6
+; GFX9-NEXT:    v_or_b32_e32 v6, 0x400000, v3
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT:    v_max_f32_e32 v0, v0, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
 ; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xff800000, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
 ; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v5, vcc
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
@@ -22154,31 +21660,30 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX10-NEXT:    v_max_f32_e32 v1, v1, v3
-; GFX10-NEXT:    v_max_f32_e32 v5, v7, v6
-; GFX10-NEXT:    v_bfe_u32 v3, v4, 16, 1
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
+; GFX10-NEXT:    v_max_f32_e32 v3, v7, v6
+; GFX10-NEXT:    v_bfe_u32 v5, v4, 16, 1
+; GFX10-NEXT:    v_or_b32_e32 v7, 0x400000, v4
 ; GFX10-NEXT:    v_max_f32_e32 v0, v0, v2
-; GFX10-NEXT:    v_and_or_b32 v6, v4, s4, 0x400000
-; GFX10-NEXT:    v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT:    v_add3_u32 v3, v3, v4, 0x7fff
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX10-NEXT:    v_bfe_u32 v8, v0, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v6, v3, 16, 1
+; GFX10-NEXT:    v_add3_u32 v5, v5, v4, 0x7fff
 ; GFX10-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX10-NEXT:    v_add3_u32 v4, v7, v5, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v9, v1, s4, 0x400000
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc_lo
-; GFX10-NEXT:    v_and_or_b32 v6, v5, s4, 0x400000
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT:    v_bfe_u32 v8, v0, 16, 1
+; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v1
+; GFX10-NEXT:    v_add3_u32 v6, v6, v3, 0x7fff
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v5, v7, vcc_lo
+; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v3
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX10-NEXT:    v_add3_u32 v7, v8, v0, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v8, v0, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v0
 ; GFX10-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v6, v5, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v7, v8, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT:    v_perm_b32 v0, v0, v4, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v0, v0, v3, 0x7060302
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v2, v9, vcc_lo
-; GFX10-NEXT:    v_perm_b32 v1, v1, v3, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_maxnum_v4bf16:
@@ -22186,45 +21691,42 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    s_mov_b32 s0, 0xff800000
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_max_f32 v0, v0, v2 :: v_dual_and_b32 v3, 0xffff0000, v3
-; GFX11-NEXT:    v_max_f32_e32 v4, v5, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_max_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX11-NEXT:    v_bfe_u32 v8, v0, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_max_f32_e32 v1, v1, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v3, v4, 16, 1
-; GFX11-NEXT:    v_max_f32_e32 v5, v7, v6
-; GFX11-NEXT:    v_and_or_b32 v6, v4, s0, 0x400000
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-NEXT:    v_dual_max_f32 v3, v7, v6 :: v_dual_max_f32 v4, v5, v4
 ; GFX11-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX11-NEXT:    v_add3_u32 v3, v3, v4, 0x7fff
-; GFX11-NEXT:    v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v9, v1, s0, 0x400000
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_bfe_u32 v6, v3, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v5, v4, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v7, 0x400000, v4
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
 ; GFX11-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add3_u32 v4, v7, v5, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v6, v5, s0, 0x400000
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT:    v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v5, v5, v4, 0x7fff
+; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v4, v5, v7, vcc_lo
+; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v3
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX11-NEXT:    v_add3_u32 v7, v8, v0, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v8, v0, s0, 0x400000
-; GFX11-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc_lo
+; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v0
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, v6, v5, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v7, v8, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT:    v_perm_b32 v0, v0, v3, 0x7060302
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v2, v9, vcc_lo
-; GFX11-NEXT:    v_perm_b32 v0, v0, v4, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v1, v1, v3, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b)
   ret <4 x bfloat> %op
@@ -22380,17 +21882,15 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 0x7fff, v9
-; GFX8-NEXT:    v_and_b32_e32 v10, 0xff800000, v8
 ; GFX8-NEXT:    v_max_f32_e32 v3, v3, v7
-; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v10
+; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v8
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
 ; GFX8-NEXT:    v_bfe_u32 v7, v3, 16, 1
 ; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX8-NEXT:    v_cndmask_b32_e32 v8, v9, v10, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v3
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, s4, v7
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xff800000, v3
-; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v9
+; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v3
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v7, v9, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
@@ -22401,16 +21901,14 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s4, v9
-; GFX8-NEXT:    v_and_b32_e32 v10, 0xff800000, v7
 ; GFX8-NEXT:    v_max_f32_e32 v2, v2, v6
-; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v10
+; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v7
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
 ; GFX8-NEXT:    v_bfe_u32 v6, v2, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v7, v9, v10, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v2
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, s4, v6
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xff800000, v2
-; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v9
+; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v2
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v6, v9, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
@@ -22421,16 +21919,14 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s4, v9
-; GFX8-NEXT:    v_and_b32_e32 v10, 0xff800000, v6
 ; GFX8-NEXT:    v_max_f32_e32 v1, v1, v5
-; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v10
+; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v6
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX8-NEXT:    v_bfe_u32 v5, v1, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v1
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s4, v5
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xff800000, v1
-; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v9
+; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v1
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v9, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
@@ -22441,16 +21937,14 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s4, v9
-; GFX8-NEXT:    v_and_b32_e32 v10, 0xff800000, v5
 ; GFX8-NEXT:    v_max_f32_e32 v0, v0, v4
-; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v10
+; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v5
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX8-NEXT:    v_bfe_u32 v4, v0, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v0
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7fff, v4
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v9
+; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v9, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
@@ -22469,74 +21963,66 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
 ; GFX9-NEXT:    v_max_f32_e32 v8, v9, v8
-; GFX9-NEXT:    v_bfe_u32 v9, v8, 16, 1
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_and_b32_e32 v10, 0xff800000, v8
 ; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT:    v_bfe_u32 v9, v8, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_max_f32_e32 v3, v3, v7
 ; GFX9-NEXT:    v_add3_u32 v9, v9, v8, s4
-; GFX9-NEXT:    v_or_b32_e32 v10, 0x400000, v10
+; GFX9-NEXT:    v_or_b32_e32 v10, 0x400000, v8
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
-; GFX9-NEXT:    v_max_f32_e32 v3, v3, v7
-; GFX9-NEXT:    v_cndmask_b32_e32 v8, v9, v10, vcc
 ; GFX9-NEXT:    v_bfe_u32 v7, v3, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v9, 0xff800000, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v9, v10, vcc
 ; GFX9-NEXT:    v_add3_u32 v7, v7, v3, s4
-; GFX9-NEXT:    v_or_b32_e32 v9, 0x400000, v9
+; GFX9-NEXT:    v_or_b32_e32 v9, 0x400000, v3
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v9, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
 ; GFX9-NEXT:    v_max_f32_e32 v7, v9, v7
-; GFX9-NEXT:    v_bfe_u32 v9, v7, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v10, 0xff800000, v7
 ; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT:    v_bfe_u32 v9, v7, 16, 1
+; GFX9-NEXT:    v_max_f32_e32 v2, v2, v6
 ; GFX9-NEXT:    v_add3_u32 v9, v9, v7, s4
-; GFX9-NEXT:    v_or_b32_e32 v10, 0x400000, v10
+; GFX9-NEXT:    v_or_b32_e32 v10, 0x400000, v7
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
-; GFX9-NEXT:    v_max_f32_e32 v2, v2, v6
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, v9, v10, vcc
 ; GFX9-NEXT:    v_bfe_u32 v6, v2, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v9, 0xff800000, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v9, v10, vcc
 ; GFX9-NEXT:    v_add3_u32 v6, v6, v2, s4
-; GFX9-NEXT:    v_or_b32_e32 v9, 0x400000, v9
+; GFX9-NEXT:    v_or_b32_e32 v9, 0x400000, v2
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v6, v9, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
 ; GFX9-NEXT:    v_max_f32_e32 v6, v9, v6
-; GFX9-NEXT:    v_bfe_u32 v9, v6, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v10, 0xff800000, v6
 ; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_bfe_u32 v9, v6, 16, 1
+; GFX9-NEXT:    v_max_f32_e32 v1, v1, v5
 ; GFX9-NEXT:    v_add3_u32 v9, v9, v6, s4
-; GFX9-NEXT:    v_or_b32_e32 v10, 0x400000, v10
+; GFX9-NEXT:    v_or_b32_e32 v10, 0x400000, v6
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
-; GFX9-NEXT:    v_max_f32_e32 v1, v1, v5
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
 ; GFX9-NEXT:    v_bfe_u32 v5, v1, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v9, 0xff800000, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
 ; GFX9-NEXT:    v_add3_u32 v5, v5, v1, s4
-; GFX9-NEXT:    v_or_b32_e32 v9, 0x400000, v9
+; GFX9-NEXT:    v_or_b32_e32 v9, 0x400000, v1
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v9, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
 ; GFX9-NEXT:    v_max_f32_e32 v5, v9, v5
-; GFX9-NEXT:    v_bfe_u32 v9, v5, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v10, 0xff800000, v5
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_bfe_u32 v9, v5, 16, 1
+; GFX9-NEXT:    v_max_f32_e32 v0, v0, v4
 ; GFX9-NEXT:    v_add3_u32 v9, v9, v5, s4
-; GFX9-NEXT:    v_or_b32_e32 v10, 0x400000, v10
+; GFX9-NEXT:    v_or_b32_e32 v10, 0x400000, v5
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
-; GFX9-NEXT:    v_max_f32_e32 v0, v0, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
 ; GFX9-NEXT:    v_bfe_u32 v4, v0, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v9, 0xff800000, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
 ; GFX9-NEXT:    v_add3_u32 v4, v4, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v9, 0x400000, v9
+; GFX9-NEXT:    v_or_b32_e32 v9, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v9, vcc
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
@@ -22553,62 +22039,61 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
 ; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
 ; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX10-NEXT:    v_max_f32_e32 v8, v9, v8
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
-; GFX10-NEXT:    v_max_f32_e32 v3, v3, v7
 ; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
-; GFX10-NEXT:    v_bfe_u32 v10, v8, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v7, v8, s4, 0x400000
-; GFX10-NEXT:    v_max_f32_e32 v9, v11, v9
-; GFX10-NEXT:    v_bfe_u32 v11, v3, 16, 1
+; GFX10-NEXT:    v_max_f32_e32 v3, v3, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v0
+; GFX10-NEXT:    v_bfe_u32 v11, v8, 16, 1
+; GFX10-NEXT:    v_max_f32_e32 v7, v10, v9
+; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v8
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX10-NEXT:    v_add3_u32 v10, v10, v8, 0x7fff
 ; GFX10-NEXT:    v_max_f32_e32 v2, v2, v6
-; GFX10-NEXT:    v_bfe_u32 v8, v9, 16, 1
+; GFX10-NEXT:    v_add3_u32 v10, v11, v8, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v11, v3, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v12, v7, 16, 1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
-; GFX10-NEXT:    v_and_or_b32 v12, v9, s4, 0x400000
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v10, v7, vcc_lo
-; GFX10-NEXT:    v_add3_u32 v10, v11, v3, 0x7fff
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
 ; GFX10-NEXT:    v_bfe_u32 v13, v2, 16, 1
-; GFX10-NEXT:    v_add3_u32 v8, v8, v9, 0x7fff
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v10, v9, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
+; GFX10-NEXT:    v_add3_u32 v9, v11, v3, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v11, v12, v7, 0x7fff
+; GFX10-NEXT:    v_or_b32_e32 v12, 0x400000, v7
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX10-NEXT:    v_max_f32_e32 v6, v10, v6
+; GFX10-NEXT:    v_add3_u32 v10, v13, v2, 0x7fff
 ; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT:    v_max_f32_e32 v6, v11, v6
-; GFX10-NEXT:    v_add3_u32 v9, v13, v2, 0x7fff
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v8, v12, vcc_lo
-; GFX10-NEXT:    v_and_or_b32 v11, v2, s4, 0x400000
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v11, v12, vcc_lo
+; GFX10-NEXT:    v_or_b32_e32 v11, 0x400000, v2
 ; GFX10-NEXT:    v_bfe_u32 v12, v6, 16, 1
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
 ; GFX10-NEXT:    v_max_f32_e32 v1, v1, v5
 ; GFX10-NEXT:    v_max_f32_e32 v5, v15, v13
-; GFX10-NEXT:    v_and_or_b32 v14, v3, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v14, 0x400000, v3
 ; GFX10-NEXT:    v_max_f32_e32 v0, v0, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v9, v11, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v10, v11, vcc_lo
 ; GFX10-NEXT:    v_add3_u32 v4, v12, v6, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v9, v6, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v10, 0x400000, v6
 ; GFX10-NEXT:    v_bfe_u32 v11, v1, 16, 1
 ; GFX10-NEXT:    v_bfe_u32 v12, v5, 16, 1
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX10-NEXT:    v_bfe_u32 v13, v0, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v15, v1, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v15, 0x400000, v1
 ; GFX10-NEXT:    v_add3_u32 v6, v11, v1, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v11, v5, s4, 0x400000
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v9, vcc_lo
-; GFX10-NEXT:    v_add3_u32 v9, v12, v5, 0x7fff
+; GFX10-NEXT:    v_or_b32_e32 v11, 0x400000, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v10, vcc_lo
+; GFX10-NEXT:    v_add3_u32 v10, v12, v5, 0x7fff
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX10-NEXT:    v_add3_u32 v12, v13, v0, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v13, v0, s4, 0x400000
-; GFX10-NEXT:    v_perm_b32 v2, v2, v8, 0x7060302
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v9, v11, vcc_lo
+; GFX10-NEXT:    v_or_b32_e32 v13, 0x400000, v0
+; GFX10-NEXT:    v_perm_b32 v2, v2, v7, 0x7060302
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v10, v11, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v12, v13, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
@@ -22616,81 +22101,80 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v6, v15, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX10-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v10, v14, vcc_lo
-; GFX10-NEXT:    v_perm_b32 v3, v3, v7, 0x7060302
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v9, v14, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v3, v3, v8, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_maxnum_v8bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v15, 16, v0
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX11-NEXT:    s_mov_b32 s0, 0xff800000
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_max_f32 v8, v9, v8 :: v_dual_lshlrev_b32 v9, 16, v6
-; GFX11-NEXT:    v_bfe_u32 v10, v8, 16, 1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v15, 16, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_max_f32 v8, v9, v8 :: v_dual_and_b32 v7, 0xffff0000, v7
+; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_max_f32_e32 v9, v11, v9
-; GFX11-NEXT:    v_add3_u32 v10, v10, v8, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_or_b32 v12, v9, s0, 0x400000
-; GFX11-NEXT:    v_max_f32_e32 v2, v2, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_bfe_u32 v13, v2, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v11, v8, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_max_f32_e32 v3, v3, v7
-; GFX11-NEXT:    v_and_or_b32 v7, v8, s0, 0x400000
-; GFX11-NEXT:    v_bfe_u32 v8, v9, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v7, v10, v7, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v8, v8, v9, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT:    v_add3_u32 v9, v13, v2, 0x7fff
-; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-NEXT:    v_max_f32_e32 v7, v10, v9
+; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v8
+; GFX11-NEXT:    v_add3_u32 v10, v11, v8, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_bfe_u32 v11, v3, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v8, v8, v12, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT:    v_and_or_b32 v14, v3, s0, 0x400000
-; GFX11-NEXT:    v_max_f32_e32 v0, v0, v4
-; GFX11-NEXT:    v_add3_u32 v10, v11, v3, 0x7fff
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_max_f32 v6, v11, v6 :: v_dual_and_b32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_and_or_b32 v11, v2, s0, 0x400000
+; GFX11-NEXT:    v_bfe_u32 v12, v7, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v14, 0x400000, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v8, v10, v9, vcc_lo
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-NEXT:    v_add3_u32 v9, v11, v3, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v11, v12, v7, 0x7fff
+; GFX11-NEXT:    v_or_b32_e32 v12, 0x400000, v7
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_cndmask_b32 v7, v11, v12 :: v_dual_max_f32 v2, v2, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-NEXT:    v_bfe_u32 v13, v2, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_max_f32_e32 v6, v10, v6
+; GFX11-NEXT:    v_or_b32_e32 v11, 0x400000, v2
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT:    v_add3_u32 v10, v13, v2, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_bfe_u32 v12, v6, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v9, v11, vcc_lo
-; GFX11-NEXT:    v_and_or_b32 v9, v6, s0, 0x400000
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v10, v11, vcc_lo
+; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v6
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT:    v_max_f32_e32 v1, v1, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_perm_b32 v2, v2, v7, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_max_f32_e32 v0, v0, v4
 ; GFX11-NEXT:    v_add3_u32 v4, v12, v6, 0x7fff
-; GFX11-NEXT:    v_perm_b32 v2, v2, v8, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v11, v1, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v4, v4, v9, vcc_lo
+; GFX11-NEXT:    v_dual_max_f32 v1, v1, v5 :: v_dual_cndmask_b32 v4, v4, v10
 ; GFX11-NEXT:    v_max_f32_e32 v5, v15, v13
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_bfe_u32 v11, v1, 16, 1
 ; GFX11-NEXT:    v_bfe_u32 v13, v0, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v15, v1, s0, 0x400000
-; GFX11-NEXT:    v_add3_u32 v6, v11, v1, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_or_b32_e32 v15, 0x400000, v1
 ; GFX11-NEXT:    v_bfe_u32 v12, v5, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v11, v5, s0, 0x400000
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_add3_u32 v6, v11, v1, 0x7fff
+; GFX11-NEXT:    v_or_b32_e32 v11, 0x400000, v5
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT:    v_add3_u32 v9, v12, v5, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v10, v12, v5, 0x7fff
 ; GFX11-NEXT:    v_add3_u32 v12, v13, v0, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v13, v0, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v13, 0x400000, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v5, v9, v11, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e32 v5, v10, v11, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v12, v13, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
@@ -22699,9 +22183,9 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v6, v15, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX11-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v10, v14, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, v9, v14, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v3, v3, v7, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v3, v3, v8, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call <8 x bfloat> @llvm.maxnum.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b)
   ret <8 x bfloat> %op
@@ -22990,16 +22474,14 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
 ; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
 ; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xff800000, v16
 ; GFX8-NEXT:    v_max_f32_e32 v7, v7, v15
-; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v18
+; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v16
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v16, v16
 ; GFX8-NEXT:    v_bfe_u32 v15, v7, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v16, v17, v18, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v15, vcc, v15, v7
 ; GFX8-NEXT:    v_add_u32_e32 v15, vcc, s4, v15
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xff800000, v7
-; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v17
+; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v7
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
 ; GFX8-NEXT:    v_cndmask_b32_e32 v7, v15, v17, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
@@ -23010,16 +22492,14 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
 ; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xff800000, v15
 ; GFX8-NEXT:    v_max_f32_e32 v6, v6, v14
-; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v18
+; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v15
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v15, v15
 ; GFX8-NEXT:    v_bfe_u32 v14, v6, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v15, v17, v18, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v14, vcc, v14, v6
 ; GFX8-NEXT:    v_add_u32_e32 v14, vcc, s4, v14
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xff800000, v6
-; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v17
+; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v6
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX8-NEXT:    v_cndmask_b32_e32 v6, v14, v17, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v13
@@ -23030,16 +22510,14 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
 ; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xff800000, v14
 ; GFX8-NEXT:    v_max_f32_e32 v5, v5, v13
-; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v18
+; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v14
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v14, v14
 ; GFX8-NEXT:    v_bfe_u32 v13, v5, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v14, v17, v18, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v13, vcc, v13, v5
 ; GFX8-NEXT:    v_add_u32_e32 v13, vcc, s4, v13
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xff800000, v5
-; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v17
+; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v5
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v13, v17, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
@@ -23050,16 +22528,14 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
 ; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
 ; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xff800000, v13
 ; GFX8-NEXT:    v_max_f32_e32 v4, v4, v12
-; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v18
+; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v13
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
 ; GFX8-NEXT:    v_bfe_u32 v12, v4, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v13, v17, v18, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v12, vcc, v12, v4
 ; GFX8-NEXT:    v_add_u32_e32 v12, vcc, s4, v12
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xff800000, v4
-; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v17
+; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v4
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v12, v17, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
@@ -23070,16 +22546,14 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xff800000, v12
 ; GFX8-NEXT:    v_max_f32_e32 v3, v3, v11
-; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v18
+; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v12
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
 ; GFX8-NEXT:    v_bfe_u32 v11, v3, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v12, v17, v18, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v11, vcc, v11, v3
 ; GFX8-NEXT:    v_add_u32_e32 v11, vcc, s4, v11
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xff800000, v3
-; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v17
+; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v3
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v11, v17, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
@@ -23090,16 +22564,14 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xff800000, v11
 ; GFX8-NEXT:    v_max_f32_e32 v2, v2, v10
-; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v18
+; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v11
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
 ; GFX8-NEXT:    v_bfe_u32 v10, v2, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v11, v17, v18, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v10, v2
 ; GFX8-NEXT:    v_add_u32_e32 v10, vcc, s4, v10
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xff800000, v2
-; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v17
+; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v2
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v10, v17, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
@@ -23110,16 +22582,14 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xff800000, v10
 ; GFX8-NEXT:    v_max_f32_e32 v1, v1, v9
-; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v18
+; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v10
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
 ; GFX8-NEXT:    v_bfe_u32 v9, v1, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v10, v17, v18, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v1
 ; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s4, v9
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xff800000, v1
-; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v17
+; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v1
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v9, v17, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v8
@@ -23130,16 +22600,14 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xff800000, v9
 ; GFX8-NEXT:    v_max_f32_e32 v0, v0, v8
-; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v18
+; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v9
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
 ; GFX8-NEXT:    v_bfe_u32 v8, v0, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v9, v17, v18, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v0
 ; GFX8-NEXT:    v_add_u32_e32 v8, vcc, s4, v8
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v17
+; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v8, v17, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
@@ -23166,146 +22634,130 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
 ; GFX9-NEXT:    v_max_f32_e32 v16, v17, v16
-; GFX9-NEXT:    v_bfe_u32 v17, v16, 16, 1
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_and_b32_e32 v18, 0xff800000, v16
 ; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
 ; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX9-NEXT:    v_bfe_u32 v17, v16, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_max_f32_e32 v7, v7, v15
 ; GFX9-NEXT:    v_add3_u32 v17, v17, v16, s4
-; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v18
+; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v16
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v16, v16
-; GFX9-NEXT:    v_max_f32_e32 v7, v7, v15
-; GFX9-NEXT:    v_cndmask_b32_e32 v16, v17, v18, vcc
 ; GFX9-NEXT:    v_bfe_u32 v15, v7, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xff800000, v7
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v17, v18, vcc
 ; GFX9-NEXT:    v_add3_u32 v15, v15, v7, s4
-; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v17
+; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v7
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
 ; GFX9-NEXT:    v_cndmask_b32_e32 v7, v15, v17, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v6
 ; GFX9-NEXT:    v_max_f32_e32 v15, v17, v15
-; GFX9-NEXT:    v_bfe_u32 v17, v15, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v18, 0xff800000, v15
 ; GFX9-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX9-NEXT:    v_bfe_u32 v17, v15, 16, 1
+; GFX9-NEXT:    v_max_f32_e32 v6, v6, v14
 ; GFX9-NEXT:    v_add3_u32 v17, v17, v15, s4
-; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v18
+; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v15
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v15, v15
-; GFX9-NEXT:    v_max_f32_e32 v6, v6, v14
-; GFX9-NEXT:    v_cndmask_b32_e32 v15, v17, v18, vcc
 ; GFX9-NEXT:    v_bfe_u32 v14, v6, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xff800000, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v15, v17, v18, vcc
 ; GFX9-NEXT:    v_add3_u32 v14, v14, v6, s4
-; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v17
+; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v6
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v14, v17, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v14, 16, v13
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v5
 ; GFX9-NEXT:    v_max_f32_e32 v14, v17, v14
-; GFX9-NEXT:    v_bfe_u32 v17, v14, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v18, 0xff800000, v14
 ; GFX9-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
 ; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX9-NEXT:    v_bfe_u32 v17, v14, 16, 1
+; GFX9-NEXT:    v_max_f32_e32 v5, v5, v13
 ; GFX9-NEXT:    v_add3_u32 v17, v17, v14, s4
-; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v18
+; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v14
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v14, v14
-; GFX9-NEXT:    v_max_f32_e32 v5, v5, v13
-; GFX9-NEXT:    v_cndmask_b32_e32 v14, v17, v18, vcc
 ; GFX9-NEXT:    v_bfe_u32 v13, v5, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xff800000, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v14, v17, v18, vcc
 ; GFX9-NEXT:    v_add3_u32 v13, v13, v5, s4
-; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v17
+; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v5
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v13, v17, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v4
 ; GFX9-NEXT:    v_max_f32_e32 v13, v17, v13
-; GFX9-NEXT:    v_bfe_u32 v17, v13, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v18, 0xff800000, v13
 ; GFX9-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX9-NEXT:    v_bfe_u32 v17, v13, 16, 1
+; GFX9-NEXT:    v_max_f32_e32 v4, v4, v12
 ; GFX9-NEXT:    v_add3_u32 v17, v17, v13, s4
-; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v18
+; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v13
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
-; GFX9-NEXT:    v_max_f32_e32 v4, v4, v12
-; GFX9-NEXT:    v_cndmask_b32_e32 v13, v17, v18, vcc
 ; GFX9-NEXT:    v_bfe_u32 v12, v4, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xff800000, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v13, v17, v18, vcc
 ; GFX9-NEXT:    v_add3_u32 v12, v12, v4, s4
-; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v17
+; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v4
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v12, v17, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v3
 ; GFX9-NEXT:    v_max_f32_e32 v12, v17, v12
-; GFX9-NEXT:    v_bfe_u32 v17, v12, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v18, 0xff800000, v12
 ; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT:    v_bfe_u32 v17, v12, 16, 1
+; GFX9-NEXT:    v_max_f32_e32 v3, v3, v11
 ; GFX9-NEXT:    v_add3_u32 v17, v17, v12, s4
-; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v18
+; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v12
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
-; GFX9-NEXT:    v_max_f32_e32 v3, v3, v11
-; GFX9-NEXT:    v_cndmask_b32_e32 v12, v17, v18, vcc
 ; GFX9-NEXT:    v_bfe_u32 v11, v3, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xff800000, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v12, v17, v18, vcc
 ; GFX9-NEXT:    v_add3_u32 v11, v11, v3, s4
-; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v17
+; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v3
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v11, v17, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v2
 ; GFX9-NEXT:    v_max_f32_e32 v11, v17, v11
-; GFX9-NEXT:    v_bfe_u32 v17, v11, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v18, 0xff800000, v11
 ; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT:    v_bfe_u32 v17, v11, 16, 1
+; GFX9-NEXT:    v_max_f32_e32 v2, v2, v10
 ; GFX9-NEXT:    v_add3_u32 v17, v17, v11, s4
-; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v18
+; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v11
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
-; GFX9-NEXT:    v_max_f32_e32 v2, v2, v10
-; GFX9-NEXT:    v_cndmask_b32_e32 v11, v17, v18, vcc
 ; GFX9-NEXT:    v_bfe_u32 v10, v2, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xff800000, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v11, v17, v18, vcc
 ; GFX9-NEXT:    v_add3_u32 v10, v10, v2, s4
-; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v17
+; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v2
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v10, v17, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v1
 ; GFX9-NEXT:    v_max_f32_e32 v10, v17, v10
-; GFX9-NEXT:    v_bfe_u32 v17, v10, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v18, 0xff800000, v10
 ; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_bfe_u32 v17, v10, 16, 1
+; GFX9-NEXT:    v_max_f32_e32 v1, v1, v9
 ; GFX9-NEXT:    v_add3_u32 v17, v17, v10, s4
-; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v18
+; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v10
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
-; GFX9-NEXT:    v_max_f32_e32 v1, v1, v9
-; GFX9-NEXT:    v_cndmask_b32_e32 v10, v17, v18, vcc
 ; GFX9-NEXT:    v_bfe_u32 v9, v1, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xff800000, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v10, v17, v18, vcc
 ; GFX9-NEXT:    v_add3_u32 v9, v9, v1, s4
-; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v17
+; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v1
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v9, v17, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v8
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
 ; GFX9-NEXT:    v_max_f32_e32 v9, v17, v9
-; GFX9-NEXT:    v_bfe_u32 v17, v9, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v18, 0xff800000, v9
 ; GFX9-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_bfe_u32 v17, v9, 16, 1
+; GFX9-NEXT:    v_max_f32_e32 v0, v0, v8
 ; GFX9-NEXT:    v_add3_u32 v17, v17, v9, s4
-; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v18
+; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v9
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
-; GFX9-NEXT:    v_max_f32_e32 v0, v0, v8
-; GFX9-NEXT:    v_cndmask_b32_e32 v9, v17, v18, vcc
 ; GFX9-NEXT:    v_bfe_u32 v8, v0, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xff800000, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v9, v17, v18, vcc
 ; GFX9-NEXT:    v_add3_u32 v8, v8, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v17
+; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v8, v17, vcc
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
@@ -23326,27 +22778,26 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
 ; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
 ; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v6
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
 ; GFX10-NEXT:    v_max_f32_e32 v16, v17, v16
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v14
 ; GFX10-NEXT:    v_max_f32_e32 v7, v7, v15
 ; GFX10-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
-; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
 ; GFX10-NEXT:    v_bfe_u32 v15, v16, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v20, v16, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v20, 0x400000, v16
 ; GFX10-NEXT:    v_bfe_u32 v19, v7, 16, 1
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
 ; GFX10-NEXT:    v_max_f32_e32 v17, v18, v17
 ; GFX10-NEXT:    v_add3_u32 v15, v15, v16, 0x7fff
 ; GFX10-NEXT:    v_max_f32_e32 v6, v6, v14
 ; GFX10-NEXT:    v_add3_u32 v18, v19, v7, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v19, v7, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v19, 0x400000, v7
 ; GFX10-NEXT:    v_bfe_u32 v21, v17, 16, 1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v15, v15, v20, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v5
-; GFX10-NEXT:    v_and_or_b32 v16, v17, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v16, 0x400000, v17
 ; GFX10-NEXT:    v_add3_u32 v14, v21, v17, 0x7fff
 ; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; GFX10-NEXT:    v_cndmask_b32_e32 v7, v18, v19, vcc_lo
@@ -23360,7 +22811,7 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX10-NEXT:    v_max_f32_e32 v5, v5, v13
 ; GFX10-NEXT:    v_cndmask_b32_e32 v14, v14, v16, vcc_lo
 ; GFX10-NEXT:    v_add3_u32 v16, v18, v6, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v13, v6, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v13, 0x400000, v6
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v12
 ; GFX10-NEXT:    v_bfe_u32 v20, v17, 16, 1
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
@@ -23370,10 +22821,10 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX10-NEXT:    v_cndmask_b32_e32 v6, v16, v13, vcc_lo
 ; GFX10-NEXT:    v_max_f32_e32 v13, v19, v18
 ; GFX10-NEXT:    v_add3_u32 v16, v20, v17, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v18, v17, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v18, 0x400000, v17
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
 ; GFX10-NEXT:    v_add3_u32 v19, v21, v5, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v20, v5, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v20, 0x400000, v5
 ; GFX10-NEXT:    v_bfe_u32 v21, v13, 16, 1
 ; GFX10-NEXT:    v_max_f32_e32 v4, v4, v12
 ; GFX10-NEXT:    v_cndmask_b32_e32 v16, v16, v18, vcc_lo
@@ -23383,14 +22834,14 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX10-NEXT:    v_add3_u32 v17, v21, v13, 0x7fff
 ; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
 ; GFX10-NEXT:    v_cndmask_b32_e32 v5, v19, v20, vcc_lo
-; GFX10-NEXT:    v_and_or_b32 v19, v13, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v19, 0x400000, v13
 ; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX10-NEXT:    v_max_f32_e32 v12, v18, v12
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
 ; GFX10-NEXT:    v_bfe_u32 v20, v4, 16, 1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v10
 ; GFX10-NEXT:    v_max_f32_e32 v3, v3, v11
-; GFX10-NEXT:    v_and_or_b32 v22, v12, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v22, 0x400000, v12
 ; GFX10-NEXT:    v_cndmask_b32_e32 v13, v17, v19, vcc_lo
 ; GFX10-NEXT:    v_bfe_u32 v17, v12, 16, 1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v2
@@ -23402,12 +22853,12 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
 ; GFX10-NEXT:    v_add3_u32 v19, v20, v3, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v20, v3, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v20, 0x400000, v3
 ; GFX10-NEXT:    v_bfe_u32 v23, v18, 16, 1
 ; GFX10-NEXT:    v_max_f32_e32 v2, v2, v10
 ; GFX10-NEXT:    v_cndmask_b32_e32 v12, v17, v22, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX10-NEXT:    v_and_or_b32 v17, v18, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v17, 0x400000, v18
 ; GFX10-NEXT:    v_add3_u32 v10, v23, v18, 0x7fff
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v1
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
@@ -23416,8 +22867,8 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v9
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
 ; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX10-NEXT:    v_and_or_b32 v18, v2, s4, 0x400000
-; GFX10-NEXT:    v_and_or_b32 v21, v4, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v18, 0x400000, v2
+; GFX10-NEXT:    v_or_b32_e32 v21, 0x400000, v4
 ; GFX10-NEXT:    v_perm_b32 v3, v3, v12, 0x7060302
 ; GFX10-NEXT:    v_cndmask_b32_e32 v10, v10, v17, vcc_lo
 ; GFX10-NEXT:    v_add3_u32 v17, v19, v2, 0x7fff
@@ -23429,17 +22880,17 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX10-NEXT:    v_bfe_u32 v23, v19, 16, 1
 ; GFX10-NEXT:    v_max_f32_e32 v1, v1, v9
 ; GFX10-NEXT:    v_max_f32_e32 v9, v22, v20
-; GFX10-NEXT:    v_and_or_b32 v22, v19, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v22, 0x400000, v19
 ; GFX10-NEXT:    v_max_f32_e32 v0, v0, v8
 ; GFX10-NEXT:    v_add3_u32 v20, v23, v19, 0x7fff
 ; GFX10-NEXT:    v_bfe_u32 v8, v1, 16, 1
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
 ; GFX10-NEXT:    v_bfe_u32 v23, v9, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v24, v9, s4, 0x400000
-; GFX10-NEXT:    v_and_or_b32 v25, v0, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v24, 0x400000, v9
+; GFX10-NEXT:    v_or_b32_e32 v25, 0x400000, v0
 ; GFX10-NEXT:    v_add3_u32 v8, v8, v1, 0x7fff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v19, v20, v22, vcc_lo
-; GFX10-NEXT:    v_and_or_b32 v22, v1, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v22, 0x400000, v1
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
 ; GFX10-NEXT:    v_bfe_u32 v20, v0, 16, 1
 ; GFX10-NEXT:    v_add3_u32 v23, v23, v9, 0x7fff
@@ -23468,12 +22919,11 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
 ; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX11-NEXT:    s_mov_b32 s0, 0xff800000
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_max_f32 v16, v17, v16 :: v_dual_lshlrev_b32 v17, 16, v14
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_max_f32 v16, v17, v16 :: v_dual_and_b32 v15, 0xffff0000, v15
+; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v14
 ; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
-; GFX11-NEXT:    v_and_or_b32 v20, v16, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v16
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_max_f32_e32 v17, v18, v17
 ; GFX11-NEXT:    v_max_f32_e32 v6, v6, v14
@@ -23486,13 +22936,13 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX11-NEXT:    v_max_f32_e32 v7, v7, v15
 ; GFX11-NEXT:    v_bfe_u32 v15, v16, 16, 1
 ; GFX11-NEXT:    v_add3_u32 v15, v15, v16, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v16, v17, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v16, 0x400000, v17
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_dual_cndmask_b32 v15, v15, v20 :: v_dual_lshlrev_b32 v20, 16, v5
 ; GFX11-NEXT:    v_bfe_u32 v19, v7, 16, 1
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
 ; GFX11-NEXT:    v_add3_u32 v18, v19, v7, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v19, v7, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v19, 0x400000, v7
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v7, v18, v19, vcc_lo
 ; GFX11-NEXT:    v_bfe_u32 v18, v6, 16, 1
@@ -23514,32 +22964,32 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
 ; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; GFX11-NEXT:    v_max_f32_e32 v5, v5, v13
-; GFX11-NEXT:    v_and_or_b32 v13, v6, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v13, 0x400000, v6
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_dual_cndmask_b32 v6, v16, v13 :: v_dual_max_f32 v13, v19, v18
 ; GFX11-NEXT:    v_add3_u32 v16, v20, v17, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v18, v17, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v18, 0x400000, v17
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
 ; GFX11-NEXT:    v_perm_b32 v6, v6, v14, 0x7060302
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v16, v16, v18, vcc_lo
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v3
 ; GFX11-NEXT:    v_bfe_u32 v21, v5, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v20, v5, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v5
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX11-NEXT:    v_max_f32_e32 v12, v18, v12
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_add3_u32 v19, v21, v5, 0x7fff
 ; GFX11-NEXT:    v_bfe_u32 v21, v13, 16, 1
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v10
-; GFX11-NEXT:    v_and_or_b32 v22, v12, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v22, 0x400000, v12
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v5, v19, v20, vcc_lo
 ; GFX11-NEXT:    v_add3_u32 v17, v21, v13, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v19, v13, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v19, 0x400000, v13
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
 ; GFX11-NEXT:    v_bfe_u32 v20, v4, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v21, v4, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v21, 0x400000, v4
 ; GFX11-NEXT:    v_perm_b32 v5, v5, v16, 0x7060302
 ; GFX11-NEXT:    v_cndmask_b32_e32 v13, v17, v19, vcc_lo
 ; GFX11-NEXT:    v_bfe_u32 v17, v12, 16, 1
@@ -23555,7 +23005,7 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
 ; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX11-NEXT:    v_bfe_u32 v23, v18, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v17, v18, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v17, 0x400000, v18
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_dual_max_f32 v2, v2, v10 :: v_dual_and_b32 v1, 0xffff0000, v1
 ; GFX11-NEXT:    v_max_f32_e32 v3, v3, v11
@@ -23565,13 +23015,13 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX11-NEXT:    v_bfe_u32 v20, v3, 16, 1
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX11-NEXT:    v_add3_u32 v19, v20, v3, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v20, v3, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v3, v19, v20, vcc_lo
 ; GFX11-NEXT:    v_bfe_u32 v19, v2, 16, 1
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v9
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT:    v_and_or_b32 v18, v2, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v18, 0x400000, v2
 ; GFX11-NEXT:    v_perm_b32 v3, v3, v12, 0x7060302
 ; GFX11-NEXT:    v_cndmask_b32_e32 v10, v10, v17, vcc_lo
 ; GFX11-NEXT:    v_add3_u32 v17, v19, v2, 0x7fff
@@ -23588,13 +23038,13 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX11-NEXT:    v_max_f32_e32 v9, v22, v20
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_add3_u32 v20, v23, v19, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v22, v19, s0, 0x400000
-; GFX11-NEXT:    v_and_or_b32 v25, v0, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v22, 0x400000, v19
+; GFX11-NEXT:    v_or_b32_e32 v25, 0x400000, v0
 ; GFX11-NEXT:    v_bfe_u32 v8, v1, 16, 1
 ; GFX11-NEXT:    v_bfe_u32 v23, v9, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v24, v9, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v24, 0x400000, v9
 ; GFX11-NEXT:    v_cndmask_b32_e32 v19, v20, v22, vcc_lo
-; GFX11-NEXT:    v_and_or_b32 v22, v1, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v22, 0x400000, v1
 ; GFX11-NEXT:    v_add3_u32 v8, v8, v1, 0x7fff
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
 ; GFX11-NEXT:    v_bfe_u32 v20, v0, 16, 1
@@ -24289,16 +23739,14 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
 ; GFX8-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; GFX8-NEXT:    v_add_u32_e32 v32, vcc, s4, v32
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xff800000, v31
 ; GFX8-NEXT:    v_max_f32_e32 v14, v14, v30
-; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v31
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v31, v31
 ; GFX8-NEXT:    v_bfe_u32 v30, v14, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v31, v32, v33, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v30, vcc, v30, v14
 ; GFX8-NEXT:    v_add_u32_e32 v30, vcc, s4, v30
-; GFX8-NEXT:    v_and_b32_e32 v32, 0xff800000, v14
-; GFX8-NEXT:    v_or_b32_e32 v32, 0x400000, v32
+; GFX8-NEXT:    v_or_b32_e32 v32, 0x400000, v14
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v14, v14
 ; GFX8-NEXT:    v_cndmask_b32_e32 v14, v30, v32, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v30, 16, v29
@@ -24320,29 +23768,25 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-NEXT:    v_max_f32_e32 v30, v15, v30
 ; GFX8-NEXT:    v_bfe_u32 v15, v33, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v15, vcc, v15, v33
-; GFX8-NEXT:    v_and_b32_e32 v34, 0xff800000, v33
 ; GFX8-NEXT:    v_add_u32_e32 v15, vcc, s4, v15
-; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v33
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v33, v33
 ; GFX8-NEXT:    v_bfe_u32 v33, v30, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v15, v15, v34, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v33, vcc, v33, v30
-; GFX8-NEXT:    v_and_b32_e32 v34, 0xff800000, v30
 ; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v30
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v30, v30
 ; GFX8-NEXT:    v_cndmask_b32_e32 v30, v33, v34, vcc
 ; GFX8-NEXT:    v_bfe_u32 v33, v32, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v33, vcc, v33, v32
 ; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT:    v_and_b32_e32 v34, 0xff800000, v32
-; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v32
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
 ; GFX8-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v29, vcc, v29, v13
 ; GFX8-NEXT:    v_add_u32_e32 v29, vcc, s4, v29
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xff800000, v13
-; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v13
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
 ; GFX8-NEXT:    v_cndmask_b32_e32 v13, v29, v33, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v29, 16, v28
@@ -24353,16 +23797,14 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
 ; GFX8-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
 ; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT:    v_and_b32_e32 v34, 0xff800000, v29
 ; GFX8-NEXT:    v_max_f32_e32 v12, v12, v28
-; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v29
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v29, v29
 ; GFX8-NEXT:    v_bfe_u32 v28, v12, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v29, v33, v34, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v28, vcc, v28, v12
 ; GFX8-NEXT:    v_add_u32_e32 v28, vcc, s4, v28
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xff800000, v12
-; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v12
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
 ; GFX8-NEXT:    v_cndmask_b32_e32 v12, v28, v33, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
@@ -24373,16 +23815,14 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
 ; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
 ; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT:    v_and_b32_e32 v34, 0xff800000, v28
 ; GFX8-NEXT:    v_max_f32_e32 v11, v11, v27
-; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v28
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v28, v28
 ; GFX8-NEXT:    v_bfe_u32 v27, v11, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v28, v33, v34, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v27, vcc, v27, v11
 ; GFX8-NEXT:    v_add_u32_e32 v27, vcc, s4, v27
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xff800000, v11
-; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v11
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
 ; GFX8-NEXT:    v_cndmask_b32_e32 v11, v27, v33, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v27, 16, v26
@@ -24393,16 +23833,14 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
 ; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
 ; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT:    v_and_b32_e32 v34, 0xff800000, v27
 ; GFX8-NEXT:    v_max_f32_e32 v10, v10, v26
-; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v27
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v27, v27
 ; GFX8-NEXT:    v_bfe_u32 v26, v10, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v27, v33, v34, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v26, vcc, v26, v10
 ; GFX8-NEXT:    v_add_u32_e32 v26, vcc, s4, v26
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xff800000, v10
-; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v10
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
 ; GFX8-NEXT:    v_cndmask_b32_e32 v10, v26, v33, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v26, 16, v25
@@ -24413,16 +23851,14 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
 ; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
 ; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT:    v_and_b32_e32 v34, 0xff800000, v26
 ; GFX8-NEXT:    v_max_f32_e32 v9, v9, v25
-; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v26
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v26, v26
 ; GFX8-NEXT:    v_bfe_u32 v25, v9, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v26, v33, v34, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v25, vcc, v25, v9
 ; GFX8-NEXT:    v_add_u32_e32 v25, vcc, s4, v25
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xff800000, v9
-; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v9
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
 ; GFX8-NEXT:    v_cndmask_b32_e32 v9, v25, v33, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
@@ -24433,16 +23869,14 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
 ; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
 ; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT:    v_and_b32_e32 v34, 0xff800000, v25
 ; GFX8-NEXT:    v_max_f32_e32 v8, v8, v24
-; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v25
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v25, v25
 ; GFX8-NEXT:    v_bfe_u32 v24, v8, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v25, v33, v34, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v24, vcc, v24, v8
 ; GFX8-NEXT:    v_add_u32_e32 v24, vcc, s4, v24
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xff800000, v8
-; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v8
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
 ; GFX8-NEXT:    v_cndmask_b32_e32 v8, v24, v33, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
@@ -24453,16 +23887,14 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
 ; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
 ; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT:    v_and_b32_e32 v34, 0xff800000, v24
 ; GFX8-NEXT:    v_max_f32_e32 v7, v7, v23
-; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v24
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v24, v24
 ; GFX8-NEXT:    v_bfe_u32 v23, v7, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v24, v33, v34, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v23, vcc, v23, v7
 ; GFX8-NEXT:    v_add_u32_e32 v23, vcc, s4, v23
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xff800000, v7
-; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v7
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
 ; GFX8-NEXT:    v_cndmask_b32_e32 v7, v23, v33, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
@@ -24473,16 +23905,14 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
 ; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
 ; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT:    v_and_b32_e32 v34, 0xff800000, v23
 ; GFX8-NEXT:    v_max_f32_e32 v6, v6, v22
-; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v23
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v23, v23
 ; GFX8-NEXT:    v_bfe_u32 v22, v6, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v23, v33, v34, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v22, vcc, v22, v6
 ; GFX8-NEXT:    v_add_u32_e32 v22, vcc, s4, v22
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xff800000, v6
-; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v6
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX8-NEXT:    v_cndmask_b32_e32 v6, v22, v33, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
@@ -24493,16 +23923,14 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
 ; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT:    v_and_b32_e32 v34, 0xff800000, v22
 ; GFX8-NEXT:    v_max_f32_e32 v5, v5, v21
-; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v22
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v22, v22
 ; GFX8-NEXT:    v_bfe_u32 v21, v5, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v22, v33, v34, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v21, vcc, v21, v5
 ; GFX8-NEXT:    v_add_u32_e32 v21, vcc, s4, v21
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xff800000, v5
-; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v5
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v21, v33, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
@@ -24513,16 +23941,14 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
 ; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
 ; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT:    v_and_b32_e32 v34, 0xff800000, v21
 ; GFX8-NEXT:    v_max_f32_e32 v4, v4, v20
-; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v21
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v21, v21
 ; GFX8-NEXT:    v_bfe_u32 v20, v4, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v21, v33, v34, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v20, vcc, v20, v4
 ; GFX8-NEXT:    v_add_u32_e32 v20, vcc, s4, v20
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xff800000, v4
-; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v4
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v20, v33, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
@@ -24533,16 +23959,14 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT:    v_and_b32_e32 v34, 0xff800000, v20
 ; GFX8-NEXT:    v_max_f32_e32 v3, v3, v19
-; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v20
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v20, v20
 ; GFX8-NEXT:    v_bfe_u32 v19, v3, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v20, v33, v34, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v19, vcc, v19, v3
 ; GFX8-NEXT:    v_add_u32_e32 v19, vcc, s4, v19
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xff800000, v3
-; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v3
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v19, v33, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
@@ -24553,16 +23977,14 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT:    v_and_b32_e32 v34, 0xff800000, v19
 ; GFX8-NEXT:    v_max_f32_e32 v2, v2, v18
-; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v19
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v19, v19
 ; GFX8-NEXT:    v_bfe_u32 v18, v2, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v19, v33, v34, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v18, vcc, v18, v2
 ; GFX8-NEXT:    v_add_u32_e32 v18, vcc, s4, v18
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xff800000, v2
-; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v2
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v18, v33, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
@@ -24573,16 +23995,14 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT:    v_and_b32_e32 v34, 0xff800000, v18
 ; GFX8-NEXT:    v_max_f32_e32 v1, v1, v17
-; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v18
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v18, v18
 ; GFX8-NEXT:    v_bfe_u32 v17, v1, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v18, v33, v34, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v1
 ; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xff800000, v1
-; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v1
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v17, v33, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v16
@@ -24593,16 +24013,14 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT:    v_and_b32_e32 v34, 0xff800000, v17
 ; GFX8-NEXT:    v_max_f32_e32 v0, v0, v16
-; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v17
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
 ; GFX8-NEXT:    v_bfe_u32 v16, v0, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v17, v33, v34, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v16, vcc, v16, v0
 ; GFX8-NEXT:    v_add_u32_e32 v16, vcc, s4, v16
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v16, v33, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -24643,292 +24061,260 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v31, 16, v30
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v14
 ; GFX9-NEXT:    v_max_f32_e32 v31, v32, v31
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_bfe_u32 v32, v31, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xff800000, v31
 ; GFX9-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
 ; GFX9-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_bfe_u32 v32, v31, 16, 1
+; GFX9-NEXT:    v_max_f32_e32 v14, v14, v30
 ; GFX9-NEXT:    v_add3_u32 v32, v32, v31, s4
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v31
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v31, v31
-; GFX9-NEXT:    v_max_f32_e32 v14, v14, v30
-; GFX9-NEXT:    v_cndmask_b32_e32 v31, v32, v33, vcc
 ; GFX9-NEXT:    v_bfe_u32 v30, v14, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v32, 0xff800000, v14
+; GFX9-NEXT:    v_cndmask_b32_e32 v31, v32, v33, vcc
 ; GFX9-NEXT:    v_add3_u32 v30, v30, v14, s4
-; GFX9-NEXT:    v_or_b32_e32 v32, 0x400000, v32
+; GFX9-NEXT:    v_or_b32_e32 v32, 0x400000, v14
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v14, v14
 ; GFX9-NEXT:    v_cndmask_b32_e32 v14, v30, v32, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v30, 16, v29
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v13
+; GFX9-NEXT:    v_max_f32_e32 v30, v32, v30
 ; GFX9-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
 ; GFX9-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; GFX9-NEXT:    v_max_f32_e32 v13, v13, v29
-; GFX9-NEXT:    buffer_load_dword v29, off, s[0:3], s32
-; GFX9-NEXT:    v_max_f32_e32 v30, v32, v30
 ; GFX9-NEXT:    v_bfe_u32 v32, v30, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xff800000, v30
+; GFX9-NEXT:    v_max_f32_e32 v13, v13, v29
 ; GFX9-NEXT:    v_add3_u32 v32, v32, v30, s4
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v30
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v30, v30
+; GFX9-NEXT:    v_bfe_u32 v29, v13, 16, 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v30, v32, v33, vcc
-; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v15
+; GFX9-NEXT:    v_add3_u32 v29, v29, v13, s4
+; GFX9-NEXT:    v_or_b32_e32 v32, 0x400000, v13
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
+; GFX9-NEXT:    v_cndmask_b32_e32 v13, v29, v32, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v29, 16, v28
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v12
+; GFX9-NEXT:    v_max_f32_e32 v32, v32, v29
+; GFX9-NEXT:    buffer_load_dword v29, off, s[0:3], s32
+; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v15
 ; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX9-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
+; GFX9-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX9-NEXT:    v_max_f32_e32 v12, v12, v28
+; GFX9-NEXT:    v_bfe_u32 v28, v12, 16, 1
+; GFX9-NEXT:    v_add3_u32 v28, v28, v12, s4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v29
-; GFX9-NEXT:    v_max_f32_e32 v32, v32, v33
+; GFX9-NEXT:    v_lshlrev_b32_e32 v34, 16, v29
+; GFX9-NEXT:    v_max_f32_e32 v33, v33, v34
 ; GFX9-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
 ; GFX9-NEXT:    v_max_f32_e32 v29, v15, v29
-; GFX9-NEXT:    v_bfe_u32 v15, v32, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xff800000, v32
-; GFX9-NEXT:    v_add3_u32 v15, v15, v32, s4
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v33
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
-; GFX9-NEXT:    v_cndmask_b32_e32 v15, v15, v33, vcc
-; GFX9-NEXT:    v_bfe_u32 v32, v29, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xff800000, v29
-; GFX9-NEXT:    v_add3_u32 v32, v32, v29, s4
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT:    v_bfe_u32 v15, v33, 16, 1
+; GFX9-NEXT:    v_add3_u32 v15, v15, v33, s4
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v33
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v33, v33
+; GFX9-NEXT:    v_bfe_u32 v33, v29, 16, 1
+; GFX9-NEXT:    v_cndmask_b32_e32 v15, v15, v34, vcc
+; GFX9-NEXT:    v_add3_u32 v33, v33, v29, s4
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v29
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v29, v29
-; GFX9-NEXT:    v_cndmask_b32_e32 v29, v32, v33, vcc
-; GFX9-NEXT:    v_bfe_u32 v32, v13, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xff800000, v13
-; GFX9-NEXT:    v_add3_u32 v32, v32, v13, s4
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v33
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
-; GFX9-NEXT:    v_cndmask_b32_e32 v13, v32, v33, vcc
-; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v28
-; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v12
-; GFX9-NEXT:    v_max_f32_e32 v32, v33, v32
+; GFX9-NEXT:    v_cndmask_b32_e32 v29, v33, v34, vcc
 ; GFX9-NEXT:    v_bfe_u32 v33, v32, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v34, 0xff800000, v32
-; GFX9-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
-; GFX9-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
 ; GFX9-NEXT:    v_add3_u32 v33, v33, v32, s4
-; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v32
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
-; GFX9-NEXT:    v_max_f32_e32 v12, v12, v28
 ; GFX9-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
-; GFX9-NEXT:    v_bfe_u32 v28, v12, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xff800000, v12
-; GFX9-NEXT:    v_add3_u32 v28, v28, v12, s4
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v12
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
 ; GFX9-NEXT:    v_cndmask_b32_e32 v12, v28, v33, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v11
 ; GFX9-NEXT:    v_max_f32_e32 v28, v33, v28
-; GFX9-NEXT:    v_bfe_u32 v33, v28, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v34, 0xff800000, v28
 ; GFX9-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
 ; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX9-NEXT:    v_bfe_u32 v33, v28, 16, 1
+; GFX9-NEXT:    v_max_f32_e32 v11, v11, v27
 ; GFX9-NEXT:    v_add3_u32 v33, v33, v28, s4
-; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v28
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v28, v28
-; GFX9-NEXT:    v_max_f32_e32 v11, v11, v27
-; GFX9-NEXT:    v_cndmask_b32_e32 v28, v33, v34, vcc
 ; GFX9-NEXT:    v_bfe_u32 v27, v11, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xff800000, v11
+; GFX9-NEXT:    v_cndmask_b32_e32 v28, v33, v34, vcc
 ; GFX9-NEXT:    v_add3_u32 v27, v27, v11, s4
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v11
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
 ; GFX9-NEXT:    v_cndmask_b32_e32 v11, v27, v33, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v27, 16, v26
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v10
 ; GFX9-NEXT:    v_max_f32_e32 v27, v33, v27
-; GFX9-NEXT:    v_bfe_u32 v33, v27, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v34, 0xff800000, v27
 ; GFX9-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
 ; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX9-NEXT:    v_bfe_u32 v33, v27, 16, 1
+; GFX9-NEXT:    v_max_f32_e32 v10, v10, v26
 ; GFX9-NEXT:    v_add3_u32 v33, v33, v27, s4
-; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v27
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v27, v27
-; GFX9-NEXT:    v_max_f32_e32 v10, v10, v26
-; GFX9-NEXT:    v_cndmask_b32_e32 v27, v33, v34, vcc
 ; GFX9-NEXT:    v_bfe_u32 v26, v10, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xff800000, v10
+; GFX9-NEXT:    v_cndmask_b32_e32 v27, v33, v34, vcc
 ; GFX9-NEXT:    v_add3_u32 v26, v26, v10, s4
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v10
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
 ; GFX9-NEXT:    v_cndmask_b32_e32 v10, v26, v33, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v26, 16, v25
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v9
 ; GFX9-NEXT:    v_max_f32_e32 v26, v33, v26
-; GFX9-NEXT:    v_bfe_u32 v33, v26, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v34, 0xff800000, v26
 ; GFX9-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
 ; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX9-NEXT:    v_bfe_u32 v33, v26, 16, 1
+; GFX9-NEXT:    v_max_f32_e32 v9, v9, v25
 ; GFX9-NEXT:    v_add3_u32 v33, v33, v26, s4
-; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v26
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v26, v26
-; GFX9-NEXT:    v_max_f32_e32 v9, v9, v25
-; GFX9-NEXT:    v_cndmask_b32_e32 v26, v33, v34, vcc
 ; GFX9-NEXT:    v_bfe_u32 v25, v9, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xff800000, v9
+; GFX9-NEXT:    v_cndmask_b32_e32 v26, v33, v34, vcc
 ; GFX9-NEXT:    v_add3_u32 v25, v25, v9, s4
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v9
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
 ; GFX9-NEXT:    v_cndmask_b32_e32 v9, v25, v33, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v8
 ; GFX9-NEXT:    v_max_f32_e32 v25, v33, v25
-; GFX9-NEXT:    v_bfe_u32 v33, v25, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v34, 0xff800000, v25
 ; GFX9-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
 ; GFX9-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX9-NEXT:    v_bfe_u32 v33, v25, 16, 1
+; GFX9-NEXT:    v_max_f32_e32 v8, v8, v24
 ; GFX9-NEXT:    v_add3_u32 v33, v33, v25, s4
-; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v25
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v25, v25
-; GFX9-NEXT:    v_max_f32_e32 v8, v8, v24
-; GFX9-NEXT:    v_cndmask_b32_e32 v25, v33, v34, vcc
 ; GFX9-NEXT:    v_bfe_u32 v24, v8, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xff800000, v8
+; GFX9-NEXT:    v_cndmask_b32_e32 v25, v33, v34, vcc
 ; GFX9-NEXT:    v_add3_u32 v24, v24, v8, s4
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v8
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
 ; GFX9-NEXT:    v_cndmask_b32_e32 v8, v24, v33, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v7
 ; GFX9-NEXT:    v_max_f32_e32 v24, v33, v24
-; GFX9-NEXT:    v_bfe_u32 v33, v24, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v34, 0xff800000, v24
 ; GFX9-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
 ; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX9-NEXT:    v_bfe_u32 v33, v24, 16, 1
+; GFX9-NEXT:    v_max_f32_e32 v7, v7, v23
 ; GFX9-NEXT:    v_add3_u32 v33, v33, v24, s4
-; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v24
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v24, v24
-; GFX9-NEXT:    v_max_f32_e32 v7, v7, v23
-; GFX9-NEXT:    v_cndmask_b32_e32 v24, v33, v34, vcc
 ; GFX9-NEXT:    v_bfe_u32 v23, v7, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xff800000, v7
+; GFX9-NEXT:    v_cndmask_b32_e32 v24, v33, v34, vcc
 ; GFX9-NEXT:    v_add3_u32 v23, v23, v7, s4
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v7
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
 ; GFX9-NEXT:    v_cndmask_b32_e32 v7, v23, v33, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v6
 ; GFX9-NEXT:    v_max_f32_e32 v23, v33, v23
-; GFX9-NEXT:    v_bfe_u32 v33, v23, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v34, 0xff800000, v23
 ; GFX9-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
 ; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX9-NEXT:    v_bfe_u32 v33, v23, 16, 1
+; GFX9-NEXT:    v_max_f32_e32 v6, v6, v22
 ; GFX9-NEXT:    v_add3_u32 v33, v33, v23, s4
-; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v23
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v23, v23
-; GFX9-NEXT:    v_max_f32_e32 v6, v6, v22
-; GFX9-NEXT:    v_cndmask_b32_e32 v23, v33, v34, vcc
 ; GFX9-NEXT:    v_bfe_u32 v22, v6, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xff800000, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v23, v33, v34, vcc
 ; GFX9-NEXT:    v_add3_u32 v22, v22, v6, s4
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v6
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v22, v33, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v5
 ; GFX9-NEXT:    v_max_f32_e32 v22, v33, v22
-; GFX9-NEXT:    v_bfe_u32 v33, v22, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v34, 0xff800000, v22
 ; GFX9-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
 ; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX9-NEXT:    v_bfe_u32 v33, v22, 16, 1
+; GFX9-NEXT:    v_max_f32_e32 v5, v5, v21
 ; GFX9-NEXT:    v_add3_u32 v33, v33, v22, s4
-; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v22
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v22, v22
-; GFX9-NEXT:    v_max_f32_e32 v5, v5, v21
-; GFX9-NEXT:    v_cndmask_b32_e32 v22, v33, v34, vcc
 ; GFX9-NEXT:    v_bfe_u32 v21, v5, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xff800000, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v22, v33, v34, vcc
 ; GFX9-NEXT:    v_add3_u32 v21, v21, v5, s4
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v5
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v21, v33, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v4
 ; GFX9-NEXT:    v_max_f32_e32 v21, v33, v21
-; GFX9-NEXT:    v_bfe_u32 v33, v21, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v34, 0xff800000, v21
 ; GFX9-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX9-NEXT:    v_bfe_u32 v33, v21, 16, 1
+; GFX9-NEXT:    v_max_f32_e32 v4, v4, v20
 ; GFX9-NEXT:    v_add3_u32 v33, v33, v21, s4
-; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v21
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v21, v21
-; GFX9-NEXT:    v_max_f32_e32 v4, v4, v20
-; GFX9-NEXT:    v_cndmask_b32_e32 v21, v33, v34, vcc
 ; GFX9-NEXT:    v_bfe_u32 v20, v4, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xff800000, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v21, v33, v34, vcc
 ; GFX9-NEXT:    v_add3_u32 v20, v20, v4, s4
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v4
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v20, v33, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v3
 ; GFX9-NEXT:    v_max_f32_e32 v20, v33, v20
-; GFX9-NEXT:    v_bfe_u32 v33, v20, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v34, 0xff800000, v20
 ; GFX9-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT:    v_bfe_u32 v33, v20, 16, 1
+; GFX9-NEXT:    v_max_f32_e32 v3, v3, v19
 ; GFX9-NEXT:    v_add3_u32 v33, v33, v20, s4
-; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v20
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v20, v20
-; GFX9-NEXT:    v_max_f32_e32 v3, v3, v19
-; GFX9-NEXT:    v_cndmask_b32_e32 v20, v33, v34, vcc
 ; GFX9-NEXT:    v_bfe_u32 v19, v3, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xff800000, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v20, v33, v34, vcc
 ; GFX9-NEXT:    v_add3_u32 v19, v19, v3, s4
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v3
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v19, v33, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v2
 ; GFX9-NEXT:    v_max_f32_e32 v19, v33, v19
-; GFX9-NEXT:    v_bfe_u32 v33, v19, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v34, 0xff800000, v19
 ; GFX9-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT:    v_bfe_u32 v33, v19, 16, 1
+; GFX9-NEXT:    v_max_f32_e32 v2, v2, v18
 ; GFX9-NEXT:    v_add3_u32 v33, v33, v19, s4
-; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v19
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v19, v19
-; GFX9-NEXT:    v_max_f32_e32 v2, v2, v18
-; GFX9-NEXT:    v_cndmask_b32_e32 v19, v33, v34, vcc
 ; GFX9-NEXT:    v_bfe_u32 v18, v2, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xff800000, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v19, v33, v34, vcc
 ; GFX9-NEXT:    v_add3_u32 v18, v18, v2, s4
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v2
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v18, v33, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v1
 ; GFX9-NEXT:    v_max_f32_e32 v18, v33, v18
-; GFX9-NEXT:    v_bfe_u32 v33, v18, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v34, 0xff800000, v18
 ; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_bfe_u32 v33, v18, 16, 1
+; GFX9-NEXT:    v_max_f32_e32 v1, v1, v17
 ; GFX9-NEXT:    v_add3_u32 v33, v33, v18, s4
-; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v18
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v18, v18
-; GFX9-NEXT:    v_max_f32_e32 v1, v1, v17
-; GFX9-NEXT:    v_cndmask_b32_e32 v18, v33, v34, vcc
 ; GFX9-NEXT:    v_bfe_u32 v17, v1, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xff800000, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v18, v33, v34, vcc
 ; GFX9-NEXT:    v_add3_u32 v17, v17, v1, s4
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v1
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v17, v33, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v16
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v0
 ; GFX9-NEXT:    v_max_f32_e32 v17, v33, v17
-; GFX9-NEXT:    v_bfe_u32 v33, v17, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v34, 0xff800000, v17
 ; GFX9-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_bfe_u32 v33, v17, 16, 1
+; GFX9-NEXT:    v_max_f32_e32 v0, v0, v16
 ; GFX9-NEXT:    v_add3_u32 v33, v33, v17, s4
-; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v17
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
-; GFX9-NEXT:    v_max_f32_e32 v0, v0, v16
-; GFX9-NEXT:    v_cndmask_b32_e32 v17, v33, v34, vcc
 ; GFX9-NEXT:    v_bfe_u32 v16, v0, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xff800000, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v17, v33, v34, vcc
 ; GFX9-NEXT:    v_add3_u32 v16, v16, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v16, v33, vcc
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
@@ -24953,7 +24339,7 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX10-LABEL: v_maxnum_v32bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX10-NEXT:    buffer_load_dword v32, off, s[0:3], s32
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v39, 16, v27
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v48, 16, v11
 ; GFX10-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
@@ -25018,7 +24404,6 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX10-NEXT:    v_bfe_u32 v55, v11, 16, 1
 ; GFX10-NEXT:    v_bfe_u32 v65, v49, 16, 1
 ; GFX10-NEXT:    v_bfe_u32 v67, v10, 16, 1
-; GFX10-NEXT:    s_mov_b32 s23, 0xff800000
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v33, 16, v30
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v34, 16, v14
 ; GFX10-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
@@ -25034,10 +24419,10 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX10-NEXT:    v_max_f32_e32 v1, v1, v17
 ; GFX10-NEXT:    v_max_f32_e32 v17, v26, v50
 ; GFX10-NEXT:    v_max_f32_e32 v0, v0, v16
-; GFX10-NEXT:    v_and_or_b32 v54, v39, s23, 0x400000
-; GFX10-NEXT:    v_and_or_b32 v64, v11, s23, 0x400000
-; GFX10-NEXT:    v_and_or_b32 v66, v49, s23, 0x400000
-; GFX10-NEXT:    v_and_or_b32 v68, v10, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v54, 0x400000, v39
+; GFX10-NEXT:    v_or_b32_e32 v64, 0x400000, v11
+; GFX10-NEXT:    v_or_b32_e32 v66, 0x400000, v49
+; GFX10-NEXT:    v_or_b32_e32 v68, 0x400000, v10
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s9, v39, v39
 ; GFX10-NEXT:    v_add3_u32 v39, v53, v39, 0x7fff
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s10, v11, v11
@@ -25075,28 +24460,28 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX10-NEXT:    v_bfe_u32 v27, v14, 16, 1
 ; GFX10-NEXT:    v_bfe_u32 v29, v35, 16, 1
 ; GFX10-NEXT:    v_bfe_u32 v34, v13, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v48, v37, s23, 0x400000
-; GFX10-NEXT:    v_and_or_b32 v52, v12, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v48, 0x400000, v37
+; GFX10-NEXT:    v_or_b32_e32 v52, 0x400000, v12
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s7, v37, v37
 ; GFX10-NEXT:    v_add3_u32 v37, v38, v37, 0x7fff
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s8, v12, v12
 ; GFX10-NEXT:    v_add3_u32 v12, v50, v12, 0x7fff
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s10, v18, v18
 ; GFX10-NEXT:    v_add3_u32 v54, v54, v18, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v18, v18, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v18, 0x400000, v18
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s11, v1, v1
 ; GFX10-NEXT:    v_add3_u32 v64, v64, v1, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v1, v1, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v1, 0x400000, v1
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s12, v17, v17
 ; GFX10-NEXT:    v_add3_u32 v66, v66, v17, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v17, v17, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v17, 0x400000, v17
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s22, v0, v0
 ; GFX10-NEXT:    v_add3_u32 v68, v68, v0, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v0, v0, s23, 0x400000
-; GFX10-NEXT:    v_and_or_b32 v26, v33, s23, 0x400000
-; GFX10-NEXT:    v_and_or_b32 v28, v14, s23, 0x400000
-; GFX10-NEXT:    v_and_or_b32 v30, v35, s23, 0x400000
-; GFX10-NEXT:    v_and_or_b32 v36, v13, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v0, 0x400000, v0
+; GFX10-NEXT:    v_or_b32_e32 v26, 0x400000, v33
+; GFX10-NEXT:    v_or_b32_e32 v28, 0x400000, v14
+; GFX10-NEXT:    v_or_b32_e32 v30, 0x400000, v35
+; GFX10-NEXT:    v_or_b32_e32 v36, 0x400000, v13
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
 ; GFX10-NEXT:    v_add3_u32 v16, v16, v33, 0x7fff
 ; GFX10-NEXT:    v_bfe_u32 v33, v51, 16, 1
@@ -25115,12 +24500,12 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX10-NEXT:    v_cndmask_b32_e64 v17, v66, v17, s12
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v68, v0, s22
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v64, v1, s11
-; GFX10-NEXT:    v_lshlrev_b32_e32 v32, 16, v15
+; GFX10-NEXT:    v_lshlrev_b32_e32 v31, 16, v15
 ; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX10-NEXT:    v_and_or_b32 v27, v51, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v27, 0x400000, v51
 ; GFX10-NEXT:    v_bfe_u32 v35, v9, 16, 1
 ; GFX10-NEXT:    v_bfe_u32 v38, v25, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v67, v24, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v67, 0x400000, v24
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s13, v51, v51
 ; GFX10-NEXT:    v_add3_u32 v33, v33, v51, 0x7fff
 ; GFX10-NEXT:    v_bfe_u32 v51, v7, 16, 1
@@ -25137,51 +24522,51 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX10-NEXT:    v_bfe_u32 v36, v3, 16, 1
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s8, v19, v19
 ; GFX10-NEXT:    v_add3_u32 v48, v48, v19, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v19, v19, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v19, 0x400000, v19
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s9, v2, v2
 ; GFX10-NEXT:    v_add3_u32 v52, v52, v2, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v2, v2, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v2
 ; GFX10-NEXT:    v_perm_b32 v0, v0, v17, 0x7060302
 ; GFX10-NEXT:    v_perm_b32 v1, v1, v18, 0x7060302
-; GFX10-NEXT:    v_and_or_b32 v34, v9, s23, 0x400000
-; GFX10-NEXT:    v_and_or_b32 v50, v25, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v34, 0x400000, v9
+; GFX10-NEXT:    v_or_b32_e32 v50, 0x400000, v25
 ; GFX10-NEXT:    v_bfe_u32 v53, v8, 16, 1
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s14, v9, v9
 ; GFX10-NEXT:    v_add3_u32 v9, v35, v9, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v35, v7, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v35, 0x400000, v7
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s15, v25, v25
 ; GFX10-NEXT:    v_add3_u32 v25, v38, v25, 0x7fff
 ; GFX10-NEXT:    v_bfe_u32 v38, v23, 16, 1
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s18, v7, v7
 ; GFX10-NEXT:    v_add3_u32 v7, v51, v7, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v51, v6, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v51, 0x400000, v6
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s20, v6, v6
 ; GFX10-NEXT:    v_add3_u32 v6, v65, v6, 0x7fff
 ; GFX10-NEXT:    v_bfe_u32 v65, v5, 16, 1
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v21, v21
 ; GFX10-NEXT:    v_add3_u32 v26, v26, v21, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v21, v21, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v21, 0x400000, v21
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s5, v4, v4
 ; GFX10-NEXT:    v_add3_u32 v28, v28, v4, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v4, v4, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v4, 0x400000, v4
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s6, v20, v20
 ; GFX10-NEXT:    v_add3_u32 v30, v30, v20, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v20, v20, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v20, 0x400000, v20
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s7, v3, v3
 ; GFX10-NEXT:    v_add3_u32 v36, v36, v3, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v3, v3, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v3, 0x400000, v3
 ; GFX10-NEXT:    v_cndmask_b32_e64 v19, v48, v19, s8
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v52, v2, s9
-; GFX10-NEXT:    v_and_or_b32 v55, v8, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v55, 0x400000, v8
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s16, v8, v8
 ; GFX10-NEXT:    v_add3_u32 v8, v53, v8, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v53, v23, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v53, 0x400000, v23
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s19, v23, v23
 ; GFX10-NEXT:    v_add3_u32 v23, v38, v23, 0x7fff
 ; GFX10-NEXT:    v_bfe_u32 v38, v22, 16, 1
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX10-NEXT:    v_add3_u32 v65, v65, v5, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v5, v5, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v21, v26, v21, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v4, v28, v4, s5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v20, v30, v20, s6
@@ -25189,7 +24574,7 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX10-NEXT:    v_perm_b32 v2, v2, v19, 0x7060302
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s21, v22, v22
 ; GFX10-NEXT:    v_add3_u32 v38, v38, v22, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v22, v22, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v22, 0x400000, v22
 ; GFX10-NEXT:    v_cndmask_b32_e32 v5, v65, v5, vcc_lo
 ; GFX10-NEXT:    v_perm_b32 v3, v3, v20, 0x7060302
 ; GFX10-NEXT:    v_perm_b32 v4, v4, v21, 0x7060302
@@ -25213,14 +24598,14 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX10-NEXT:    v_perm_b32 v13, v13, v29, 0x7060302
 ; GFX10-NEXT:    v_perm_b32 v14, v14, v16, 0x7060302
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v31
-; GFX10-NEXT:    v_and_b32_e32 v18, 0xffff0000, v31
-; GFX10-NEXT:    v_max_f32_e32 v17, v32, v17
+; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v32
+; GFX10-NEXT:    v_and_b32_e32 v18, 0xffff0000, v32
+; GFX10-NEXT:    v_max_f32_e32 v17, v31, v17
 ; GFX10-NEXT:    v_max_f32_e32 v15, v15, v18
 ; GFX10-NEXT:    v_bfe_u32 v18, v17, 16, 1
 ; GFX10-NEXT:    v_bfe_u32 v19, v15, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v20, v17, s23, 0x400000
-; GFX10-NEXT:    v_and_or_b32 v21, v15, s23, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v20, 0x400000, v17
+; GFX10-NEXT:    v_or_b32_e32 v21, 0x400000, v15
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v15, v15
 ; GFX10-NEXT:    v_add3_u32 v17, v18, v17, 0x7fff
@@ -25233,212 +24618,219 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX11-LABEL: v_maxnum_v32bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    scratch_load_b32 v31, off, s32
+; GFX11-NEXT:    scratch_load_b32 v32, off, s32
+; GFX11-NEXT:    v_lshlrev_b32_e32 v67, 16, v21
+; GFX11-NEXT:    v_lshlrev_b32_e32 v68, 16, v5
+; GFX11-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v83, 16, v17
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v84, 16, v1
 ; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v49, 16, v26
+; GFX11-NEXT:    v_dual_max_f32 v5, v5, v21 :: v_dual_and_b32 v26, 0xffff0000, v26
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v53, 16, v24
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
-; GFX11-NEXT:    s_mov_b32 s0, 0xff800000
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_max_f32 v1, v1, v17 :: v_dual_lshlrev_b32 v64, 16, v7
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_max_f32 v1, v1, v17 :: v_dual_and_b32 v24, 0xffff0000, v24
+; GFX11-NEXT:    v_lshlrev_b32_e32 v71, 16, v19
+; GFX11-NEXT:    v_bfe_u32 v103, v5, 16, 1
+; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v81, 16, v18
-; GFX11-NEXT:    v_lshlrev_b32_e32 v85, 16, v16
-; GFX11-NEXT:    v_lshlrev_b32_e32 v86, 16, v0
 ; GFX11-NEXT:    v_bfe_u32 v135, v1, 16, 1
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v55, 16, v23
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
-; GFX11-NEXT:    v_and_or_b32 v144, v1, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v112, 0x400000, v5
+; GFX11-NEXT:    v_or_b32_e32 v144, 0x400000, v1
+; GFX11-NEXT:    v_add3_u32 v103, v103, v5, 0x7fff
+; GFX11-NEXT:    v_lshlrev_b32_e32 v80, 16, v3
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX11-NEXT:    v_add3_u32 v135, v135, v1, 0x7fff
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v82, 16, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v54, 16, v8
-; GFX11-NEXT:    v_dual_max_f32 v17, v86, v85 :: v_dual_and_b32 v8, 0xffff0000, v8
-; GFX11-NEXT:    v_dual_max_f32 v7, v7, v23 :: v_dual_lshlrev_b32 v36, 16, v13
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_max_f32 v8, v8, v24 :: v_dual_lshlrev_b32 v39, 16, v27
-; GFX11-NEXT:    v_dual_max_f32 v0, v0, v16 :: v_dual_lshlrev_b32 v49, 16, v26
-; GFX11-NEXT:    v_max_f32_e32 v24, v64, v55
-; GFX11-NEXT:    v_bfe_u32 v87, v7, 16, 1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v52, 16, v9
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_max_f32 v3, v3, v19 :: v_dual_lshlrev_b32 v54, 16, v8
+; GFX11-NEXT:    v_lshlrev_b32_e32 v85, 16, v16
+; GFX11-NEXT:    v_dual_max_f32 v19, v82, v81 :: v_dual_lshlrev_b32 v64, 16, v7
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v65, 16, v22
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v66, 16, v6
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
-; GFX11-NEXT:    v_bfe_u32 v85, v24, 16, 1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v67, 16, v21
-; GFX11-NEXT:    v_lshlrev_b32_e32 v68, 16, v5
-; GFX11-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_bfe_u32 v129, v19, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v130, 0x400000, v19
+; GFX11-NEXT:    v_lshlrev_b32_e32 v48, 16, v11
+; GFX11-NEXT:    v_bfe_u32 v119, v3, 16, 1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v51, 16, v25
+; GFX11-NEXT:    v_add3_u32 v129, v129, v19, 0x7fff
+; GFX11-NEXT:    v_lshlrev_b32_e32 v86, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_max_f32 v17, v86, v85 :: v_dual_and_b32 v2, 0xffff0000, v2
+; GFX11-NEXT:    v_dual_max_f32 v8, v8, v24 :: v_dual_lshlrev_b32 v39, 16, v27
+; GFX11-NEXT:    v_or_b32_e32 v128, 0x400000, v3
+; GFX11-NEXT:    v_add3_u32 v119, v119, v3, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_bfe_u32 v145, v17, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v146, 0x400000, v17
+; GFX11-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GFX11-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v70, 16, v4
-; GFX11-NEXT:    v_and_or_b32 v86, v24, s0, 0x400000
-; GFX11-NEXT:    v_and_or_b32 v96, v7, s0, 0x400000
+; GFX11-NEXT:    v_add3_u32 v145, v145, v17, 0x7fff
+; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GFX11-NEXT:    v_lshlrev_b32_e32 v55, 16, v23
+; GFX11-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; GFX11-NEXT:    v_lshlrev_b32_e32 v50, 16, v10
+; GFX11-NEXT:    v_max_f32_e32 v2, v2, v18
+; GFX11-NEXT:    v_max_f32_e32 v0, v0, v16
+; GFX11-NEXT:    v_dual_max_f32 v24, v64, v55 :: v_dual_lshlrev_b32 v37, 16, v28
+; GFX11-NEXT:    v_max_f32_e32 v7, v7, v23
+; GFX11-NEXT:    v_dual_max_f32 v23, v66, v65 :: v_dual_max_f32 v18, v84, v83
+; GFX11-NEXT:    v_dual_max_f32 v9, v9, v25 :: v_dual_and_b32 v28, 0xffff0000, v28
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_bfe_u32 v85, v24, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v97, v23, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v86, 0x400000, v24
+; GFX11-NEXT:    v_or_b32_e32 v98, 0x400000, v23
+; GFX11-NEXT:    v_bfe_u32 v87, v7, 16, 1
 ; GFX11-NEXT:    v_add3_u32 v85, v85, v24, 0x7fff
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v69, 16, v20
-; GFX11-NEXT:    v_add3_u32 v87, v87, v7, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-NEXT:    v_dual_max_f32 v23, v66, v65 :: v_dual_lshlrev_b32 v48, 16, v11
-; GFX11-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
-; GFX11-NEXT:    v_dual_max_f32 v5, v5, v21 :: v_dual_lshlrev_b32 v50, 16, v10
-; GFX11-NEXT:    v_dual_max_f32 v21, v70, v69 :: v_dual_and_b32 v26, 0xffff0000, v26
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT:    v_max_f32_e32 v6, v6, v22
-; GFX11-NEXT:    v_lshlrev_b32_e32 v52, 16, v9
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX11-NEXT:    v_bfe_u32 v97, v23, 16, 1
-; GFX11-NEXT:    v_max_f32_e32 v2, v2, v18
-; GFX11-NEXT:    v_max_f32_e32 v18, v84, v83
-; GFX11-NEXT:    v_bfe_u32 v83, v8, 16, 1
-; GFX11-NEXT:    v_bfe_u32 v99, v6, 16, 1
-; GFX11-NEXT:    v_bfe_u32 v103, v5, 16, 1
-; GFX11-NEXT:    v_bfe_u32 v113, v21, 16, 1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v71, 16, v19
-; GFX11-NEXT:    v_and_or_b32 v84, v8, s0, 0x400000
-; GFX11-NEXT:    v_and_or_b32 v98, v23, s0, 0x400000
-; GFX11-NEXT:    v_and_or_b32 v100, v6, s0, 0x400000
-; GFX11-NEXT:    v_and_or_b32 v112, v5, s0, 0x400000
-; GFX11-NEXT:    v_and_or_b32 v114, v21, s0, 0x400000
-; GFX11-NEXT:    v_add3_u32 v83, v83, v8, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
-; GFX11-NEXT:    v_add3_u32 v97, v97, v23, 0x7fff
 ; GFX11-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
-; GFX11-NEXT:    v_add3_u32 v99, v99, v6, 0x7fff
-; GFX11-NEXT:    v_add3_u32 v103, v103, v5, 0x7fff
-; GFX11-NEXT:    v_lshlrev_b32_e32 v80, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT:    v_add3_u32 v113, v113, v21, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v97, v97, v23, 0x7fff
 ; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v38, 16, v12
 ; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
-; GFX11-NEXT:    v_dual_max_f32 v3, v3, v19 :: v_dual_and_b32 v10, 0xffff0000, v10
-; GFX11-NEXT:    v_dual_max_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v51, 16, v25
-; GFX11-NEXT:    v_lshlrev_b32_e32 v37, 16, v28
-; GFX11-NEXT:    v_dual_max_f32 v4, v4, v20 :: v_dual_and_b32 v25, 0xffff0000, v25
+; GFX11-NEXT:    v_or_b32_e32 v96, 0x400000, v7
+; GFX11-NEXT:    v_add3_u32 v87, v87, v7, 0x7fff
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-NEXT:    v_max_f32_e32 v4, v4, v20
 ; GFX11-NEXT:    v_max_f32_e32 v20, v80, v71
-; GFX11-NEXT:    v_dual_max_f32 v19, v82, v81 :: v_dual_and_b32 v28, 0xffff0000, v28
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_max_f32 v9, v9, v25 :: v_dual_and_b32 v12, 0xffff0000, v12
-; GFX11-NEXT:    v_max_f32_e32 v25, v54, v53
+; GFX11-NEXT:    v_bfe_u32 v71, v9, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v80, 0x400000, v9
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v35, 16, v29
-; GFX11-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
-; GFX11-NEXT:    v_dual_max_f32 v10, v10, v26 :: v_dual_and_b32 v13, 0xffff0000, v13
-; GFX11-NEXT:    v_dual_max_f32 v12, v12, v28 :: v_dual_lshlrev_b32 v33, 16, v30
-; GFX11-NEXT:    v_max_f32_e32 v28, v48, v39
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_max_f32 v13, v13, v29 :: v_dual_lshlrev_b32 v34, 16, v14
-; GFX11-NEXT:    v_dual_max_f32 v11, v11, v27 :: v_dual_and_b32 v14, 0xffff0000, v14
-; GFX11-NEXT:    v_dual_max_f32 v27, v50, v49 :: v_dual_max_f32 v26, v52, v51
-; GFX11-NEXT:    v_dual_max_f32 v29, v38, v37 :: v_dual_and_b32 v30, 0xffff0000, v30
-; GFX11-NEXT:    v_lshlrev_b32_e32 v32, 16, v15
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX11-NEXT:    v_bfe_u32 v39, v13, 16, 1
+; GFX11-NEXT:    v_dual_max_f32 v21, v70, v69 :: v_dual_and_b32 v10, 0xffff0000, v10
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_add3_u32 v71, v71, v9, 0x7fff
+; GFX11-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; GFX11-NEXT:    v_dual_max_f32 v10, v10, v26 :: v_dual_and_b32 v29, 0xffff0000, v29
+; GFX11-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
+; GFX11-NEXT:    v_max_f32_e32 v26, v52, v51
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_max_f32_e32 v6, v6, v22
+; GFX11-NEXT:    v_lshlrev_b32_e32 v36, 16, v13
+; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX11-NEXT:    v_dual_max_f32 v11, v11, v27 :: v_dual_lshlrev_b32 v34, 16, v14
+; GFX11-NEXT:    v_dual_max_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v33, 16, v30
+; GFX11-NEXT:    v_dual_max_f32 v27, v50, v49 :: v_dual_lshlrev_b32 v38, 16, v12
+; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX11-NEXT:    v_dual_max_f32 v25, v54, v53 :: v_dual_and_b32 v12, 0xffff0000, v12
+; GFX11-NEXT:    v_dual_max_f32 v13, v13, v29 :: v_dual_and_b32 v30, 0xffff0000, v30
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_max_f32_e32 v29, v38, v37
+; GFX11-NEXT:    v_lshlrev_b32_e32 v31, 16, v15
+; GFX11-NEXT:    v_dual_max_f32 v12, v12, v28 :: v_dual_and_b32 v15, 0xffff0000, v15
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_max_f32_e32 v14, v14, v30
+; GFX11-NEXT:    v_max_f32_e32 v28, v48, v39
 ; GFX11-NEXT:    v_dual_max_f32 v30, v36, v35 :: v_dual_max_f32 v33, v34, v33
-; GFX11-NEXT:    v_and_or_b32 v48, v13, s0, 0x400000
-; GFX11-NEXT:    v_bfe_u32 v49, v29, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v39, v13, 16, 1
 ; GFX11-NEXT:    v_bfe_u32 v35, v14, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v36, v14, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v14
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_bfe_u32 v37, v30, 16, 1
 ; GFX11-NEXT:    v_bfe_u32 v16, v33, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v34, v33, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v34, 0x400000, v33
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX11-NEXT:    v_bfe_u32 v37, v30, 16, 1
 ; GFX11-NEXT:    v_add3_u32 v35, v35, v14, 0x7fff
+; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v30
 ; GFX11-NEXT:    v_add3_u32 v16, v16, v33, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v38, v30, s0, 0x400000
-; GFX11-NEXT:    v_add3_u32 v39, v39, v13, 0x7fff
 ; GFX11-NEXT:    v_add3_u32 v37, v37, v30, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v50, v29, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v48, 0x400000, v13
+; GFX11-NEXT:    v_bfe_u32 v49, v29, 16, 1
+; GFX11-NEXT:    v_add3_u32 v39, v39, v13, 0x7fff
 ; GFX11-NEXT:    v_cndmask_b32_e32 v16, v16, v34, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-NEXT:    v_or_b32_e32 v50, 0x400000, v29
 ; GFX11-NEXT:    v_bfe_u32 v51, v12, 16, 1
 ; GFX11-NEXT:    v_add3_u32 v49, v49, v29, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v52, v12, s0, 0x400000
-; GFX11-NEXT:    v_bfe_u32 v53, v28, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v52, 0x400000, v12
 ; GFX11-NEXT:    v_cndmask_b32_e32 v14, v35, v36, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX11-NEXT:    v_bfe_u32 v53, v28, 16, 1
 ; GFX11-NEXT:    v_add3_u32 v51, v51, v12, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v54, v28, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v54, 0x400000, v28
 ; GFX11-NEXT:    v_bfe_u32 v55, v11, 16, 1
-; GFX11-NEXT:    v_add3_u32 v53, v53, v28, 0x7fff
 ; GFX11-NEXT:    v_cndmask_b32_e32 v30, v37, v38, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-NEXT:    v_and_or_b32 v64, v11, s0, 0x400000
+; GFX11-NEXT:    v_add3_u32 v53, v53, v28, 0x7fff
+; GFX11-NEXT:    v_or_b32_e32 v64, 0x400000, v11
 ; GFX11-NEXT:    v_bfe_u32 v65, v27, 16, 1
 ; GFX11-NEXT:    v_add3_u32 v55, v55, v11, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v66, v27, s0, 0x400000
 ; GFX11-NEXT:    v_cndmask_b32_e32 v13, v39, v48, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX11-NEXT:    v_or_b32_e32 v66, 0x400000, v27
 ; GFX11-NEXT:    v_bfe_u32 v67, v10, 16, 1
 ; GFX11-NEXT:    v_add3_u32 v65, v65, v27, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v68, v10, s0, 0x400000
-; GFX11-NEXT:    v_bfe_u32 v69, v26, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v68, 0x400000, v10
 ; GFX11-NEXT:    v_cndmask_b32_e32 v29, v49, v50, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-NEXT:    v_bfe_u32 v69, v26, 16, 1
 ; GFX11-NEXT:    v_add3_u32 v67, v67, v10, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v70, v26, s0, 0x400000
-; GFX11-NEXT:    v_bfe_u32 v71, v9, 16, 1
-; GFX11-NEXT:    v_add3_u32 v69, v69, v26, 0x7fff
+; GFX11-NEXT:    v_or_b32_e32 v70, 0x400000, v26
+; GFX11-NEXT:    v_bfe_u32 v81, v25, 16, 1
 ; GFX11-NEXT:    v_cndmask_b32_e32 v12, v51, v52, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
-; GFX11-NEXT:    v_and_or_b32 v80, v9, s0, 0x400000
-; GFX11-NEXT:    v_bfe_u32 v81, v25, 16, 1
-; GFX11-NEXT:    v_add3_u32 v71, v71, v9, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v82, v25, s0, 0x400000
+; GFX11-NEXT:    v_add3_u32 v69, v69, v26, 0x7fff
+; GFX11-NEXT:    v_or_b32_e32 v82, 0x400000, v25
+; GFX11-NEXT:    v_bfe_u32 v83, v8, 16, 1
+; GFX11-NEXT:    v_add3_u32 v81, v81, v25, 0x7fff
 ; GFX11-NEXT:    v_cndmask_b32_e32 v28, v53, v54, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-NEXT:    v_add3_u32 v81, v81, v25, 0x7fff
-; GFX11-NEXT:    v_bfe_u32 v101, v22, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v102, v22, s0, 0x400000
-; GFX11-NEXT:    v_bfe_u32 v115, v4, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v84, 0x400000, v8
+; GFX11-NEXT:    v_add3_u32 v83, v83, v8, 0x7fff
+; GFX11-NEXT:    v_bfe_u32 v99, v6, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v100, 0x400000, v6
 ; GFX11-NEXT:    v_cndmask_b32_e32 v11, v55, v64, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v27, v27
-; GFX11-NEXT:    v_add3_u32 v101, v101, v22, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v116, v4, s0, 0x400000
-; GFX11-NEXT:    v_bfe_u32 v117, v20, 16, 1
-; GFX11-NEXT:    v_add3_u32 v115, v115, v4, 0x7fff
+; GFX11-NEXT:    v_bfe_u32 v101, v22, 16, 1
+; GFX11-NEXT:    v_add3_u32 v99, v99, v6, 0x7fff
+; GFX11-NEXT:    v_or_b32_e32 v102, 0x400000, v22
+; GFX11-NEXT:    v_bfe_u32 v113, v21, 16, 1
 ; GFX11-NEXT:    v_cndmask_b32_e32 v27, v65, v66, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT:    v_and_or_b32 v118, v20, s0, 0x400000
-; GFX11-NEXT:    v_bfe_u32 v129, v19, 16, 1
-; GFX11-NEXT:    v_add3_u32 v117, v117, v20, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v130, v19, s0, 0x400000
+; GFX11-NEXT:    v_add3_u32 v101, v101, v22, 0x7fff
+; GFX11-NEXT:    v_or_b32_e32 v114, 0x400000, v21
+; GFX11-NEXT:    v_bfe_u32 v115, v4, 16, 1
+; GFX11-NEXT:    v_add3_u32 v113, v113, v21, 0x7fff
 ; GFX11-NEXT:    v_cndmask_b32_e32 v10, v67, v68, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX11-NEXT:    v_bfe_u32 v133, v18, 16, 1
-; GFX11-NEXT:    v_add3_u32 v129, v129, v19, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v134, v18, s0, 0x400000
-; GFX11-NEXT:    v_bfe_u32 v145, v17, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v116, 0x400000, v4
+; GFX11-NEXT:    v_bfe_u32 v117, v20, 16, 1
+; GFX11-NEXT:    v_add3_u32 v115, v115, v4, 0x7fff
+; GFX11-NEXT:    v_or_b32_e32 v118, 0x400000, v20
 ; GFX11-NEXT:    v_cndmask_b32_e32 v26, v69, v70, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT:    v_add3_u32 v133, v133, v18, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v146, v17, s0, 0x400000
+; GFX11-NEXT:    v_add3_u32 v117, v117, v20, 0x7fff
+; GFX11-NEXT:    v_bfe_u32 v133, v18, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v134, 0x400000, v18
 ; GFX11-NEXT:    v_bfe_u32 v147, v0, 16, 1
-; GFX11-NEXT:    v_add3_u32 v145, v145, v17, 0x7fff
 ; GFX11-NEXT:    v_cndmask_b32_e32 v9, v71, v80, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX11-NEXT:    v_bfe_u32 v131, v2, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v33, v0, s0, 0x400000
+; GFX11-NEXT:    v_add3_u32 v133, v133, v18, 0x7fff
+; GFX11-NEXT:    v_or_b32_e32 v33, 0x400000, v0
 ; GFX11-NEXT:    v_add3_u32 v147, v147, v0, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v132, v2, s0, 0x400000
+; GFX11-NEXT:    v_bfe_u32 v131, v2, 16, 1
 ; GFX11-NEXT:    v_cndmask_b32_e32 v25, v81, v82, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT:    v_add3_u32 v131, v131, v2, 0x7fff
-; GFX11-NEXT:    v_bfe_u32 v119, v3, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v128, v3, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v132, 0x400000, v2
 ; GFX11-NEXT:    v_perm_b32 v9, v9, v26, 0x7060302
+; GFX11-NEXT:    v_add3_u32 v131, v131, v2, 0x7fff
+; GFX11-NEXT:    v_perm_b32 v10, v10, v27, 0x7060302
 ; GFX11-NEXT:    v_cndmask_b32_e32 v8, v83, v84, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX11-NEXT:    v_add3_u32 v119, v119, v3, 0x7fff
-; GFX11-NEXT:    v_perm_b32 v10, v10, v27, 0x7060302
 ; GFX11-NEXT:    v_perm_b32 v11, v11, v28, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v12, v12, v29, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v13, v13, v30, 0x7060302
 ; GFX11-NEXT:    v_perm_b32 v8, v8, v25, 0x7060302
 ; GFX11-NEXT:    v_cndmask_b32_e32 v24, v85, v86, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT:    v_perm_b32 v12, v12, v29, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v13, v13, v30, 0x7060302
 ; GFX11-NEXT:    v_perm_b32 v14, v14, v16, 0x7060302
 ; GFX11-NEXT:    v_cndmask_b32_e32 v7, v87, v96, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
@@ -25477,22 +24869,21 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX11-NEXT:    v_perm_b32 v0, v0, v17, 0x7060302
 ; GFX11-NEXT:    v_cndmask_b32_e32 v2, v131, v132, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v2, v2, v19, 0x7060302
 ; GFX11-NEXT:    v_cndmask_b32_e32 v3, v119, v128, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_perm_b32 v3, v3, v20, 0x7060302
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v31
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xffff0000, v31
-; GFX11-NEXT:    v_perm_b32 v2, v2, v19, 0x7060302
-; GFX11-NEXT:    v_max_f32_e32 v17, v32, v17
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v32
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_max_f32 v17, v31, v17 :: v_dual_and_b32 v18, 0xffff0000, v32
 ; GFX11-NEXT:    v_max_f32_e32 v15, v15, v18
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_bfe_u32 v18, v17, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_bfe_u32 v19, v15, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v20, v17, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v17
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT:    v_and_or_b32 v21, v15, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v21, 0x400000, v15
 ; GFX11-NEXT:    v_add3_u32 v18, v18, v17, 0x7fff
 ; GFX11-NEXT:    v_add3_u32 v19, v19, v15, 0x7fff
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
@@ -25586,8 +24977,7 @@ define bfloat @v_sqrt_bf16(bfloat %a) {
 ; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -25617,9 +25007,8 @@ define bfloat @v_sqrt_bf16(bfloat %a) {
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xff800000, v0
 ; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -25641,13 +25030,12 @@ define bfloat @v_sqrt_bf16(bfloat %a) {
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s4
 ; GFX10-NEXT:    v_cmp_lt_f32_e64 s4, 0, v5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s4
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
 ; GFX10-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
 ; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v0, 0x260
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v2, v0, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
@@ -25674,9 +25062,8 @@ define bfloat @v_sqrt_bf16(bfloat %a) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s0
 ; GFX11-NEXT:    v_cmp_lt_f32_e64 s0, 0, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s0
-; GFX11-NEXT:    s_mov_b32 s0, 0xff800000
 ; GFX11-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
@@ -25684,7 +25071,7 @@ define bfloat @v_sqrt_bf16(bfloat %a) {
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v2, v0, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -25724,8 +25111,7 @@ define bfloat @v_ldexp_bf16_i32(bfloat %a, i32 %b) {
 ; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -25738,9 +25124,8 @@ define bfloat @v_ldexp_bf16_i32(bfloat %a, i32 %b) {
 ; GFX9-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xff800000, v0
 ; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -25750,10 +25135,9 @@ define bfloat @v_ldexp_bf16_i32(bfloat %a, i32 %b) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
 ; GFX10-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v2, v0, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
@@ -25764,11 +25148,10 @@ define bfloat @v_ldexp_bf16_i32(bfloat %a, i32 %b) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:    s_mov_b32 s0, 0xff800000
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v2, v0, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
@@ -25816,8 +25199,7 @@ define { bfloat, i16 } @v_frexp_bf16_i16(bfloat %a) {
 ; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v3
+; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -25831,9 +25213,8 @@ define { bfloat, i16 } @v_frexp_bf16_i16(bfloat %a) {
 ; GFX9-NEXT:    v_frexp_mant_f32_e32 v0, v1
 ; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xff800000, v0
 ; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v3
+; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -25844,11 +25225,10 @@ define { bfloat, i16 } @v_frexp_bf16_i16(bfloat %a) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
 ; GFX10-NEXT:    v_frexp_mant_f32_e32 v0, v1
 ; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
 ; GFX10-NEXT:    v_bfe_u32 v2, v0, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v3, v0, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v3, 0x400000, v0
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX10-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
@@ -25947,8 +25327,7 @@ define bfloat @v_log_bf16(bfloat %a) {
 ; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -25978,9 +25357,8 @@ define bfloat @v_log_bf16(bfloat %a) {
 ; GFX9-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xff800000, v0
 ; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -25990,7 +25368,6 @@ define bfloat @v_log_bf16(bfloat %a) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
@@ -26004,7 +25381,7 @@ define bfloat @v_log_bf16(bfloat %a) {
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX10-NEXT:    v_sub_f32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v2, v0, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
@@ -26015,7 +25392,6 @@ define bfloat @v_log_bf16(bfloat %a) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:    s_mov_b32 s0, 0xff800000
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
@@ -26035,7 +25411,7 @@ define bfloat @v_log_bf16(bfloat %a) {
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v2, v0, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -26097,8 +25473,7 @@ define bfloat @v_log2_bf16(bfloat %a) {
 ; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -26119,9 +25494,8 @@ define bfloat @v_log2_bf16(bfloat %a) {
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX9-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xff800000, v0
 ; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -26131,7 +25505,6 @@ define bfloat @v_log2_bf16(bfloat %a) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
@@ -26139,7 +25512,7 @@ define bfloat @v_log2_bf16(bfloat %a) {
 ; GFX10-NEXT:    v_log_f32_e32 v0, v0
 ; GFX10-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v2, v0, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
@@ -26150,7 +25523,6 @@ define bfloat @v_log2_bf16(bfloat %a) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:    s_mov_b32 s0, 0xff800000
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo
@@ -26161,7 +25533,7 @@ define bfloat @v_log2_bf16(bfloat %a) {
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v2, v0, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
@@ -26257,8 +25629,7 @@ define bfloat @v_log10_bf16(bfloat %a) {
 ; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -26288,9 +25659,8 @@ define bfloat @v_log10_bf16(bfloat %a) {
 ; GFX9-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xff800000, v0
 ; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -26300,7 +25670,6 @@ define bfloat @v_log10_bf16(bfloat %a) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
@@ -26314,7 +25683,7 @@ define bfloat @v_log10_bf16(bfloat %a) {
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX10-NEXT:    v_sub_f32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v2, v0, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
@@ -26325,7 +25694,6 @@ define bfloat @v_log10_bf16(bfloat %a) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:    s_mov_b32 s0, 0xff800000
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
@@ -26345,7 +25713,7 @@ define bfloat @v_log10_bf16(bfloat %a) {
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v2, v0, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -26442,8 +25810,7 @@ define bfloat @v_exp_bf16(bfloat %a) {
 ; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -26473,9 +25840,8 @@ define bfloat @v_exp_bf16(bfloat %a) {
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xff800000, v0
 ; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -26485,7 +25851,6 @@ define bfloat @v_exp_bf16(bfloat %a) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
 ; GFX10-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
 ; GFX10-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, 0xc2ce8ed0, v0
 ; GFX10-NEXT:    v_rndne_f32_e32 v2, v1
@@ -26500,7 +25865,7 @@ define bfloat @v_exp_bf16(bfloat %a) {
 ; GFX10-NEXT:    v_cmp_nlt_f32_e32 vcc_lo, 0x42b17218, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7f800000, v1, vcc_lo
 ; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v2, v0, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
@@ -26511,7 +25876,6 @@ define bfloat @v_exp_bf16(bfloat %a) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:    s_mov_b32 s0, 0xff800000
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
 ; GFX11-NEXT:    v_rndne_f32_e32 v2, v1
@@ -26532,7 +25896,7 @@ define bfloat @v_exp_bf16(bfloat %a) {
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7f800000, v1, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v2, v0, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -26594,8 +25958,7 @@ define bfloat @v_exp2_bf16(bfloat %a) {
 ; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -26616,9 +25979,8 @@ define bfloat @v_exp2_bf16(bfloat %a) {
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xff800000, v0
 ; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -26628,7 +25990,6 @@ define bfloat @v_exp2_bf16(bfloat %a) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
@@ -26636,7 +25997,7 @@ define bfloat @v_exp2_bf16(bfloat %a) {
 ; GFX10-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v2, v0, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
@@ -26647,7 +26008,6 @@ define bfloat @v_exp2_bf16(bfloat %a) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:    s_mov_b32 s0, 0xff800000
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo
@@ -26658,7 +26018,7 @@ define bfloat @v_exp2_bf16(bfloat %a) {
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v2, v0, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
@@ -26752,8 +26112,7 @@ define bfloat @v_exp10_bf16(bfloat %a) {
 ; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -26783,9 +26142,8 @@ define bfloat @v_exp10_bf16(bfloat %a) {
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xff800000, v0
 ; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -26795,7 +26153,6 @@ define bfloat @v_exp10_bf16(bfloat %a) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
 ; GFX10-NEXT:    v_mul_f32_e32 v1, 0x40549a78, v0
 ; GFX10-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, 0xc23369f4, v0
 ; GFX10-NEXT:    v_rndne_f32_e32 v2, v1
@@ -26810,7 +26167,7 @@ define bfloat @v_exp10_bf16(bfloat %a) {
 ; GFX10-NEXT:    v_cmp_nlt_f32_e32 vcc_lo, 0x421a209b, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7f800000, v1, vcc_lo
 ; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v2, v0, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
@@ -26821,7 +26178,6 @@ define bfloat @v_exp10_bf16(bfloat %a) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:    s_mov_b32 s0, 0xff800000
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_mul_f32_e32 v1, 0x40549a78, v0
 ; GFX11-NEXT:    v_rndne_f32_e32 v2, v1
@@ -26842,7 +26198,7 @@ define bfloat @v_exp10_bf16(bfloat %a) {
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7f800000, v1, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v2, v0, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -26882,8 +26238,7 @@ define bfloat @v_ceil_bf16(bfloat %a) {
 ; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -26896,9 +26251,8 @@ define bfloat @v_ceil_bf16(bfloat %a) {
 ; GFX9-NEXT:    v_ceil_f32_e32 v0, v0
 ; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xff800000, v0
 ; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -26908,10 +26262,9 @@ define bfloat @v_ceil_bf16(bfloat %a) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
 ; GFX10-NEXT:    v_ceil_f32_e32 v0, v0
 ; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v2, v0, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
@@ -26922,11 +26275,10 @@ define bfloat @v_ceil_bf16(bfloat %a) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:    s_mov_b32 s0, 0xff800000
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_ceil_f32_e32 v0, v0
 ; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v2, v0, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
@@ -26967,8 +26319,7 @@ define bfloat @v_trunc_bf16(bfloat %a) {
 ; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -26981,9 +26332,8 @@ define bfloat @v_trunc_bf16(bfloat %a) {
 ; GFX9-NEXT:    v_trunc_f32_e32 v0, v0
 ; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xff800000, v0
 ; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -26993,10 +26343,9 @@ define bfloat @v_trunc_bf16(bfloat %a) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
 ; GFX10-NEXT:    v_trunc_f32_e32 v0, v0
 ; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v2, v0, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
@@ -27007,11 +26356,10 @@ define bfloat @v_trunc_bf16(bfloat %a) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:    s_mov_b32 s0, 0xff800000
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_trunc_f32_e32 v0, v0
 ; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v2, v0, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
@@ -27052,8 +26400,7 @@ define bfloat @v_rint_bf16(bfloat %a) {
 ; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -27066,9 +26413,8 @@ define bfloat @v_rint_bf16(bfloat %a) {
 ; GFX9-NEXT:    v_rndne_f32_e32 v0, v0
 ; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xff800000, v0
 ; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -27078,10 +26424,9 @@ define bfloat @v_rint_bf16(bfloat %a) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
 ; GFX10-NEXT:    v_rndne_f32_e32 v0, v0
 ; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v2, v0, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
@@ -27092,11 +26437,10 @@ define bfloat @v_rint_bf16(bfloat %a) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:    s_mov_b32 s0, 0xff800000
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_rndne_f32_e32 v0, v0
 ; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v2, v0, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
@@ -27137,8 +26481,7 @@ define bfloat @v_nearbyint_bf16(bfloat %a) {
 ; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -27151,9 +26494,8 @@ define bfloat @v_nearbyint_bf16(bfloat %a) {
 ; GFX9-NEXT:    v_rndne_f32_e32 v0, v0
 ; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xff800000, v0
 ; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -27163,10 +26505,9 @@ define bfloat @v_nearbyint_bf16(bfloat %a) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
 ; GFX10-NEXT:    v_rndne_f32_e32 v0, v0
 ; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v2, v0, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
@@ -27177,11 +26518,10 @@ define bfloat @v_nearbyint_bf16(bfloat %a) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:    s_mov_b32 s0, 0xff800000
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_rndne_f32_e32 v0, v0
 ; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v2, v0, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
@@ -27240,8 +26580,7 @@ define bfloat @v_round_bf16(bfloat %a) {
 ; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -27260,9 +26599,8 @@ define bfloat @v_round_bf16(bfloat %a) {
 ; GFX9-NEXT:    v_add_f32_e32 v0, v1, v0
 ; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xff800000, v0
 ; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -27276,11 +26614,10 @@ define bfloat @v_round_bf16(bfloat %a) {
 ; GFX10-NEXT:    v_sub_f32_e32 v2, v0, v1
 ; GFX10-NEXT:    v_cmp_ge_f32_e64 s4, |v2|, 0.5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s4
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
 ; GFX10-NEXT:    v_bfi_b32 v0, 0x7fffffff, v2, v0
 ; GFX10-NEXT:    v_add_f32_e32 v0, v1, v0
 ; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v2, v0, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
@@ -27297,13 +26634,12 @@ define bfloat @v_round_bf16(bfloat %a) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_ge_f32_e64 s0, |v2|, 0.5
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s0
-; GFX11-NEXT:    s_mov_b32 s0, 0xff800000
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, v2, v0
 ; GFX11-NEXT:    v_add_f32_e32 v0, v1, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v2, v0, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -27343,8 +26679,7 @@ define bfloat @v_roundeven_bf16(bfloat %a) {
 ; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -27357,9 +26692,8 @@ define bfloat @v_roundeven_bf16(bfloat %a) {
 ; GFX9-NEXT:    v_rndne_f32_e32 v0, v0
 ; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xff800000, v0
 ; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -27369,10 +26703,9 @@ define bfloat @v_roundeven_bf16(bfloat %a) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
 ; GFX10-NEXT:    v_rndne_f32_e32 v0, v0
 ; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v2, v0, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
@@ -27383,11 +26716,10 @@ define bfloat @v_roundeven_bf16(bfloat %a) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:    s_mov_b32 s0, 0xff800000
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_rndne_f32_e32 v0, v0
 ; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v2, v0, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
@@ -27428,8 +26760,7 @@ define bfloat @v_floor_bf16(bfloat %a) {
 ; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -27442,9 +26773,8 @@ define bfloat @v_floor_bf16(bfloat %a) {
 ; GFX9-NEXT:    v_floor_f32_e32 v0, v0
 ; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xff800000, v0
 ; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -27454,10 +26784,9 @@ define bfloat @v_floor_bf16(bfloat %a) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
 ; GFX10-NEXT:    v_floor_f32_e32 v0, v0
 ; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v2, v0, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
@@ -27468,11 +26797,10 @@ define bfloat @v_floor_bf16(bfloat %a) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:    s_mov_b32 s0, 0xff800000
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_floor_f32_e32 v0, v0
 ; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v2, v0, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
@@ -27505,8 +26833,7 @@ define bfloat @v_canonicalize_bf16(bfloat %a) {
 ; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -27519,9 +26846,8 @@ define bfloat @v_canonicalize_bf16(bfloat %a) {
 ; GFX9-NEXT:    v_max_f32_e32 v0, v0, v0
 ; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xff800000, v0
 ; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -27531,10 +26857,9 @@ define bfloat @v_canonicalize_bf16(bfloat %a) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
 ; GFX10-NEXT:    v_max_f32_e32 v0, v0, v0
 ; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v2, v0, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
@@ -27545,11 +26870,10 @@ define bfloat @v_canonicalize_bf16(bfloat %a) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:    s_mov_b32 s0, 0xff800000
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_max_f32_e32 v0, v0, v0
 ; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v2, v0, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
@@ -31058,9 +30382,8 @@ define bfloat @v_sitofp_i16_to_bf16(i16 %x) {
 ; GFX8-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
 ; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xff800000, v0
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
-; GFX8-NEXT:    v_or_b32_e32 v1, 0x400000, v1
+; GFX8-NEXT:    v_or_b32_e32 v1, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -31072,9 +30395,8 @@ define bfloat @v_sitofp_i16_to_bf16(i16 %x) {
 ; GFX9-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xff800000, v0
+; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v2
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -31084,9 +30406,8 @@ define bfloat @v_sitofp_i16_to_bf16(i16 %x) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
 ; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v2, v0, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
@@ -31097,11 +30418,10 @@ define bfloat @v_sitofp_i16_to_bf16(i16 %x) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_bfe_i32 v0, v0, 0, 16
-; GFX11-NEXT:    s_mov_b32 s0, 0xff800000
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, v0
 ; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v2, v0, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
@@ -31143,16 +30463,14 @@ define <2 x bfloat> @v_sitofp_v2i16_to_v2bf16(<2 x i16> %x) {
 ; GFX8-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX8-NEXT:    v_bfe_u32 v3, v1, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v1
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xff800000, v1
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
-; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v1
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
 ; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v3
+; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -31166,15 +30484,13 @@ define <2 x bfloat> @v_sitofp_v2i16_to_v2bf16(<2 x i16> %x) {
 ; GFX9-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX9-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xff800000, v1
+; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v1
 ; GFX9-NEXT:    v_add3_u32 v2, v2, v1, s4
-; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v3
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
 ; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xff800000, v0
 ; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v3
+; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
@@ -31186,12 +30502,11 @@ define <2 x bfloat> @v_sitofp_v2i16_to_v2bf16(<2 x i16> %x) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_i32_sdwa v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
 ; GFX10-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
 ; GFX10-NEXT:    v_bfe_u32 v2, v1, 16, 1
 ; GFX10-NEXT:    v_bfe_u32 v3, v0, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v4, v1, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v4, 0x400000, v1
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT:    v_and_or_b32 v5, v0, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v0
 ; GFX10-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
 ; GFX10-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
@@ -31205,16 +30520,15 @@ define <2 x bfloat> @v_sitofp_v2i16_to_v2bf16(<2 x i16> %x) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_bfe_i32 v1, v0, 0, 16
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
-; GFX11-NEXT:    s_mov_b32 s0, 0xff800000
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cvt_f32_i32_e32 v1, v1
 ; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_bfe_u32 v2, v1, 16, 1
 ; GFX11-NEXT:    v_bfe_u32 v3, v0, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v4, v1, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v4, 0x400000, v1
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT:    v_and_or_b32 v5, v0, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v0
 ; GFX11-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
 ; GFX11-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
@@ -31263,26 +30577,22 @@ define <3 x bfloat> @v_sitofp_v3i16_to_v3bf16(<3 x i16> %x) {
 ; GFX8-NEXT:    v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
 ; GFX8-NEXT:    v_cvt_f32_i32_sdwa v4, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
 ; GFX8-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX8-NEXT:    v_bfe_u32 v3, v1, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v1
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xff800000, v1
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
-; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v1
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
 ; GFX8-NEXT:    v_bfe_u32 v3, v4, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v4
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xff800000, v4
-; GFX8-NEXT:    v_add_u32_e32 v3, vcc, s4, v3
-; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
+; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v4
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
 ; GFX8-NEXT:    v_bfe_u32 v3, v0, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v0
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -31294,25 +30604,22 @@ define <3 x bfloat> @v_sitofp_v3i16_to_v3bf16(<3 x i16> %x) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX9-NEXT:    v_cvt_f32_i32_sdwa v4, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX9-NEXT:    v_cvt_f32_i32_sdwa v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX9-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xff800000, v1
-; GFX9-NEXT:    v_add3_u32 v2, v2, v1, s4
-; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v3
+; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
+; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v1
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT:    v_bfe_u32 v2, v4, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xff800000, v4
-; GFX9-NEXT:    v_add3_u32 v2, v2, v4, s4
-; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v3
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX9-NEXT:    v_add3_u32 v3, v3, v2, s4
+; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v2
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
 ; GFX9-NEXT:    v_bfe_u32 v3, v0, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xff800000, v0
 ; GFX9-NEXT:    v_add3_u32 v3, v3, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v4
+; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
@@ -31325,24 +30632,23 @@ define <3 x bfloat> @v_sitofp_v3i16_to_v3bf16(<3 x i16> %x) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_i32_sdwa v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
 ; GFX10-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
 ; GFX10-NEXT:    v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
 ; GFX10-NEXT:    v_bfe_u32 v3, v2, 16, 1
 ; GFX10-NEXT:    v_bfe_u32 v5, v0, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v7, v2, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v7, 0x400000, v2
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX10-NEXT:    v_bfe_u32 v6, v1, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v4, v1, 16, 1
 ; GFX10-NEXT:    v_add3_u32 v3, v3, v2, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v8, v0, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v0
 ; GFX10-NEXT:    v_add3_u32 v5, v5, v0, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v4, v1, s4, 0x400000
-; GFX10-NEXT:    v_add3_u32 v6, v6, v1, 0x7fff
+; GFX10-NEXT:    v_or_b32_e32 v6, 0x400000, v1
+; GFX10-NEXT:    v_add3_u32 v4, v4, v1, 0x7fff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v3, v7, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v5, v8, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
 ; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v6, v4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v6, vcc_lo
 ; GFX10-NEXT:    v_alignbit_b32 v1, s4, v1, 16
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %op = sitofp <3 x i16> %x to <3 x bfloat>
@@ -31393,31 +30699,27 @@ define <4 x bfloat> @v_sitofp_v4i16_to_v4bf16(<4 x i16> %x) {
 ; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX8-NEXT:    v_bfe_u32 v4, v2, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v2
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xff800000, v2
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7fff, v4
-; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v3
+; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v2
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v3, vcc
 ; GFX8-NEXT:    v_bfe_u32 v3, v1, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v1
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, s4, v3
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xff800000, v1
-; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v1
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
 ; GFX8-NEXT:    v_bfe_u32 v4, v5, 16, 1
 ; GFX8-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v5
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xff800000, v5
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, s4, v4
-; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v3
+; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v5
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
 ; GFX8-NEXT:    v_bfe_u32 v4, v0, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v0
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7fff, v4
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
@@ -31430,32 +30732,28 @@ define <4 x bfloat> @v_sitofp_v4i16_to_v4bf16(<4 x i16> %x) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_i32_sdwa v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX9-NEXT:    v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_cvt_f32_i32_sdwa v5, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX9-NEXT:    v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX9-NEXT:    v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xff800000, v2
 ; GFX9-NEXT:    v_add3_u32 v3, v3, v2, s4
-; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v4
+; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v2
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xff800000, v1
+; GFX9-NEXT:    v_cvt_f32_i32_sdwa v4, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
 ; GFX9-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
 ; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v4
+; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v1
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT:    v_bfe_u32 v3, v5, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xff800000, v5
-; GFX9-NEXT:    v_add3_u32 v3, v3, v5, s4
-; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v4
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc
+; GFX9-NEXT:    v_bfe_u32 v3, v4, 16, 1
+; GFX9-NEXT:    v_add3_u32 v3, v3, v4, s4
+; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v4
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX9-NEXT:    v_bfe_u32 v4, v0, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xff800000, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
 ; GFX9-NEXT:    v_add3_u32 v4, v4, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
@@ -31468,30 +30766,29 @@ define <4 x bfloat> @v_sitofp_v4i16_to_v4bf16(<4 x i16> %x) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_i32_sdwa v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
 ; GFX10-NEXT:    v_cvt_f32_i32_sdwa v3, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
 ; GFX10-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX10-NEXT:    v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_bfe_u32 v5, v2, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v4, v2, s4, 0x400000
-; GFX10-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v4, v2, 16, 1
+; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v2
+; GFX10-NEXT:    v_bfe_u32 v8, v3, 16, 1
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX10-NEXT:    v_and_or_b32 v6, v3, s4, 0x400000
-; GFX10-NEXT:    v_add3_u32 v5, v5, v2, 0x7fff
+; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v3
+; GFX10-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
 ; GFX10-NEXT:    v_bfe_u32 v10, v0, 16, 1
-; GFX10-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
-; GFX10-NEXT:    v_bfe_u32 v8, v1, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v11, v0, s4, 0x400000
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v5, v4, vcc_lo
+; GFX10-NEXT:    v_add3_u32 v8, v8, v3, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v6, v1, 16, 1
+; GFX10-NEXT:    v_or_b32_e32 v11, 0x400000, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX10-NEXT:    v_add3_u32 v10, v10, v0, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v9, v1, s4, 0x400000
-; GFX10-NEXT:    v_add3_u32 v8, v8, v1, 0x7fff
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v7, v6, vcc_lo
+; GFX10-NEXT:    v_or_b32_e32 v7, 0x400000, v1
+; GFX10-NEXT:    v_add3_u32 v6, v6, v1, 0x7fff
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v8, v9, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v10, v11, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
 ; GFX10-NEXT:    v_perm_b32 v0, v0, v3, 0x7060302
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v8, v9, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v6, v7, vcc_lo
 ; GFX10-NEXT:    v_perm_b32 v1, v1, v2, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -31499,37 +30796,39 @@ define <4 x bfloat> @v_sitofp_v4i16_to_v4bf16(<4 x i16> %x) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_bfe_i32 v2, v1, 0, 16
+; GFX11-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
 ; GFX11-NEXT:    v_bfe_i32 v3, v0, 0, 16
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
-; GFX11-NEXT:    s_mov_b32 s0, 0xff800000
-; GFX11-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_cvt_f32_i32_e32 v2, v2
+; GFX11-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_cvt_f32_i32_e32 v3, v3
 ; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cvt_f32_i32_e32 v1, v1
-; GFX11-NEXT:    v_bfe_u32 v5, v2, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v4, v2, s0, 0x400000
-; GFX11-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_bfe_u32 v4, v2, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v2
+; GFX11-NEXT:    v_bfe_u32 v6, v1, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v8, v3, 16, 1
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT:    v_and_or_b32 v6, v3, s0, 0x400000
-; GFX11-NEXT:    v_add3_u32 v5, v5, v2, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
+; GFX11-NEXT:    v_or_b32_e32 v7, 0x400000, v1
+; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v3
 ; GFX11-NEXT:    v_bfe_u32 v10, v0, 16, 1
-; GFX11-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
-; GFX11-NEXT:    v_bfe_u32 v8, v1, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v11, v0, s0, 0x400000
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v5, v4, vcc_lo
+; GFX11-NEXT:    v_add3_u32 v6, v6, v1, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v8, v8, v3, 0x7fff
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT:    v_or_b32_e32 v11, 0x400000, v0
 ; GFX11-NEXT:    v_add3_u32 v10, v10, v0, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v9, v1, s0, 0x400000
-; GFX11-NEXT:    v_add3_u32 v8, v8, v1, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v7, v6, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, v8, v9, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v10, v11, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v8, v9, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_perm_b32 v0, v0, v3, 0x7060302
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v6, v7, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_perm_b32 v1, v1, v2, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = sitofp <4 x i16> %x to <4 x bfloat>
@@ -31557,9 +30856,8 @@ define bfloat @v_sitofp_i32_to_bf16(i32 %x) {
 ; GFX8-NEXT:    v_cvt_f32_i32_e32 v0, v0
 ; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xff800000, v0
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
-; GFX8-NEXT:    v_or_b32_e32 v1, 0x400000, v1
+; GFX8-NEXT:    v_or_b32_e32 v1, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -31571,9 +30869,8 @@ define bfloat @v_sitofp_i32_to_bf16(i32 %x) {
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, v0
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xff800000, v0
+; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v2
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -31583,9 +30880,8 @@ define bfloat @v_sitofp_i32_to_bf16(i32 %x) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_i32_e32 v0, v0
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
 ; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v2, v0, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
@@ -31596,10 +30892,9 @@ define bfloat @v_sitofp_i32_to_bf16(i32 %x) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, v0
-; GFX11-NEXT:    s_mov_b32 s0, 0xff800000
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v2, v0, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -31636,16 +30931,14 @@ define <2 x bfloat> @v_sitofp_v2i32_to_v2bf16(<2 x i32> %x) {
 ; GFX8-NEXT:    v_cvt_f32_i32_e32 v1, v1
 ; GFX8-NEXT:    v_bfe_u32 v3, v0, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v0
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xff800000, v0
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
-; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX8-NEXT:    v_bfe_u32 v3, v1, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v1
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xff800000, v1
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
-; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v1
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
@@ -31659,15 +30952,13 @@ define <2 x bfloat> @v_sitofp_v2i32_to_v2bf16(<2 x i32> %x) {
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, v1
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xff800000, v0
 ; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v3
+; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
 ; GFX9-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xff800000, v1
 ; GFX9-NEXT:    v_add3_u32 v2, v2, v1, s4
-; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v3
+; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v1
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
@@ -31679,12 +30970,11 @@ define <2 x bfloat> @v_sitofp_v2i32_to_v2bf16(<2 x i32> %x) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_i32_e32 v0, v0
 ; GFX10-NEXT:    v_cvt_f32_i32_e32 v1, v1
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
 ; GFX10-NEXT:    v_bfe_u32 v2, v0, 16, 1
 ; GFX10-NEXT:    v_bfe_u32 v3, v1, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v4, v0, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v4, 0x400000, v0
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT:    v_and_or_b32 v5, v1, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v1
 ; GFX10-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
 ; GFX10-NEXT:    v_add3_u32 v3, v3, v1, 0x7fff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc_lo
@@ -31698,13 +30988,12 @@ define <2 x bfloat> @v_sitofp_v2i32_to_v2bf16(<2 x i32> %x) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, v0
 ; GFX11-NEXT:    v_cvt_f32_i32_e32 v1, v1
-; GFX11-NEXT:    s_mov_b32 s0, 0xff800000
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_bfe_u32 v2, v0, 16, 1
 ; GFX11-NEXT:    v_bfe_u32 v3, v1, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v4, v0, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v4, 0x400000, v0
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    v_and_or_b32 v5, v1, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v1
 ; GFX11-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
 ; GFX11-NEXT:    v_add3_u32 v3, v3, v1, 0x7fff
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
@@ -31749,23 +31038,20 @@ define <3 x bfloat> @v_sitofp_v3i32_to_v3bf16(<3 x i32> %x) {
 ; GFX8-NEXT:    v_cvt_f32_i32_e32 v1, v1
 ; GFX8-NEXT:    v_bfe_u32 v4, v2, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v2
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xff800000, v2
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7fff, v4
-; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v3
+; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v2
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v3, vcc
 ; GFX8-NEXT:    v_bfe_u32 v4, v0, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v0
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xff800000, v0
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7fff, v4
-; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v3
+; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
 ; GFX8-NEXT:    v_bfe_u32 v4, v1, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v1
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xff800000, v1
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7fff, v4
-; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v3
+; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v1
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
@@ -31782,21 +31068,18 @@ define <3 x bfloat> @v_sitofp_v3i32_to_v3bf16(<3 x i32> %x) {
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, v1
 ; GFX9-NEXT:    v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xff800000, v2
 ; GFX9-NEXT:    v_add3_u32 v3, v3, v2, s4
-; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v4
+; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v2
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
 ; GFX9-NEXT:    v_bfe_u32 v3, v0, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xff800000, v0
 ; GFX9-NEXT:    v_add3_u32 v3, v3, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v4
+; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
 ; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xff800000, v1
 ; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v4
+; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v1
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
@@ -31809,24 +31092,23 @@ define <3 x bfloat> @v_sitofp_v3i32_to_v3bf16(<3 x i32> %x) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_i32_e32 v0, v0
 ; GFX10-NEXT:    v_cvt_f32_i32_e32 v1, v1
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
 ; GFX10-NEXT:    v_cvt_f32_i32_e32 v2, v2
 ; GFX10-NEXT:    v_bfe_u32 v3, v0, 16, 1
 ; GFX10-NEXT:    v_bfe_u32 v5, v1, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v7, v0, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v7, 0x400000, v0
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT:    v_bfe_u32 v6, v2, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v4, v2, 16, 1
 ; GFX10-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v8, v1, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v1
 ; GFX10-NEXT:    v_add3_u32 v5, v5, v1, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v4, v2, s4, 0x400000
-; GFX10-NEXT:    v_add3_u32 v6, v6, v2, 0x7fff
+; GFX10-NEXT:    v_or_b32_e32 v6, 0x400000, v2
+; GFX10-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v7, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v8, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
 ; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v6, v4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v4, v6, vcc_lo
 ; GFX10-NEXT:    v_alignbit_b32 v1, s4, v2, 16
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %op = sitofp <3 x i32> %x to <3 x bfloat>
@@ -31869,31 +31151,27 @@ define <4 x bfloat> @v_sitofp_v4i32_to_v4bf16(<4 x i32> %x) {
 ; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX8-NEXT:    v_bfe_u32 v5, v2, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v2
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xff800000, v2
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x7fff, v5
-; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v2
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v5, v4, vcc
 ; GFX8-NEXT:    v_bfe_u32 v5, v3, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v3
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xff800000, v3
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s4, v5
-; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v3
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v5, v4, vcc
 ; GFX8-NEXT:    v_bfe_u32 v5, v0, 16, 1
 ; GFX8-NEXT:    v_cvt_f32_i32_e32 v1, v1
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v0
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xff800000, v0
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s4, v5
-; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x7fff, v5
+; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
 ; GFX8-NEXT:    v_bfe_u32 v5, v1, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v1
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xff800000, v1
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x7fff, v5
-; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v1
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
@@ -31910,28 +31188,24 @@ define <4 x bfloat> @v_sitofp_v4i32_to_v4bf16(<4 x i32> %x) {
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, v0
 ; GFX9-NEXT:    v_bfe_u32 v4, v2, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xff800000, v2
 ; GFX9-NEXT:    v_add3_u32 v4, v4, v2, s4
-; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v2
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc
 ; GFX9-NEXT:    v_bfe_u32 v4, v3, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xff800000, v3
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, v1
 ; GFX9-NEXT:    v_add3_u32 v4, v4, v3, s4
-; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v3
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
 ; GFX9-NEXT:    v_bfe_u32 v4, v0, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xff800000, v0
 ; GFX9-NEXT:    v_add3_u32 v4, v4, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
 ; GFX9-NEXT:    v_bfe_u32 v4, v1, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xff800000, v1
 ; GFX9-NEXT:    v_add3_u32 v4, v4, v1, s4
-; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v1
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v5, vcc
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
@@ -31944,30 +31218,29 @@ define <4 x bfloat> @v_sitofp_v4i32_to_v4bf16(<4 x i32> %x) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_i32_e32 v2, v2
 ; GFX10-NEXT:    v_cvt_f32_i32_e32 v0, v0
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
 ; GFX10-NEXT:    v_cvt_f32_i32_e32 v1, v1
 ; GFX10-NEXT:    v_cvt_f32_i32_e32 v3, v3
-; GFX10-NEXT:    v_bfe_u32 v5, v2, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v4, v2, s4, 0x400000
-; GFX10-NEXT:    v_bfe_u32 v8, v0, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v4, v2, 16, 1
+; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v2
+; GFX10-NEXT:    v_bfe_u32 v7, v0, 16, 1
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX10-NEXT:    v_and_or_b32 v9, v0, s4, 0x400000
-; GFX10-NEXT:    v_add3_u32 v5, v5, v2, 0x7fff
-; GFX10-NEXT:    v_bfe_u32 v10, v1, 16, 1
-; GFX10-NEXT:    v_add3_u32 v8, v8, v0, 0x7fff
-; GFX10-NEXT:    v_bfe_u32 v7, v3, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v11, v1, s4, 0x400000
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v5, v4, vcc_lo
+; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v0
+; GFX10-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v9, v1, 16, 1
+; GFX10-NEXT:    v_add3_u32 v7, v7, v0, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v6, v3, 16, 1
+; GFX10-NEXT:    v_or_b32_e32 v10, 0x400000, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT:    v_add3_u32 v4, v10, v1, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v6, v3, s4, 0x400000
-; GFX10-NEXT:    v_add3_u32 v5, v7, v3, 0x7fff
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v8, v9, vcc_lo
+; GFX10-NEXT:    v_add3_u32 v9, v9, v1, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v6, v6, v3, 0x7fff
+; GFX10-NEXT:    v_or_b32_e32 v4, 0x400000, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v7, v8, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v11, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v9, v10, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v6, v4, vcc_lo
 ; GFX10-NEXT:    v_perm_b32 v1, v3, v2, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -31976,32 +31249,32 @@ define <4 x bfloat> @v_sitofp_v4i32_to_v4bf16(<4 x i32> %x) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_i32_e32 v2, v2
 ; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, v0
-; GFX11-NEXT:    s_mov_b32 s0, 0xff800000
-; GFX11-NEXT:    v_cvt_f32_i32_e32 v3, v3
 ; GFX11-NEXT:    v_cvt_f32_i32_e32 v1, v1
-; GFX11-NEXT:    v_bfe_u32 v5, v2, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v4, v2, s0, 0x400000
-; GFX11-NEXT:    v_bfe_u32 v8, v0, 16, 1
+; GFX11-NEXT:    v_cvt_f32_i32_e32 v3, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_bfe_u32 v4, v2, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v2
+; GFX11-NEXT:    v_bfe_u32 v7, v0, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v9, v1, 16, 1
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT:    v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT:    v_add3_u32 v5, v5, v2, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v9, v0, s0, 0x400000
-; GFX11-NEXT:    v_bfe_u32 v10, v1, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v11, v1, s0, 0x400000
-; GFX11-NEXT:    v_add3_u32 v8, v8, v0, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v5, v4, vcc_lo
+; GFX11-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
+; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v0
+; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v1
+; GFX11-NEXT:    v_add3_u32 v7, v7, v0, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v9, v9, v1, 0x7fff
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    v_and_or_b32 v6, v3, s0, 0x400000
-; GFX11-NEXT:    v_add3_u32 v4, v10, v1, 0x7fff
-; GFX11-NEXT:    v_add3_u32 v5, v7, v3, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v8, v9, vcc_lo
+; GFX11-NEXT:    v_bfe_u32 v6, v3, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v4, 0x400000, v3
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v7, v8, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v4, v11, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v9, v10, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc_lo
 ; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, v6, v4, vcc_lo
 ; GFX11-NEXT:    v_perm_b32 v1, v3, v2, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = sitofp <4 x i32> %x to <4 x bfloat>
@@ -32063,8 +31336,7 @@ define bfloat @v_sitofp_i64_to_bf16(i64 %x) {
 ; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -32087,9 +31359,8 @@ define bfloat @v_sitofp_i64_to_bf16(i64 %x) {
 ; GFX9-NEXT:    v_sub_u32_e32 v1, 32, v2
 ; GFX9-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xff800000, v0
 ; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -32100,7 +31371,6 @@ define bfloat @v_sitofp_i64_to_bf16(i64 %x) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_xor_b32_e32 v2, v0, v1
 ; GFX10-NEXT:    v_ffbh_i32_e32 v3, v1
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v2, 31, v2
 ; GFX10-NEXT:    v_add_nc_u32_e32 v3, -1, v3
 ; GFX10-NEXT:    v_add_nc_u32_e32 v2, 32, v2
@@ -32112,7 +31382,7 @@ define bfloat @v_sitofp_i64_to_bf16(i64 %x) {
 ; GFX10-NEXT:    v_cvt_f32_i32_e32 v0, v0
 ; GFX10-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v2, v0, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
@@ -32124,7 +31394,6 @@ define bfloat @v_sitofp_i64_to_bf16(i64 %x) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_xor_b32_e32 v2, v0, v1
 ; GFX11-NEXT:    v_cls_i32_e32 v3, v1
-; GFX11-NEXT:    s_mov_b32 s0, 0xff800000
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v2, 31, v2
 ; GFX11-NEXT:    v_add_nc_u32_e32 v3, -1, v3
@@ -32141,7 +31410,7 @@ define bfloat @v_sitofp_i64_to_bf16(i64 %x) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v2, v0, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
@@ -32240,22 +31509,20 @@ define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) {
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, -1, v0
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 32, v1
-; GFX8-NEXT:    v_min_u32_e32 v7, v0, v1
-; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v7, v[2:3]
-; GFX8-NEXT:    v_and_b32_e32 v6, 0xff800000, v4
+; GFX8-NEXT:    v_min_u32_e32 v6, v0, v1
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v6, v[2:3]
+; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v4
 ; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
 ; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    v_cvt_f32_i32_e32 v0, v0
-; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v6
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
-; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 32, v7
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 32, v6
 ; GFX8-NEXT:    v_ldexp_f32 v0, v0, v2
 ; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v3
+; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -32285,21 +31552,19 @@ define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) {
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v1
 ; GFX9-NEXT:    v_add_u32_e32 v0, -1, v0
 ; GFX9-NEXT:    v_add_u32_e32 v1, 32, v1
-; GFX9-NEXT:    v_min_u32_e32 v7, v0, v1
-; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v7, v[2:3]
-; GFX9-NEXT:    v_and_b32_e32 v6, 0xff800000, v4
+; GFX9-NEXT:    v_min_u32_e32 v6, v0, v1
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v6, v[2:3]
+; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v4
 ; GFX9-NEXT:    v_min_u32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, v0
-; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v6
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
-; GFX9-NEXT:    v_sub_u32_e32 v2, 32, v7
+; GFX9-NEXT:    v_sub_u32_e32 v2, 32, v6
 ; GFX9-NEXT:    v_ldexp_f32 v0, v0, v2
 ; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xff800000, v0
 ; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v3
+; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
@@ -32313,7 +31578,6 @@ define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) {
 ; GFX10-NEXT:    v_xor_b32_e32 v5, v2, v3
 ; GFX10-NEXT:    v_ffbh_i32_e32 v6, v1
 ; GFX10-NEXT:    v_ffbh_i32_e32 v7, v3
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v4, 31, v4
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v5, 31, v5
 ; GFX10-NEXT:    v_add_nc_u32_e32 v6, -1, v6
@@ -32336,9 +31600,9 @@ define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) {
 ; GFX10-NEXT:    v_ldexp_f32 v1, v1, v3
 ; GFX10-NEXT:    v_bfe_u32 v2, v0, 16, 1
 ; GFX10-NEXT:    v_bfe_u32 v3, v1, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v4, v0, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v4, 0x400000, v0
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT:    v_and_or_b32 v5, v1, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v1
 ; GFX10-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
 ; GFX10-NEXT:    v_add3_u32 v3, v3, v1, 0x7fff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc_lo
@@ -32354,9 +31618,10 @@ define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) {
 ; GFX11-NEXT:    v_xor_b32_e32 v5, v2, v3
 ; GFX11-NEXT:    v_cls_i32_e32 v6, v1
 ; GFX11-NEXT:    v_cls_i32_e32 v7, v3
-; GFX11-NEXT:    s_mov_b32 s0, 0xff800000
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v4, 31, v4
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v5, 31, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_add_nc_u32_e32 v6, -1, v6
 ; GFX11-NEXT:    v_add_nc_u32_e32 v7, -1, v7
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
@@ -32385,9 +31650,9 @@ define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_bfe_u32 v2, v0, 16, 1
 ; GFX11-NEXT:    v_bfe_u32 v3, v1, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v4, v0, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v4, 0x400000, v0
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    v_and_or_b32 v5, v1, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v1
 ; GFX11-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
 ; GFX11-NEXT:    v_add3_u32 v3, v3, v1, 0x7fff
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
@@ -32515,23 +31780,22 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) {
 ; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX8-NEXT:    v_min_u32_e32 v7, v7, v8
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s4, v5
-; GFX8-NEXT:    v_and_b32_e32 v6, 0xff800000, v4
 ; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v7, v[0:1]
-; GFX8-NEXT:    v_or_b32_e32 v6, 0x400000, v6
+; GFX8-NEXT:    v_or_b32_e32 v6, 0x400000, v4
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v5, v6, vcc
 ; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX8-NEXT:    v_xor_b32_e32 v6, v2, v3
 ; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_ffbh_i32_e32 v5, v3
+; GFX8-NEXT:    v_ashrrev_i32_e32 v6, 31, v6
+; GFX8-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, -1, v5
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 32, v6
+; GFX8-NEXT:    v_min_u32_e32 v5, v5, v6
+; GFX8-NEXT:    v_lshlrev_b64 v[2:3], v5, v[2:3]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
 ; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, 32, v7
-; GFX8-NEXT:    v_xor_b32_e32 v7, v2, v3
-; GFX8-NEXT:    v_ffbh_i32_e32 v6, v3
-; GFX8-NEXT:    v_ashrrev_i32_e32 v7, 31, v7
-; GFX8-NEXT:    v_cvt_f32_i32_e32 v0, v0
-; GFX8-NEXT:    v_add_u32_e32 v6, vcc, -1, v6
-; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 32, v7
-; GFX8-NEXT:    v_min_u32_e32 v6, v6, v7
-; GFX8-NEXT:    v_lshlrev_b64 v[2:3], v6, v[2:3]
 ; GFX8-NEXT:    v_ldexp_f32 v0, v0, v4
 ; GFX8-NEXT:    v_min_u32_e32 v2, 1, v2
 ; GFX8-NEXT:    v_bfe_u32 v4, v0, 16, 1
@@ -32539,17 +31803,15 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) {
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v0
 ; GFX8-NEXT:    v_cvt_f32_i32_e32 v2, v2
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, s4, v4
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GFX8-NEXT:    v_or_b32_e32 v6, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
-; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, 32, v6
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc
+; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, 32, v5
 ; GFX8-NEXT:    v_ldexp_f32 v2, v2, v3
 ; GFX8-NEXT:    v_bfe_u32 v3, v2, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v2
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xff800000, v2
-; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v2
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
@@ -32565,30 +31827,29 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) {
 ; GFX9-NEXT:    v_add_u32_e32 v6, -1, v6
 ; GFX9-NEXT:    v_add_u32_e32 v7, 32, v7
 ; GFX9-NEXT:    v_min_u32_e32 v6, v6, v7
-; GFX9-NEXT:    v_xor_b32_e32 v8, v0, v1
 ; GFX9-NEXT:    v_lshlrev_b64 v[4:5], v6, v[4:5]
-; GFX9-NEXT:    v_ffbh_i32_e32 v7, v1
-; GFX9-NEXT:    v_ashrrev_i32_e32 v8, 31, v8
-; GFX9-NEXT:    v_add_u32_e32 v7, -1, v7
-; GFX9-NEXT:    v_add_u32_e32 v8, 32, v8
+; GFX9-NEXT:    v_xor_b32_e32 v7, v0, v1
 ; GFX9-NEXT:    v_min_u32_e32 v4, 1, v4
-; GFX9-NEXT:    v_min_u32_e32 v7, v7, v8
 ; GFX9-NEXT:    v_or_b32_e32 v4, v5, v4
-; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v7, v[0:1]
+; GFX9-NEXT:    v_sub_u32_e32 v5, 32, v6
+; GFX9-NEXT:    v_ffbh_i32_e32 v6, v1
+; GFX9-NEXT:    v_ashrrev_i32_e32 v7, 31, v7
+; GFX9-NEXT:    v_add_u32_e32 v6, -1, v6
+; GFX9-NEXT:    v_add_u32_e32 v7, 32, v7
+; GFX9-NEXT:    v_min_u32_e32 v6, v6, v7
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v6, v[0:1]
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v4, v4
 ; GFX9-NEXT:    v_min_u32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX9-NEXT:    v_sub_u32_e32 v5, 32, v6
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, v0
 ; GFX9-NEXT:    v_ldexp_f32 v4, v4, v5
 ; GFX9-NEXT:    v_bfe_u32 v5, v4, 16, 1
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_and_b32_e32 v6, 0xff800000, v4
 ; GFX9-NEXT:    v_add3_u32 v5, v5, v4, s4
-; GFX9-NEXT:    v_or_b32_e32 v6, 0x400000, v6
+; GFX9-NEXT:    v_or_b32_e32 v7, 0x400000, v4
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
-; GFX9-NEXT:    v_sub_u32_e32 v1, 32, v7
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v6, vcc
+; GFX9-NEXT:    v_sub_u32_e32 v1, 32, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v7, vcc
 ; GFX9-NEXT:    v_ldexp_f32 v5, v0, v1
 ; GFX9-NEXT:    v_bfe_u32 v0, v5, 16, 1
 ; GFX9-NEXT:    v_xor_b32_e32 v1, v2, v3
@@ -32597,21 +31858,19 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) {
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v1
 ; GFX9-NEXT:    v_add_u32_e32 v0, -1, v0
 ; GFX9-NEXT:    v_add_u32_e32 v1, 32, v1
-; GFX9-NEXT:    v_min_u32_e32 v8, v0, v1
-; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v8, v[2:3]
-; GFX9-NEXT:    v_and_b32_e32 v7, 0xff800000, v5
+; GFX9-NEXT:    v_min_u32_e32 v7, v0, v1
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v7, v[2:3]
+; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v5
 ; GFX9-NEXT:    v_min_u32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, v0
-; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v7
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v6, v2, vcc
-; GFX9-NEXT:    v_sub_u32_e32 v2, 32, v8
+; GFX9-NEXT:    v_sub_u32_e32 v2, 32, v7
 ; GFX9-NEXT:    v_ldexp_f32 v0, v0, v2
 ; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xff800000, v0
 ; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v3
+; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
@@ -32638,7 +31897,6 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) {
 ; GFX10-NEXT:    v_add_nc_u32_e32 v11, -1, v11
 ; GFX10-NEXT:    v_add_nc_u32_e32 v9, 32, v9
 ; GFX10-NEXT:    v_min_u32_e32 v8, v10, v8
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
 ; GFX10-NEXT:    v_min_u32_e32 v6, v6, v7
 ; GFX10-NEXT:    v_min_u32_e32 v7, v11, v9
 ; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v8, v[0:1]
@@ -32660,13 +31918,13 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) {
 ; GFX10-NEXT:    v_ldexp_f32 v1, v1, v6
 ; GFX10-NEXT:    v_ldexp_f32 v2, v2, v4
 ; GFX10-NEXT:    v_bfe_u32 v3, v0, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v7, v0, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v7, 0x400000, v0
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX10-NEXT:    v_bfe_u32 v5, v2, 16, 1
 ; GFX10-NEXT:    v_bfe_u32 v4, v1, 16, 1
 ; GFX10-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v8, v2, s4, 0x400000
-; GFX10-NEXT:    v_and_or_b32 v6, v1, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v2
+; GFX10-NEXT:    v_or_b32_e32 v6, 0x400000, v1
 ; GFX10-NEXT:    v_add3_u32 v5, v5, v2, 0x7fff
 ; GFX10-NEXT:    v_add3_u32 v4, v4, v1, 0x7fff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v7, vcc_lo
@@ -32820,10 +32078,9 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) {
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v5, 31, v5
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, -1, v4
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 32, v5
-; GFX8-NEXT:    v_min_u32_e32 v11, v4, v5
-; GFX8-NEXT:    v_lshlrev_b64 v[4:5], v11, v[6:7]
-; GFX8-NEXT:    v_and_b32_e32 v10, 0xff800000, v8
-; GFX8-NEXT:    v_or_b32_e32 v6, 0x400000, v10
+; GFX8-NEXT:    v_min_u32_e32 v10, v4, v5
+; GFX8-NEXT:    v_lshlrev_b64 v[4:5], v10, v[6:7]
+; GFX8-NEXT:    v_or_b32_e32 v6, 0x400000, v8
 ; GFX8-NEXT:    v_min_u32_e32 v4, 1, v4
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
 ; GFX8-NEXT:    v_or_b32_e32 v4, v5, v4
@@ -32836,7 +32093,7 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) {
 ; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 32, v9
 ; GFX8-NEXT:    v_min_u32_e32 v8, v8, v9
 ; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v8, v[0:1]
-; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, 32, v11
+; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, 32, v10
 ; GFX8-NEXT:    v_ldexp_f32 v4, v4, v6
 ; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
 ; GFX8-NEXT:    v_bfe_u32 v6, v4, 16, 1
@@ -32844,8 +32101,7 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) {
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v4
 ; GFX8-NEXT:    v_cvt_f32_i32_e32 v0, v0
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, s4, v6
-; GFX8-NEXT:    v_and_b32_e32 v7, 0xff800000, v4
-; GFX8-NEXT:    v_or_b32_e32 v7, 0x400000, v7
+; GFX8-NEXT:    v_or_b32_e32 v7, 0x400000, v4
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v6, v7, vcc
 ; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, 32, v8
@@ -32858,22 +32114,20 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) {
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, -1, v0
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 32, v1
-; GFX8-NEXT:    v_min_u32_e32 v9, v0, v1
-; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v9, v[2:3]
-; GFX8-NEXT:    v_and_b32_e32 v8, 0xff800000, v6
+; GFX8-NEXT:    v_min_u32_e32 v8, v0, v1
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v8, v[2:3]
+; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v6
 ; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
 ; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    v_cvt_f32_i32_e32 v0, v0
-; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v8
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
-; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 32, v9
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 32, v8
 ; GFX8-NEXT:    v_ldexp_f32 v0, v0, v2
 ; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v3
+; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
@@ -32905,34 +32159,32 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) {
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v5, 31, v5
 ; GFX9-NEXT:    v_add_u32_e32 v4, -1, v4
 ; GFX9-NEXT:    v_add_u32_e32 v5, 32, v5
-; GFX9-NEXT:    v_min_u32_e32 v11, v4, v5
-; GFX9-NEXT:    v_lshlrev_b64 v[4:5], v11, v[6:7]
-; GFX9-NEXT:    v_and_b32_e32 v10, 0xff800000, v8
-; GFX9-NEXT:    v_or_b32_e32 v6, 0x400000, v10
-; GFX9-NEXT:    v_min_u32_e32 v4, 1, v4
+; GFX9-NEXT:    v_min_u32_e32 v10, v4, v5
+; GFX9-NEXT:    v_lshlrev_b64 v[4:5], v10, v[6:7]
+; GFX9-NEXT:    v_or_b32_e32 v6, 0x400000, v8
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
+; GFX9-NEXT:    v_xor_b32_e32 v8, v0, v1
+; GFX9-NEXT:    v_ffbh_i32_e32 v7, v1
+; GFX9-NEXT:    v_ashrrev_i32_e32 v8, 31, v8
+; GFX9-NEXT:    v_add_u32_e32 v7, -1, v7
+; GFX9-NEXT:    v_add_u32_e32 v8, 32, v8
+; GFX9-NEXT:    v_min_u32_e32 v4, 1, v4
+; GFX9-NEXT:    v_min_u32_e32 v7, v7, v8
 ; GFX9-NEXT:    v_or_b32_e32 v4, v5, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v9, v6, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v9, v0, v1
-; GFX9-NEXT:    v_ffbh_i32_e32 v8, v1
-; GFX9-NEXT:    v_ashrrev_i32_e32 v9, 31, v9
-; GFX9-NEXT:    v_add_u32_e32 v8, -1, v8
-; GFX9-NEXT:    v_add_u32_e32 v9, 32, v9
-; GFX9-NEXT:    v_min_u32_e32 v8, v8, v9
-; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v8, v[0:1]
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v7, v[0:1]
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v4, v4
 ; GFX9-NEXT:    v_min_u32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX9-NEXT:    v_sub_u32_e32 v6, 32, v11
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v9, v6, vcc
+; GFX9-NEXT:    v_sub_u32_e32 v6, 32, v10
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, v0
 ; GFX9-NEXT:    v_ldexp_f32 v4, v4, v6
 ; GFX9-NEXT:    v_bfe_u32 v6, v4, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v7, 0xff800000, v4
 ; GFX9-NEXT:    v_add3_u32 v6, v6, v4, s4
-; GFX9-NEXT:    v_or_b32_e32 v7, 0x400000, v7
+; GFX9-NEXT:    v_or_b32_e32 v8, 0x400000, v4
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
-; GFX9-NEXT:    v_sub_u32_e32 v1, 32, v8
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v6, v7, vcc
+; GFX9-NEXT:    v_sub_u32_e32 v1, 32, v7
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v6, v8, vcc
 ; GFX9-NEXT:    v_ldexp_f32 v6, v0, v1
 ; GFX9-NEXT:    v_bfe_u32 v0, v6, 16, 1
 ; GFX9-NEXT:    v_xor_b32_e32 v1, v2, v3
@@ -32941,21 +32193,19 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) {
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v1
 ; GFX9-NEXT:    v_add_u32_e32 v0, -1, v0
 ; GFX9-NEXT:    v_add_u32_e32 v1, 32, v1
-; GFX9-NEXT:    v_min_u32_e32 v9, v0, v1
-; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v9, v[2:3]
-; GFX9-NEXT:    v_and_b32_e32 v8, 0xff800000, v6
+; GFX9-NEXT:    v_min_u32_e32 v8, v0, v1
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v8, v[2:3]
+; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v6
 ; GFX9-NEXT:    v_min_u32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, v0
-; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v8
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
-; GFX9-NEXT:    v_sub_u32_e32 v2, 32, v9
+; GFX9-NEXT:    v_sub_u32_e32 v2, 32, v8
 ; GFX9-NEXT:    v_ldexp_f32 v0, v0, v2
 ; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xff800000, v0
 ; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v3
+; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
@@ -32989,7 +32239,6 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) {
 ; GFX10-NEXT:    v_lshlrev_b64 v[4:5], v8, v[4:5]
 ; GFX10-NEXT:    v_add_nc_u32_e32 v9, 32, v9
 ; GFX10-NEXT:    v_add_nc_u32_e32 v13, -1, v13
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
 ; GFX10-NEXT:    v_lshlrev_b64 v[6:7], v10, v[6:7]
 ; GFX10-NEXT:    v_min_u32_e32 v9, v12, v9
 ; GFX10-NEXT:    v_min_u32_e32 v11, v13, v14
@@ -33015,21 +32264,21 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) {
 ; GFX10-NEXT:    v_bfe_u32 v7, v2, 16, 1
 ; GFX10-NEXT:    v_ldexp_f32 v3, v3, v4
 ; GFX10-NEXT:    v_ldexp_f32 v0, v0, v5
-; GFX10-NEXT:    v_and_or_b32 v5, v2, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v2
 ; GFX10-NEXT:    v_ldexp_f32 v1, v1, v6
 ; GFX10-NEXT:    v_add3_u32 v4, v7, v2, 0x7fff
 ; GFX10-NEXT:    v_bfe_u32 v6, v3, 16, 1
 ; GFX10-NEXT:    v_bfe_u32 v7, v0, 16, 1
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
 ; GFX10-NEXT:    v_bfe_u32 v8, v1, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v9, v3, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc_lo
 ; GFX10-NEXT:    v_add3_u32 v4, v6, v3, 0x7fff
 ; GFX10-NEXT:    v_add3_u32 v5, v7, v0, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v6, v0, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v6, 0x400000, v0
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX10-NEXT:    v_add3_u32 v7, v8, v1, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v8, v1, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v5, v6, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v7, v8, vcc_lo
@@ -33065,16 +32314,15 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) {
 ; GFX11-NEXT:    v_lshlrev_b64 v[4:5], v8, v[4:5]
 ; GFX11-NEXT:    v_add_nc_u32_e32 v9, 32, v9
 ; GFX11-NEXT:    v_add_nc_u32_e32 v13, -1, v13
-; GFX11-NEXT:    s_mov_b32 s0, 0xff800000
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_lshlrev_b64 v[6:7], v10, v[6:7]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_min_u32_e32 v9, v12, v9
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_min_u32_e32 v11, v13, v14
 ; GFX11-NEXT:    v_min_u32_e32 v4, 1, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v9, v[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_lshlrev_b64 v[2:3], v11, v[2:3]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-NEXT:    v_or_b32_e32 v4, v5, v4
 ; GFX11-NEXT:    v_min_u32_e32 v5, 1, v6
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v6, 32, v8
@@ -33096,21 +32344,21 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) {
 ; GFX11-NEXT:    v_bfe_u32 v7, v2, 16, 1
 ; GFX11-NEXT:    v_ldexp_f32 v3, v3, v4
 ; GFX11-NEXT:    v_ldexp_f32 v0, v0, v5
-; GFX11-NEXT:    v_and_or_b32 v5, v2, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v2
 ; GFX11-NEXT:    v_ldexp_f32 v1, v1, v6
 ; GFX11-NEXT:    v_add3_u32 v4, v7, v2, 0x7fff
 ; GFX11-NEXT:    v_bfe_u32 v6, v3, 16, 1
 ; GFX11-NEXT:    v_bfe_u32 v7, v0, 16, 1
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
 ; GFX11-NEXT:    v_bfe_u32 v8, v1, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v9, v3, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v3
 ; GFX11-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc_lo
 ; GFX11-NEXT:    v_add3_u32 v4, v6, v3, 0x7fff
 ; GFX11-NEXT:    v_add3_u32 v5, v7, v0, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v6, v0, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v6, 0x400000, v0
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX11-NEXT:    v_add3_u32 v7, v8, v1, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v8, v1, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v5, v6, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
@@ -33148,9 +32396,8 @@ define bfloat @v_uitofp_i16_to_bf16(i16 %x) {
 ; GFX8-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
 ; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xff800000, v0
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -33162,9 +32409,8 @@ define bfloat @v_uitofp_i16_to_bf16(i16 %x) {
 ; GFX9-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xff800000, v0
+; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v2
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -33174,9 +32420,8 @@ define bfloat @v_uitofp_i16_to_bf16(i16 %x) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
 ; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v2, v0, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
@@ -33187,11 +32432,10 @@ define bfloat @v_uitofp_i16_to_bf16(i16 %x) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    s_mov_b32 s0, 0xff800000
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v2, v0, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
@@ -33233,16 +32477,14 @@ define <2 x bfloat> @v_uitofp_v2i16_to_v2bf16(<2 x i16> %x) {
 ; GFX8-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX8-NEXT:    v_bfe_u32 v2, v1, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v1
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xff800000, v1
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
-; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v3
+; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v1
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
 ; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v3
+; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -33256,15 +32498,13 @@ define <2 x bfloat> @v_uitofp_v2i16_to_v2bf16(<2 x i16> %x) {
 ; GFX9-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX9-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xff800000, v1
+; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v1
 ; GFX9-NEXT:    v_add3_u32 v2, v2, v1, s4
-; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v3
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
 ; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xff800000, v0
 ; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v3
+; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
@@ -33276,12 +32516,11 @@ define <2 x bfloat> @v_uitofp_v2i16_to_v2bf16(<2 x i16> %x) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
 ; GFX10-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
 ; GFX10-NEXT:    v_bfe_u32 v2, v1, 16, 1
 ; GFX10-NEXT:    v_bfe_u32 v3, v0, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v4, v1, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v4, 0x400000, v1
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT:    v_and_or_b32 v5, v0, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v0
 ; GFX10-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
 ; GFX10-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
@@ -33295,16 +32534,15 @@ define <2 x bfloat> @v_uitofp_v2i16_to_v2bf16(<2 x i16> %x) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:    s_mov_b32 s0, 0xff800000
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cvt_f32_u32_e32 v1, v1
 ; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_bfe_u32 v2, v1, 16, 1
 ; GFX11-NEXT:    v_bfe_u32 v3, v0, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v4, v1, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v4, 0x400000, v1
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT:    v_and_or_b32 v5, v0, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v0
 ; GFX11-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
 ; GFX11-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
@@ -33357,22 +32595,19 @@ define <3 x bfloat> @v_uitofp_v3i16_to_v3bf16(<3 x i16> %x) {
 ; GFX8-NEXT:    v_bfe_u32 v2, v1, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v1
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xff800000, v1
-; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v3
+; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v1
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
 ; GFX8-NEXT:    v_bfe_u32 v2, v4, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s4, v2
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xff800000, v4
-; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v3
+; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v4
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX8-NEXT:    v_bfe_u32 v3, v0, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v0
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -33384,25 +32619,22 @@ define <3 x bfloat> @v_uitofp_v3i16_to_v3bf16(<3 x i16> %x) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX9-NEXT:    v_cvt_f32_u32_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX9-NEXT:    v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX9-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xff800000, v1
-; GFX9-NEXT:    v_add3_u32 v2, v2, v1, s4
-; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v3
+; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
+; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v1
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT:    v_bfe_u32 v2, v4, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xff800000, v4
-; GFX9-NEXT:    v_add3_u32 v2, v2, v4, s4
-; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v3
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX9-NEXT:    v_add3_u32 v3, v3, v2, s4
+; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v2
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
 ; GFX9-NEXT:    v_bfe_u32 v3, v0, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xff800000, v0
 ; GFX9-NEXT:    v_add3_u32 v3, v3, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v4
+; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
@@ -33415,17 +32647,16 @@ define <3 x bfloat> @v_uitofp_v3i16_to_v3bf16(<3 x i16> %x) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
 ; GFX10-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
 ; GFX10-NEXT:    v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
 ; GFX10-NEXT:    v_bfe_u32 v3, v2, 16, 1
 ; GFX10-NEXT:    v_bfe_u32 v5, v0, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v7, v2, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v7, 0x400000, v2
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
 ; GFX10-NEXT:    v_bfe_u32 v4, v1, 16, 1
 ; GFX10-NEXT:    v_add3_u32 v3, v3, v2, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v8, v0, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v0
 ; GFX10-NEXT:    v_add3_u32 v5, v5, v0, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v6, v1, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v6, 0x400000, v1
 ; GFX10-NEXT:    v_add3_u32 v4, v4, v1, 0x7fff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v3, v7, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
@@ -33484,30 +32715,26 @@ define <4 x bfloat> @v_uitofp_v4i16_to_v4bf16(<4 x i16> %x) {
 ; GFX8-NEXT:    v_bfe_u32 v3, v2, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v2
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xff800000, v2
-; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v2
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
 ; GFX8-NEXT:    v_bfe_u32 v3, v1, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v1
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, s4, v3
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xff800000, v1
-; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v1
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
 ; GFX8-NEXT:    v_bfe_u32 v3, v5, 16, 1
 ; GFX8-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, s4, v3
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xff800000, v5
-; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v5
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX8-NEXT:    v_bfe_u32 v4, v0, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v0
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7fff, v4
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
@@ -33520,32 +32747,28 @@ define <4 x bfloat> @v_uitofp_v4i16_to_v4bf16(<4 x i16> %x) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX9-NEXT:    v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_cvt_f32_u32_sdwa v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX9-NEXT:    v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX9-NEXT:    v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xff800000, v2
 ; GFX9-NEXT:    v_add3_u32 v3, v3, v2, s4
-; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v4
+; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v2
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xff800000, v1
+; GFX9-NEXT:    v_cvt_f32_u32_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
 ; GFX9-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
 ; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v4
+; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v1
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT:    v_bfe_u32 v3, v5, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xff800000, v5
-; GFX9-NEXT:    v_add3_u32 v3, v3, v5, s4
-; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v4
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc
+; GFX9-NEXT:    v_bfe_u32 v3, v4, 16, 1
+; GFX9-NEXT:    v_add3_u32 v3, v3, v4, s4
+; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v4
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX9-NEXT:    v_bfe_u32 v4, v0, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xff800000, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
 ; GFX9-NEXT:    v_add3_u32 v4, v4, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
@@ -33558,23 +32781,22 @@ define <4 x bfloat> @v_uitofp_v4i16_to_v4bf16(<4 x i16> %x) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
 ; GFX10-NEXT:    v_cvt_f32_u32_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
 ; GFX10-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX10-NEXT:    v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX10-NEXT:    v_bfe_u32 v4, v2, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v5, v2, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v2
 ; GFX10-NEXT:    v_bfe_u32 v8, v3, 16, 1
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX10-NEXT:    v_and_or_b32 v9, v3, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v3
 ; GFX10-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
 ; GFX10-NEXT:    v_bfe_u32 v10, v0, 16, 1
 ; GFX10-NEXT:    v_add3_u32 v8, v8, v3, 0x7fff
 ; GFX10-NEXT:    v_bfe_u32 v6, v1, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v11, v0, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v11, 0x400000, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX10-NEXT:    v_add3_u32 v10, v10, v0, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v7, v1, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v7, 0x400000, v1
 ; GFX10-NEXT:    v_add3_u32 v6, v6, v1, 0x7fff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v8, v9, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
@@ -33589,17 +32811,16 @@ define <4 x bfloat> @v_uitofp_v4i16_to_v4bf16(<4 x i16> %x) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v1
-; GFX11-NEXT:    s_mov_b32 s0, 0xff800000
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cvt_f32_u32_e32 v2, v2
 ; GFX11-NEXT:    v_cvt_f32_u32_e32 v1, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_bfe_u32 v4, v2, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v5, v2, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v2
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
 ; GFX11-NEXT:    v_bfe_u32 v6, v1, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v7, v1, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v7, 0x400000, v1
 ; GFX11-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
 ; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -33610,11 +32831,11 @@ define <4 x bfloat> @v_uitofp_v4i16_to_v4bf16(<4 x i16> %x) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; GFX11-NEXT:    v_bfe_u32 v8, v3, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v9, v3, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_bfe_u32 v10, v0, 16, 1
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT:    v_and_or_b32 v11, v0, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v11, 0x400000, v0
 ; GFX11-NEXT:    v_add3_u32 v8, v8, v3, 0x7fff
 ; GFX11-NEXT:    v_add3_u32 v10, v10, v0, 0x7fff
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
@@ -33652,9 +32873,8 @@ define bfloat @v_uitofp_i32_to_bf16(i32 %x) {
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xff800000, v0
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
-; GFX8-NEXT:    v_or_b32_e32 v1, 0x400000, v1
+; GFX8-NEXT:    v_or_b32_e32 v1, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -33666,9 +32886,8 @@ define bfloat @v_uitofp_i32_to_bf16(i32 %x) {
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xff800000, v0
+; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v2
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -33678,9 +32897,8 @@ define bfloat @v_uitofp_i32_to_bf16(i32 %x) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, v0
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
 ; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v2, v0, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
@@ -33691,10 +32909,9 @@ define bfloat @v_uitofp_i32_to_bf16(i32 %x) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
-; GFX11-NEXT:    s_mov_b32 s0, 0xff800000
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v2, v0, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -33731,16 +32948,14 @@ define <2 x bfloat> @v_uitofp_v2i32_to_v2bf16(<2 x i32> %x) {
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, v1
 ; GFX8-NEXT:    v_bfe_u32 v3, v0, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v0
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xff800000, v0
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
-; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX8-NEXT:    v_bfe_u32 v3, v1, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v1
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xff800000, v1
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
-; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v1
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
@@ -33754,15 +32969,13 @@ define <2 x bfloat> @v_uitofp_v2i32_to_v2bf16(<2 x i32> %x) {
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, v1
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xff800000, v0
 ; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v3
+; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
 ; GFX9-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xff800000, v1
 ; GFX9-NEXT:    v_add3_u32 v2, v2, v1, s4
-; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v3
+; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v1
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
@@ -33774,12 +32987,11 @@ define <2 x bfloat> @v_uitofp_v2i32_to_v2bf16(<2 x i32> %x) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, v1
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
 ; GFX10-NEXT:    v_bfe_u32 v2, v0, 16, 1
 ; GFX10-NEXT:    v_bfe_u32 v3, v1, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v4, v0, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v4, 0x400000, v0
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT:    v_and_or_b32 v5, v1, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v1
 ; GFX10-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
 ; GFX10-NEXT:    v_add3_u32 v3, v3, v1, 0x7fff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc_lo
@@ -33793,13 +33005,12 @@ define <2 x bfloat> @v_uitofp_v2i32_to_v2bf16(<2 x i32> %x) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; GFX11-NEXT:    v_cvt_f32_u32_e32 v1, v1
-; GFX11-NEXT:    s_mov_b32 s0, 0xff800000
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_bfe_u32 v2, v0, 16, 1
 ; GFX11-NEXT:    v_bfe_u32 v3, v1, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v4, v0, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v4, 0x400000, v0
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    v_and_or_b32 v5, v1, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v1
 ; GFX11-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
 ; GFX11-NEXT:    v_add3_u32 v3, v3, v1, 0x7fff
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
@@ -33844,23 +33055,20 @@ define <3 x bfloat> @v_uitofp_v3i32_to_v3bf16(<3 x i32> %x) {
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, v1
 ; GFX8-NEXT:    v_bfe_u32 v4, v2, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v2
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xff800000, v2
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7fff, v4
-; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v3
+; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v2
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v3, vcc
 ; GFX8-NEXT:    v_bfe_u32 v4, v0, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v0
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xff800000, v0
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7fff, v4
-; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v3
+; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
 ; GFX8-NEXT:    v_bfe_u32 v4, v1, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v1
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xff800000, v1
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7fff, v4
-; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v3
+; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v1
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
@@ -33877,21 +33085,18 @@ define <3 x bfloat> @v_uitofp_v3i32_to_v3bf16(<3 x i32> %x) {
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, v1
 ; GFX9-NEXT:    v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xff800000, v2
 ; GFX9-NEXT:    v_add3_u32 v3, v3, v2, s4
-; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v4
+; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v2
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
 ; GFX9-NEXT:    v_bfe_u32 v3, v0, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xff800000, v0
 ; GFX9-NEXT:    v_add3_u32 v3, v3, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v4
+; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
 ; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xff800000, v1
 ; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v4
+; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v1
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
@@ -33904,24 +33109,23 @@ define <3 x bfloat> @v_uitofp_v3i32_to_v3bf16(<3 x i32> %x) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, v1
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v2, v2
 ; GFX10-NEXT:    v_bfe_u32 v3, v0, 16, 1
 ; GFX10-NEXT:    v_bfe_u32 v5, v1, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v7, v0, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v7, 0x400000, v0
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT:    v_bfe_u32 v6, v2, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v4, v2, 16, 1
 ; GFX10-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v8, v1, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v1
 ; GFX10-NEXT:    v_add3_u32 v5, v5, v1, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v4, v2, s4, 0x400000
-; GFX10-NEXT:    v_add3_u32 v6, v6, v2, 0x7fff
+; GFX10-NEXT:    v_or_b32_e32 v6, 0x400000, v2
+; GFX10-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v7, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v8, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
 ; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v6, v4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v4, v6, vcc_lo
 ; GFX10-NEXT:    v_alignbit_b32 v1, s4, v2, 16
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %op = uitofp <3 x i32> %x to <3 x bfloat>
@@ -33964,31 +33168,27 @@ define <4 x bfloat> @v_uitofp_v4i32_to_v4bf16(<4 x i32> %x) {
 ; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX8-NEXT:    v_bfe_u32 v5, v2, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v2
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xff800000, v2
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x7fff, v5
-; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v2
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v5, v4, vcc
 ; GFX8-NEXT:    v_bfe_u32 v5, v3, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v3
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xff800000, v3
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s4, v5
-; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v3
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v5, v4, vcc
 ; GFX8-NEXT:    v_bfe_u32 v5, v0, 16, 1
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, v1
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v0
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xff800000, v0
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s4, v5
-; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x7fff, v5
+; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
 ; GFX8-NEXT:    v_bfe_u32 v5, v1, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v1
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xff800000, v1
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x7fff, v5
-; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v1
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
@@ -34005,28 +33205,24 @@ define <4 x bfloat> @v_uitofp_v4i32_to_v4bf16(<4 x i32> %x) {
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; GFX9-NEXT:    v_bfe_u32 v4, v2, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xff800000, v2
 ; GFX9-NEXT:    v_add3_u32 v4, v4, v2, s4
-; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v2
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc
 ; GFX9-NEXT:    v_bfe_u32 v4, v3, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xff800000, v3
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, v1
 ; GFX9-NEXT:    v_add3_u32 v4, v4, v3, s4
-; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v3
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
 ; GFX9-NEXT:    v_bfe_u32 v4, v0, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xff800000, v0
 ; GFX9-NEXT:    v_add3_u32 v4, v4, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
 ; GFX9-NEXT:    v_bfe_u32 v4, v1, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xff800000, v1
 ; GFX9-NEXT:    v_add3_u32 v4, v4, v1, s4
-; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v1
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v5, vcc
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
@@ -34039,30 +33235,29 @@ define <4 x bfloat> @v_uitofp_v4i32_to_v4bf16(<4 x i32> %x) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v2, v2
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, v0
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, v1
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v3, v3
-; GFX10-NEXT:    v_bfe_u32 v5, v2, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v4, v2, s4, 0x400000
-; GFX10-NEXT:    v_bfe_u32 v8, v0, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v4, v2, 16, 1
+; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v2
+; GFX10-NEXT:    v_bfe_u32 v7, v0, 16, 1
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX10-NEXT:    v_and_or_b32 v9, v0, s4, 0x400000
-; GFX10-NEXT:    v_add3_u32 v5, v5, v2, 0x7fff
-; GFX10-NEXT:    v_bfe_u32 v10, v1, 16, 1
-; GFX10-NEXT:    v_add3_u32 v8, v8, v0, 0x7fff
-; GFX10-NEXT:    v_bfe_u32 v7, v3, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v11, v1, s4, 0x400000
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v5, v4, vcc_lo
+; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v0
+; GFX10-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v9, v1, 16, 1
+; GFX10-NEXT:    v_add3_u32 v7, v7, v0, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v6, v3, 16, 1
+; GFX10-NEXT:    v_or_b32_e32 v10, 0x400000, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT:    v_add3_u32 v4, v10, v1, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v6, v3, s4, 0x400000
-; GFX10-NEXT:    v_add3_u32 v5, v7, v3, 0x7fff
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v8, v9, vcc_lo
+; GFX10-NEXT:    v_add3_u32 v9, v9, v1, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v6, v6, v3, 0x7fff
+; GFX10-NEXT:    v_or_b32_e32 v4, 0x400000, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v7, v8, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v11, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v9, v10, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v6, v4, vcc_lo
 ; GFX10-NEXT:    v_perm_b32 v1, v3, v2, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -34071,32 +33266,32 @@ define <4 x bfloat> @v_uitofp_v4i32_to_v4bf16(<4 x i32> %x) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_u32_e32 v2, v2
 ; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
-; GFX11-NEXT:    s_mov_b32 s0, 0xff800000
-; GFX11-NEXT:    v_cvt_f32_u32_e32 v3, v3
 ; GFX11-NEXT:    v_cvt_f32_u32_e32 v1, v1
-; GFX11-NEXT:    v_bfe_u32 v5, v2, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v4, v2, s0, 0x400000
-; GFX11-NEXT:    v_bfe_u32 v8, v0, 16, 1
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v3, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_bfe_u32 v4, v2, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v2
+; GFX11-NEXT:    v_bfe_u32 v7, v0, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v9, v1, 16, 1
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT:    v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT:    v_add3_u32 v5, v5, v2, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v9, v0, s0, 0x400000
-; GFX11-NEXT:    v_bfe_u32 v10, v1, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v11, v1, s0, 0x400000
-; GFX11-NEXT:    v_add3_u32 v8, v8, v0, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v5, v4, vcc_lo
+; GFX11-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
+; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v0
+; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v1
+; GFX11-NEXT:    v_add3_u32 v7, v7, v0, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v9, v9, v1, 0x7fff
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    v_and_or_b32 v6, v3, s0, 0x400000
-; GFX11-NEXT:    v_add3_u32 v4, v10, v1, 0x7fff
-; GFX11-NEXT:    v_add3_u32 v5, v7, v3, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v8, v9, vcc_lo
+; GFX11-NEXT:    v_bfe_u32 v6, v3, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v4, 0x400000, v3
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v7, v8, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v4, v11, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v9, v10, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc_lo
 ; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, v6, v4, vcc_lo
 ; GFX11-NEXT:    v_perm_b32 v1, v3, v2, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = uitofp <4 x i32> %x to <4 x bfloat>
@@ -34146,8 +33341,7 @@ define bfloat @v_uitofp_i64_to_bf16(i64 %x) {
 ; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -34166,9 +33360,8 @@ define bfloat @v_uitofp_i64_to_bf16(i64 %x) {
 ; GFX9-NEXT:    v_sub_u32_e32 v1, 32, v2
 ; GFX9-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xff800000, v0
+; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v2
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -34178,7 +33371,6 @@ define bfloat @v_uitofp_i64_to_bf16(i64 %x) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_ffbh_u32_e32 v2, v1
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
 ; GFX10-NEXT:    v_min_u32_e32 v2, 32, v2
 ; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
 ; GFX10-NEXT:    v_min_u32_e32 v0, 1, v0
@@ -34187,7 +33379,7 @@ define bfloat @v_uitofp_i64_to_bf16(i64 %x) {
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; GFX10-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v2, v0, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
@@ -34198,7 +33390,6 @@ define bfloat @v_uitofp_i64_to_bf16(i64 %x) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_clz_i32_u32_e32 v2, v1
-; GFX11-NEXT:    s_mov_b32 s0, 0xff800000
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_min_u32_e32 v2, 32, v2
 ; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
@@ -34211,7 +33402,7 @@ define bfloat @v_uitofp_i64_to_bf16(i64 %x) {
 ; GFX11-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v2, v0, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -34284,22 +33475,20 @@ define <2 x bfloat> @v_uitofp_v2i64_to_v2bf16(<2 x i64> %x) {
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v4
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x7fff, v0
 ; GFX8-NEXT:    v_ffbh_u32_e32 v0, v3
-; GFX8-NEXT:    v_min_u32_e32 v7, 32, v0
-; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v7, v[2:3]
-; GFX8-NEXT:    v_and_b32_e32 v6, 0xff800000, v4
+; GFX8-NEXT:    v_min_u32_e32 v6, 32, v0
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v6, v[2:3]
+; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v4
 ; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
 ; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, v0
-; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v6
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
-; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 32, v7
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 32, v6
 ; GFX8-NEXT:    v_ldexp_f32 v0, v0, v2
 ; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v3
+; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -34321,21 +33510,19 @@ define <2 x bfloat> @v_uitofp_v2i64_to_v2bf16(<2 x i64> %x) {
 ; GFX9-NEXT:    v_bfe_u32 v0, v4, 16, 1
 ; GFX9-NEXT:    v_add3_u32 v5, v0, v4, s4
 ; GFX9-NEXT:    v_ffbh_u32_e32 v0, v3
-; GFX9-NEXT:    v_min_u32_e32 v7, 32, v0
-; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v7, v[2:3]
-; GFX9-NEXT:    v_and_b32_e32 v6, 0xff800000, v4
+; GFX9-NEXT:    v_min_u32_e32 v6, 32, v0
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v6, v[2:3]
+; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v4
 ; GFX9-NEXT:    v_min_u32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, v0
-; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v6
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
-; GFX9-NEXT:    v_sub_u32_e32 v2, 32, v7
+; GFX9-NEXT:    v_sub_u32_e32 v2, 32, v6
 ; GFX9-NEXT:    v_ldexp_f32 v0, v0, v2
 ; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xff800000, v0
 ; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v3
+; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
@@ -34347,7 +33534,6 @@ define <2 x bfloat> @v_uitofp_v2i64_to_v2bf16(<2 x i64> %x) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_ffbh_u32_e32 v4, v1
 ; GFX10-NEXT:    v_ffbh_u32_e32 v5, v3
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
 ; GFX10-NEXT:    v_min_u32_e32 v4, 32, v4
 ; GFX10-NEXT:    v_min_u32_e32 v5, 32, v5
 ; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v4, v[0:1]
@@ -34364,9 +33550,9 @@ define <2 x bfloat> @v_uitofp_v2i64_to_v2bf16(<2 x i64> %x) {
 ; GFX10-NEXT:    v_ldexp_f32 v1, v1, v3
 ; GFX10-NEXT:    v_bfe_u32 v2, v0, 16, 1
 ; GFX10-NEXT:    v_bfe_u32 v3, v1, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v4, v0, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v4, 0x400000, v0
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT:    v_and_or_b32 v5, v1, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v1
 ; GFX10-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
 ; GFX10-NEXT:    v_add3_u32 v3, v3, v1, 0x7fff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc_lo
@@ -34380,7 +33566,6 @@ define <2 x bfloat> @v_uitofp_v2i64_to_v2bf16(<2 x i64> %x) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_clz_i32_u32_e32 v4, v1
 ; GFX11-NEXT:    v_clz_i32_u32_e32 v5, v3
-; GFX11-NEXT:    s_mov_b32 s0, 0xff800000
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_min_u32_e32 v4, 32, v4
 ; GFX11-NEXT:    v_min_u32_e32 v5, 32, v5
@@ -34404,9 +33589,9 @@ define <2 x bfloat> @v_uitofp_v2i64_to_v2bf16(<2 x i64> %x) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_bfe_u32 v2, v0, 16, 1
 ; GFX11-NEXT:    v_bfe_u32 v3, v1, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v4, v0, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v4, 0x400000, v0
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    v_and_or_b32 v5, v1, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v1
 ; GFX11-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
 ; GFX11-NEXT:    v_add3_u32 v3, v3, v1, 0x7fff
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
@@ -34503,16 +33688,15 @@ define <3 x bfloat> @v_uitofp_v3i64_to_v3bf16(<3 x i64> %x) {
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v4
 ; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s4, v5
-; GFX8-NEXT:    v_and_b32_e32 v6, 0xff800000, v4
 ; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
-; GFX8-NEXT:    v_or_b32_e32 v6, 0x400000, v6
+; GFX8-NEXT:    v_or_b32_e32 v6, 0x400000, v4
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v5, v6, vcc
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, v0
-; GFX8-NEXT:    v_ffbh_u32_e32 v6, v3
-; GFX8-NEXT:    v_min_u32_e32 v6, 32, v6
-; GFX8-NEXT:    v_lshlrev_b64 v[2:3], v6, v[2:3]
+; GFX8-NEXT:    v_ffbh_u32_e32 v5, v3
+; GFX8-NEXT:    v_min_u32_e32 v5, 32, v5
+; GFX8-NEXT:    v_lshlrev_b64 v[2:3], v5, v[2:3]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
 ; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, 32, v7
 ; GFX8-NEXT:    v_ldexp_f32 v0, v0, v4
@@ -34522,17 +33706,15 @@ define <3 x bfloat> @v_uitofp_v3i64_to_v3bf16(<3 x i64> %x) {
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v0
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v2, v2
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, s4, v4
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GFX8-NEXT:    v_or_b32_e32 v6, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
-; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, 32, v6
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc
+; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, 32, v5
 ; GFX8-NEXT:    v_ldexp_f32 v2, v2, v3
 ; GFX8-NEXT:    v_bfe_u32 v3, v2, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v2
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xff800000, v2
-; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v2
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
@@ -34545,44 +33727,41 @@ define <3 x bfloat> @v_uitofp_v3i64_to_v3bf16(<3 x i64> %x) {
 ; GFX9-NEXT:    v_ffbh_u32_e32 v6, v5
 ; GFX9-NEXT:    v_min_u32_e32 v6, 32, v6
 ; GFX9-NEXT:    v_lshlrev_b64 v[4:5], v6, v[4:5]
-; GFX9-NEXT:    v_ffbh_u32_e32 v7, v1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX9-NEXT:    v_min_u32_e32 v4, 1, v4
-; GFX9-NEXT:    v_min_u32_e32 v7, 32, v7
 ; GFX9-NEXT:    v_or_b32_e32 v4, v5, v4
-; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v7, v[0:1]
+; GFX9-NEXT:    v_sub_u32_e32 v5, 32, v6
+; GFX9-NEXT:    v_ffbh_u32_e32 v6, v1
+; GFX9-NEXT:    v_min_u32_e32 v6, 32, v6
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v6, v[0:1]
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, v4
 ; GFX9-NEXT:    v_min_u32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX9-NEXT:    v_sub_u32_e32 v5, 32, v6
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; GFX9-NEXT:    v_ldexp_f32 v4, v4, v5
 ; GFX9-NEXT:    v_bfe_u32 v5, v4, 16, 1
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_and_b32_e32 v6, 0xff800000, v4
 ; GFX9-NEXT:    v_add3_u32 v5, v5, v4, s4
-; GFX9-NEXT:    v_or_b32_e32 v6, 0x400000, v6
+; GFX9-NEXT:    v_or_b32_e32 v7, 0x400000, v4
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
-; GFX9-NEXT:    v_sub_u32_e32 v1, 32, v7
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v6, vcc
+; GFX9-NEXT:    v_sub_u32_e32 v1, 32, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v7, vcc
 ; GFX9-NEXT:    v_ldexp_f32 v5, v0, v1
 ; GFX9-NEXT:    v_bfe_u32 v0, v5, 16, 1
 ; GFX9-NEXT:    v_add3_u32 v6, v0, v5, s4
 ; GFX9-NEXT:    v_ffbh_u32_e32 v0, v3
-; GFX9-NEXT:    v_min_u32_e32 v8, 32, v0
-; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v8, v[2:3]
-; GFX9-NEXT:    v_and_b32_e32 v7, 0xff800000, v5
+; GFX9-NEXT:    v_min_u32_e32 v7, 32, v0
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v7, v[2:3]
+; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v5
 ; GFX9-NEXT:    v_min_u32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, v0
-; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v7
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v6, v2, vcc
-; GFX9-NEXT:    v_sub_u32_e32 v2, 32, v8
+; GFX9-NEXT:    v_sub_u32_e32 v2, 32, v7
 ; GFX9-NEXT:    v_ldexp_f32 v0, v0, v2
 ; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xff800000, v0
 ; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v3
+; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
@@ -34596,7 +33775,6 @@ define <3 x bfloat> @v_uitofp_v3i64_to_v3bf16(<3 x i64> %x) {
 ; GFX10-NEXT:    v_ffbh_u32_e32 v6, v1
 ; GFX10-NEXT:    v_ffbh_u32_e32 v8, v3
 ; GFX10-NEXT:    v_ffbh_u32_e32 v7, v5
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
 ; GFX10-NEXT:    v_min_u32_e32 v6, 32, v6
 ; GFX10-NEXT:    v_min_u32_e32 v8, 32, v8
 ; GFX10-NEXT:    v_min_u32_e32 v7, 32, v7
@@ -34620,13 +33798,13 @@ define <3 x bfloat> @v_uitofp_v3i64_to_v3bf16(<3 x i64> %x) {
 ; GFX10-NEXT:    v_ldexp_f32 v1, v1, v7
 ; GFX10-NEXT:    v_bfe_u32 v3, v0, 16, 1
 ; GFX10-NEXT:    v_bfe_u32 v5, v2, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v7, v0, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v7, 0x400000, v0
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX10-NEXT:    v_bfe_u32 v4, v1, 16, 1
 ; GFX10-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v8, v2, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v2
 ; GFX10-NEXT:    v_add3_u32 v5, v5, v2, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v6, v1, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v6, 0x400000, v1
 ; GFX10-NEXT:    v_add3_u32 v4, v4, v1, 0x7fff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v7, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
@@ -34739,19 +33917,18 @@ define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) {
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v8
 ; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s4, v4
 ; GFX8-NEXT:    v_ffbh_u32_e32 v4, v7
-; GFX8-NEXT:    v_min_u32_e32 v11, 32, v4
-; GFX8-NEXT:    v_lshlrev_b64 v[4:5], v11, v[6:7]
-; GFX8-NEXT:    v_and_b32_e32 v10, 0xff800000, v8
+; GFX8-NEXT:    v_min_u32_e32 v10, 32, v4
+; GFX8-NEXT:    v_lshlrev_b64 v[4:5], v10, v[6:7]
+; GFX8-NEXT:    v_or_b32_e32 v6, 0x400000, v8
 ; GFX8-NEXT:    v_min_u32_e32 v4, 1, v4
 ; GFX8-NEXT:    v_or_b32_e32 v4, v5, v4
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v4, v4
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
 ; GFX8-NEXT:    v_ffbh_u32_e32 v8, v1
 ; GFX8-NEXT:    v_min_u32_e32 v8, 32, v8
-; GFX8-NEXT:    v_or_b32_e32 v6, 0x400000, v10
 ; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v8, v[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v9, v6, vcc
-; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, 32, v11
+; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, 32, v10
 ; GFX8-NEXT:    v_ldexp_f32 v4, v4, v6
 ; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
 ; GFX8-NEXT:    v_bfe_u32 v6, v4, 16, 1
@@ -34759,8 +33936,7 @@ define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) {
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v4
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, s4, v6
-; GFX8-NEXT:    v_and_b32_e32 v7, 0xff800000, v4
-; GFX8-NEXT:    v_or_b32_e32 v7, 0x400000, v7
+; GFX8-NEXT:    v_or_b32_e32 v7, 0x400000, v4
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v6, v7, vcc
 ; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, 32, v8
@@ -34769,22 +33945,20 @@ define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) {
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v6
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, s4, v0
 ; GFX8-NEXT:    v_ffbh_u32_e32 v0, v3
-; GFX8-NEXT:    v_min_u32_e32 v9, 32, v0
-; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v9, v[2:3]
-; GFX8-NEXT:    v_and_b32_e32 v8, 0xff800000, v6
+; GFX8-NEXT:    v_min_u32_e32 v8, 32, v0
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v8, v[2:3]
+; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v6
 ; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
 ; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, v0
-; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v8
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
-; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 32, v9
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 32, v8
 ; GFX8-NEXT:    v_ldexp_f32 v0, v0, v2
 ; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v3
+; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
@@ -34808,49 +33982,45 @@ define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) {
 ; GFX9-NEXT:    v_bfe_u32 v4, v8, 16, 1
 ; GFX9-NEXT:    v_add3_u32 v9, v4, v8, s4
 ; GFX9-NEXT:    v_ffbh_u32_e32 v4, v7
-; GFX9-NEXT:    v_min_u32_e32 v11, 32, v4
-; GFX9-NEXT:    v_lshlrev_b64 v[4:5], v11, v[6:7]
-; GFX9-NEXT:    v_and_b32_e32 v10, 0xff800000, v8
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
-; GFX9-NEXT:    v_ffbh_u32_e32 v8, v1
+; GFX9-NEXT:    v_min_u32_e32 v10, 32, v4
+; GFX9-NEXT:    v_lshlrev_b64 v[4:5], v10, v[6:7]
+; GFX9-NEXT:    v_ffbh_u32_e32 v7, v1
 ; GFX9-NEXT:    v_min_u32_e32 v4, 1, v4
-; GFX9-NEXT:    v_min_u32_e32 v8, 32, v8
+; GFX9-NEXT:    v_min_u32_e32 v7, 32, v7
 ; GFX9-NEXT:    v_or_b32_e32 v4, v5, v4
-; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v8, v[0:1]
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v7, v[0:1]
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, v4
 ; GFX9-NEXT:    v_min_u32_e32 v0, 1, v0
-; GFX9-NEXT:    v_or_b32_e32 v6, 0x400000, v10
+; GFX9-NEXT:    v_or_b32_e32 v6, 0x400000, v8
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
 ; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v9, v6, vcc
-; GFX9-NEXT:    v_sub_u32_e32 v6, 32, v11
+; GFX9-NEXT:    v_sub_u32_e32 v6, 32, v10
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; GFX9-NEXT:    v_ldexp_f32 v4, v4, v6
 ; GFX9-NEXT:    v_bfe_u32 v6, v4, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v7, 0xff800000, v4
 ; GFX9-NEXT:    v_add3_u32 v6, v6, v4, s4
-; GFX9-NEXT:    v_or_b32_e32 v7, 0x400000, v7
+; GFX9-NEXT:    v_or_b32_e32 v8, 0x400000, v4
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
-; GFX9-NEXT:    v_sub_u32_e32 v1, 32, v8
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v6, v7, vcc
+; GFX9-NEXT:    v_sub_u32_e32 v1, 32, v7
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v6, v8, vcc
 ; GFX9-NEXT:    v_ldexp_f32 v6, v0, v1
 ; GFX9-NEXT:    v_bfe_u32 v0, v6, 16, 1
 ; GFX9-NEXT:    v_add3_u32 v7, v0, v6, s4
 ; GFX9-NEXT:    v_ffbh_u32_e32 v0, v3
-; GFX9-NEXT:    v_min_u32_e32 v9, 32, v0
-; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v9, v[2:3]
-; GFX9-NEXT:    v_and_b32_e32 v8, 0xff800000, v6
+; GFX9-NEXT:    v_min_u32_e32 v8, 32, v0
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v8, v[2:3]
+; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v6
 ; GFX9-NEXT:    v_min_u32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, v0
-; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v8
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
-; GFX9-NEXT:    v_sub_u32_e32 v2, 32, v9
+; GFX9-NEXT:    v_sub_u32_e32 v2, 32, v8
 ; GFX9-NEXT:    v_ldexp_f32 v0, v0, v2
 ; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xff800000, v0
 ; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v3
+; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
@@ -34865,7 +34035,6 @@ define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) {
 ; GFX10-NEXT:    v_ffbh_u32_e32 v10, v1
 ; GFX10-NEXT:    v_ffbh_u32_e32 v11, v3
 ; GFX10-NEXT:    v_ffbh_u32_e32 v9, v7
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
 ; GFX10-NEXT:    v_min_u32_e32 v8, 32, v8
 ; GFX10-NEXT:    v_min_u32_e32 v10, 32, v10
 ; GFX10-NEXT:    v_min_u32_e32 v11, 32, v11
@@ -34894,27 +34063,27 @@ define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) {
 ; GFX10-NEXT:    v_ldexp_f32 v1, v1, v3
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v4, v6
 ; GFX10-NEXT:    v_bfe_u32 v3, v2, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v5, v2, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v2
 ; GFX10-NEXT:    v_bfe_u32 v7, v0, 16, 1
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
 ; GFX10-NEXT:    v_ldexp_f32 v4, v4, v9
 ; GFX10-NEXT:    v_add3_u32 v3, v3, v2, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v8, v0, s4, 0x400000
-; GFX10-NEXT:    v_bfe_u32 v9, v1, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v8, v1, 16, 1
+; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v0
 ; GFX10-NEXT:    v_add3_u32 v7, v7, v0, 0x7fff
 ; GFX10-NEXT:    v_bfe_u32 v6, v4, 16, 1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v3, v5, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT:    v_add3_u32 v3, v9, v1, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v5, v1, s4, 0x400000
+; GFX10-NEXT:    v_add3_u32 v8, v8, v1, 0x7fff
+; GFX10-NEXT:    v_or_b32_e32 v3, 0x400000, v1
 ; GFX10-NEXT:    v_add3_u32 v6, v6, v4, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v9, v4, s4, 0x400000
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v7, v8, vcc_lo
+; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v7, v9, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v8, v3, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
 ; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v6, v9, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v6, v5, vcc_lo
 ; GFX10-NEXT:    v_perm_b32 v1, v3, v2, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -34925,9 +34094,10 @@ define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) {
 ; GFX11-NEXT:    v_clz_i32_u32_e32 v10, v1
 ; GFX11-NEXT:    v_clz_i32_u32_e32 v11, v3
 ; GFX11-NEXT:    v_clz_i32_u32_e32 v9, v7
-; GFX11-NEXT:    s_mov_b32 s0, 0xff800000
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_min_u32_e32 v8, 32, v8
 ; GFX11-NEXT:    v_min_u32_e32 v10, 32, v10
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_min_u32_e32 v11, 32, v11
 ; GFX11-NEXT:    v_min_u32_e32 v9, 32, v9
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
@@ -34948,41 +34118,42 @@ define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX11-NEXT:    v_or_b32_e32 v1, v3, v2
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v5, 32, v10
-; GFX11-NEXT:    v_or_b32_e32 v6, v7, v6
-; GFX11-NEXT:    v_cvt_f32_u32_e32 v2, v4
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v3, 32, v11
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v2, v4
 ; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; GFX11-NEXT:    v_cvt_f32_u32_e32 v1, v1
-; GFX11-NEXT:    v_cvt_f32_u32_e32 v4, v6
-; GFX11-NEXT:    v_ldexp_f32 v2, v2, v8
+; GFX11-NEXT:    v_or_b32_e32 v6, v7, v6
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_ldexp_f32 v2, v2, v8
 ; GFX11-NEXT:    v_ldexp_f32 v0, v0, v5
-; GFX11-NEXT:    v_ldexp_f32 v1, v1, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_ldexp_f32 v4, v4, v9
+; GFX11-NEXT:    v_ldexp_f32 v1, v1, v3
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v4, v6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX11-NEXT:    v_bfe_u32 v3, v2, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v5, v2, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v2
 ; GFX11-NEXT:    v_bfe_u32 v7, v0, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v8, v1, 16, 1
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT:    v_bfe_u32 v6, v4, 16, 1
 ; GFX11-NEXT:    v_add3_u32 v3, v3, v2, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v8, v0, s0, 0x400000
-; GFX11-NEXT:    v_bfe_u32 v9, v1, 16, 1
+; GFX11-NEXT:    v_ldexp_f32 v4, v4, v9
+; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v0
 ; GFX11-NEXT:    v_add3_u32 v7, v7, v0, 0x7fff
-; GFX11-NEXT:    v_add3_u32 v6, v6, v4, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v8, v8, v1, 0x7fff
 ; GFX11-NEXT:    v_cndmask_b32_e32 v2, v3, v5, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    v_add3_u32 v3, v9, v1, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v5, v1, s0, 0x400000
-; GFX11-NEXT:    v_and_or_b32 v9, v4, s0, 0x400000
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v7, v8, vcc_lo
+; GFX11-NEXT:    v_bfe_u32 v6, v4, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v3, 0x400000, v1
+; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v4
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v7, v9, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX11-NEXT:    v_add3_u32 v6, v6, v4, 0x7fff
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v8, v3, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v6, v9, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, v6, v5, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_perm_b32 v1, v3, v2, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = uitofp <4 x i64> %x to <4 x bfloat>
@@ -40088,8 +39259,7 @@ define bfloat @v_fma_bf16(bfloat %a, bfloat %b, bfloat %c) {
 ; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -40104,9 +39274,8 @@ define bfloat @v_fma_bf16(bfloat %a, bfloat %b, bfloat %c) {
 ; GFX9-NEXT:    v_fma_f32 v0, v0, v1, v2
 ; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xff800000, v0
 ; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -40118,10 +39287,9 @@ define bfloat @v_fma_bf16(bfloat %a, bfloat %b, bfloat %c) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
 ; GFX10-NEXT:    v_fmac_f32_e32 v2, v0, v1
 ; GFX10-NEXT:    v_bfe_u32 v0, v2, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v1, v2, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v1, 0x400000, v2
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
 ; GFX10-NEXT:    v_add3_u32 v0, v0, v2, 0x7fff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
@@ -40134,11 +39302,10 @@ define bfloat @v_fma_bf16(bfloat %a, bfloat %b, bfloat %c) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:    s_mov_b32 s0, 0xff800000
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_fmac_f32_e32 v2, v0, v1
 ; GFX11-NEXT:    v_bfe_u32 v0, v2, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v1, v2, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v1, 0x400000, v2
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_add3_u32 v0, v0, v2, 0x7fff
@@ -40206,16 +39373,14 @@ define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat>
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7fff, v4
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xff800000, v3
 ; GFX8-NEXT:    v_fma_f32 v0, v0, v1, v2
-; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v3
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -40234,16 +39399,14 @@ define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat>
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX9-NEXT:    v_bfe_u32 v4, v3, 16, 1
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xff800000, v3
 ; GFX9-NEXT:    v_fma_f32 v0, v0, v1, v2
 ; GFX9-NEXT:    v_add3_u32 v4, v4, v3, s4
-; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v3
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xff800000, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
 ; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
@@ -40259,14 +39422,13 @@ define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat>
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
 ; GFX10-NEXT:    v_fmac_f32_e32 v3, v5, v4
 ; GFX10-NEXT:    v_fmac_f32_e32 v2, v0, v1
 ; GFX10-NEXT:    v_bfe_u32 v0, v3, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v4, v3, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v4, 0x400000, v3
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX10-NEXT:    v_bfe_u32 v1, v2, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v5, v2, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v2
 ; GFX10-NEXT:    v_add3_u32 v0, v0, v3, 0x7fff
 ; GFX10-NEXT:    v_add3_u32 v1, v1, v2, 0x7fff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
@@ -40284,15 +39446,14 @@ define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat>
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT:    s_mov_b32 s0, 0xff800000
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_dual_fmac_f32 v2, v0, v1 :: v_dual_fmac_f32 v3, v5, v4
 ; GFX11-NEXT:    v_bfe_u32 v1, v2, 16, 1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_bfe_u32 v0, v3, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v4, v3, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v4, 0x400000, v3
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT:    v_and_or_b32 v5, v2, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v2
 ; GFX11-NEXT:    v_add3_u32 v1, v1, v2, 0x7fff
 ; GFX11-NEXT:    v_add3_u32 v0, v0, v3, 0x7fff
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
@@ -40375,8 +39536,7 @@ define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat>
 ; GFX8-NEXT:    v_bfe_u32 v3, v1, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v1
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xff800000, v1
-; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v1
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
@@ -40390,16 +39550,14 @@ define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat>
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s4, v5
-; GFX8-NEXT:    v_and_b32_e32 v6, 0xff800000, v3
 ; GFX8-NEXT:    v_fma_f32 v0, v0, v2, v4
-; GFX8-NEXT:    v_or_b32_e32 v6, 0x400000, v6
+; GFX8-NEXT:    v_or_b32_e32 v6, 0x400000, v3
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -40416,9 +39574,8 @@ define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat>
 ; GFX9-NEXT:    v_fma_f32 v1, v1, v3, v5
 ; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xff800000, v1
 ; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v1
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
@@ -40429,16 +39586,14 @@ define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat>
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX9-NEXT:    v_bfe_u32 v5, v3, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v6, 0xff800000, v3
 ; GFX9-NEXT:    v_fma_f32 v0, v0, v2, v4
 ; GFX9-NEXT:    v_add3_u32 v5, v5, v3, s4
-; GFX9-NEXT:    v_or_b32_e32 v6, 0x400000, v6
+; GFX9-NEXT:    v_or_b32_e32 v6, 0x400000, v3
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xff800000, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
 ; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v4
+; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
@@ -40460,16 +39615,15 @@ define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat>
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX10-NEXT:    v_fmac_f32_e32 v6, v8, v7
 ; GFX10-NEXT:    v_fmac_f32_e32 v5, v1, v3
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
 ; GFX10-NEXT:    v_fmac_f32_e32 v4, v0, v2
 ; GFX10-NEXT:    v_bfe_u32 v1, v6, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v3, v6, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v3, 0x400000, v6
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX10-NEXT:    v_bfe_u32 v0, v5, 16, 1
 ; GFX10-NEXT:    v_bfe_u32 v2, v4, 16, 1
 ; GFX10-NEXT:    v_add3_u32 v1, v1, v6, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v7, v4, s4, 0x400000
-; GFX10-NEXT:    v_and_or_b32 v8, v5, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v7, 0x400000, v4
+; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v5
 ; GFX10-NEXT:    v_add3_u32 v0, v0, v5, 0x7fff
 ; GFX10-NEXT:    v_add3_u32 v2, v2, v4, 0x7fff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
@@ -40572,17 +39726,15 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat>
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0x7fff, v7
-; GFX8-NEXT:    v_and_b32_e32 v8, 0xff800000, v6
 ; GFX8-NEXT:    v_fma_f32 v1, v1, v3, v5
-; GFX8-NEXT:    v_or_b32_e32 v8, 0x400000, v8
+; GFX8-NEXT:    v_or_b32_e32 v8, 0x400000, v6
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX8-NEXT:    v_bfe_u32 v3, v1, 16, 1
 ; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX8-NEXT:    v_cndmask_b32_e32 v6, v7, v8, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v1
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, s4, v3
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xff800000, v1
-; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v1
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
@@ -40595,16 +39747,14 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat>
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s4, v5
-; GFX8-NEXT:    v_and_b32_e32 v7, 0xff800000, v3
 ; GFX8-NEXT:    v_fma_f32 v0, v0, v2, v4
-; GFX8-NEXT:    v_or_b32_e32 v7, 0x400000, v7
+; GFX8-NEXT:    v_or_b32_e32 v7, 0x400000, v3
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v5, v7, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
@@ -40625,16 +39775,14 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat>
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX9-NEXT:    v_bfe_u32 v7, v6, 16, 1
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_and_b32_e32 v8, 0xff800000, v6
 ; GFX9-NEXT:    v_fma_f32 v1, v1, v3, v5
 ; GFX9-NEXT:    v_add3_u32 v7, v7, v6, s4
-; GFX9-NEXT:    v_or_b32_e32 v8, 0x400000, v8
+; GFX9-NEXT:    v_or_b32_e32 v8, 0x400000, v6
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xff800000, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v7, v8, vcc
 ; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v1
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
@@ -40645,16 +39793,14 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat>
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX9-NEXT:    v_bfe_u32 v5, v3, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v7, 0xff800000, v3
 ; GFX9-NEXT:    v_fma_f32 v0, v0, v2, v4
 ; GFX9-NEXT:    v_add3_u32 v5, v5, v3, s4
-; GFX9-NEXT:    v_or_b32_e32 v7, 0x400000, v7
+; GFX9-NEXT:    v_or_b32_e32 v7, 0x400000, v3
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xff800000, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v7, vcc
 ; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v4
+; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
@@ -40681,22 +39827,21 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat>
 ; GFX10-NEXT:    v_bfe_u32 v10, v6, 16, 1
 ; GFX10-NEXT:    v_fmac_f32_e32 v5, v1, v3
 ; GFX10-NEXT:    v_fmac_f32_e32 v7, v9, v8
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
+; GFX10-NEXT:    v_or_b32_e32 v1, 0x400000, v6
 ; GFX10-NEXT:    v_fmac_f32_e32 v4, v0, v2
 ; GFX10-NEXT:    v_add3_u32 v0, v10, v6, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v1, v6, s4, 0x400000
 ; GFX10-NEXT:    v_bfe_u32 v2, v5, 16, 1
 ; GFX10-NEXT:    v_bfe_u32 v3, v7, 16, 1
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX10-NEXT:    v_bfe_u32 v8, v4, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v9, v5, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v5
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc_lo
 ; GFX10-NEXT:    v_add3_u32 v0, v2, v5, 0x7fff
 ; GFX10-NEXT:    v_add3_u32 v2, v3, v7, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v3, v7, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v3, 0x400000, v7
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
 ; GFX10-NEXT:    v_add3_u32 v6, v8, v4, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v8, v4, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v4
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v6, v8, vcc_lo
@@ -40717,14 +39862,13 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat>
 ; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX11-NEXT:    s_mov_b32 s0, 0xff800000
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_fmac_f32_e32 v5, v1, v3
 ; GFX11-NEXT:    v_dual_fmac_f32 v6, v8, v7 :: v_dual_lshlrev_b32 v7, 16, v4
 ; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_bfe_u32 v10, v6, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v1, v6, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v1, 0x400000, v6
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
@@ -40736,14 +39880,14 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat>
 ; GFX11-NEXT:    v_fmac_f32_e32 v7, v9, v8
 ; GFX11-NEXT:    v_bfe_u32 v8, v4, 16, 1
 ; GFX11-NEXT:    v_add3_u32 v0, v2, v5, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v9, v5, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v5
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_bfe_u32 v3, v7, 16, 1
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
 ; GFX11-NEXT:    v_add3_u32 v6, v8, v4, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v8, v4, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v4
 ; GFX11-NEXT:    v_add3_u32 v2, v3, v7, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v3, v7, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v3, 0x400000, v7
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
@@ -40803,8 +39947,7 @@ define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) {
 ; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v3
+; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
@@ -40813,8 +39956,7 @@ define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) {
 ; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -40828,18 +39970,16 @@ define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) {
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xff800000, v0
 ; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v3
+; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xff800000, v0
 ; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -40850,10 +39990,9 @@ define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v3, v0, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v3, 0x400000, v0
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc_lo
@@ -40861,7 +40000,7 @@ define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) {
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v2, v0, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
@@ -40873,11 +40012,10 @@ define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:    s_mov_b32 s0, 0xff800000
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v3, v0, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v3, 0x400000, v0
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
@@ -40887,7 +40025,7 @@ define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) {
 ; GFX11-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v2, v0, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -40958,8 +40096,7 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl
 ; GFX8-NEXT:    v_bfe_u32 v4, v3, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v3
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7fff, v4
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xff800000, v3
-; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v3
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
@@ -40971,16 +40108,14 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, s4, v4
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xff800000, v3
 ; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v3
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s4, v1
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v4, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
@@ -40989,8 +40124,7 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl
 ; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -41005,36 +40139,32 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl
 ; GFX9-NEXT:    v_mul_f32_e32 v3, v4, v3
 ; GFX9-NEXT:    v_bfe_u32 v4, v3, 16, 1
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xff800000, v3
 ; GFX9-NEXT:    v_add3_u32 v4, v4, v3, s4
-; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v3
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX9-NEXT:    v_add_f32_e32 v3, v3, v4
-; GFX9-NEXT:    v_bfe_u32 v4, v3, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xff800000, v3
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_bfe_u32 v4, v3, 16, 1
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_add3_u32 v4, v4, v3, s4
-; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v3
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
 ; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xff800000, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
 ; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v4
+; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v4, vcc
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xff800000, v0
 ; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
@@ -41048,14 +40178,13 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
 ; GFX10-NEXT:    v_mul_f32_e32 v3, v4, v3
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_bfe_u32 v1, v3, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v5, v3, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v3
 ; GFX10-NEXT:    v_bfe_u32 v4, v0, 16, 1
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX10-NEXT:    v_and_or_b32 v6, v0, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v6, 0x400000, v0
 ; GFX10-NEXT:    v_add3_u32 v1, v1, v3, 0x7fff
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX10-NEXT:    v_add3_u32 v4, v4, v0, 0x7fff
@@ -41066,13 +40195,13 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc_lo
 ; GFX10-NEXT:    v_add_f32_e32 v1, v1, v3
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    v_and_or_b32 v4, v1, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v4, 0x400000, v1
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_bfe_u32 v2, v1, 16, 1
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
 ; GFX10-NEXT:    v_bfe_u32 v3, v0, 16, 1
 ; GFX10-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v5, v0, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v0
 ; GFX10-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
@@ -41086,19 +40215,17 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    s_mov_b32 s0, 0xff800000
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_mul_f32_e32 v3, v4, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_mul_f32 v3, v4, v3 :: v_dual_and_b32 v0, 0xffff0000, v0
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_bfe_u32 v1, v3, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v5, v3, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v3
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX11-NEXT:    v_add3_u32 v1, v1, v3, 0x7fff
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX11-NEXT:    v_bfe_u32 v4, v0, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v6, v0, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v6, 0x400000, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
@@ -41109,7 +40236,7 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl
 ; GFX11-NEXT:    v_dual_cndmask_b32 v0, v4, v6 :: v_dual_add_f32 v1, v1, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_and_or_b32 v4, v1, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v4, 0x400000, v1
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_add_f32_e32 v0, v0, v2
@@ -41117,7 +40244,7 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl
 ; GFX11-NEXT:    v_bfe_u32 v3, v0, 16, 1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v5, v0, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v0
 ; GFX11-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
@@ -41210,8 +40337,7 @@ define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl
 ; GFX8-NEXT:    v_bfe_u32 v3, v1, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v1
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
-; GFX8-NEXT:    v_and_b32_e32 v6, 0xff800000, v1
-; GFX8-NEXT:    v_or_b32_e32 v6, 0x400000, v6
+; GFX8-NEXT:    v_or_b32_e32 v6, 0x400000, v1
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v6, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
@@ -41221,8 +40347,7 @@ define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl
 ; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v1
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, s4, v3
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xff800000, v1
-; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v1
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
@@ -41231,8 +40356,7 @@ define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl
 ; GFX8-NEXT:    v_bfe_u32 v5, v3, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v3
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s4, v5
-; GFX8-NEXT:    v_and_b32_e32 v6, 0xff800000, v3
-; GFX8-NEXT:    v_or_b32_e32 v6, 0x400000, v6
+; GFX8-NEXT:    v_or_b32_e32 v6, 0x400000, v3
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
@@ -41243,16 +40367,14 @@ define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s4, v5
-; GFX8-NEXT:    v_and_b32_e32 v6, 0xff800000, v3
 ; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX8-NEXT:    v_or_b32_e32 v6, 0x400000, v6
+; GFX8-NEXT:    v_or_b32_e32 v6, 0x400000, v3
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s4, v2
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v5, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
@@ -41261,8 +40383,7 @@ define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl
 ; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -41278,54 +40399,48 @@ define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl
 ; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v3
 ; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_and_b32_e32 v6, 0xff800000, v1
 ; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT:    v_or_b32_e32 v6, 0x400000, v6
+; GFX9-NEXT:    v_or_b32_e32 v6, 0x400000, v1
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v6, vcc
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
 ; GFX9-NEXT:    v_add_f32_e32 v1, v1, v3
 ; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xff800000, v1
 ; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v1
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v3, v5, v3
 ; GFX9-NEXT:    v_bfe_u32 v5, v3, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v6, 0xff800000, v3
 ; GFX9-NEXT:    v_add3_u32 v5, v5, v3, s4
-; GFX9-NEXT:    v_or_b32_e32 v6, 0x400000, v6
+; GFX9-NEXT:    v_or_b32_e32 v6, 0x400000, v3
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX9-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX9-NEXT:    v_bfe_u32 v5, v3, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v6, 0xff800000, v3
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_bfe_u32 v5, v3, 16, 1
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX9-NEXT:    v_add3_u32 v5, v5, v3, s4
-; GFX9-NEXT:    v_or_b32_e32 v6, 0x400000, v6
+; GFX9-NEXT:    v_or_b32_e32 v6, 0x400000, v3
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
 ; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xff800000, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
 ; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v5, vcc
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v4
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v2
 ; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xff800000, v0
 ; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v4
+; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
@@ -41343,41 +40458,40 @@ define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
 ; GFX10-NEXT:    v_mul_f32_e32 v3, v7, v6
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v8, v1, s4, 0x400000
-; GFX10-NEXT:    v_bfe_u32 v6, v3, 16, 1
+; GFX10-NEXT:    v_or_b32_e32 v6, 0x400000, v1
+; GFX10-NEXT:    v_bfe_u32 v7, v3, 16, 1
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT:    v_bfe_u32 v7, v0, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v8, v0, 16, 1
 ; GFX10-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v9, v3, s4, 0x400000
-; GFX10-NEXT:    v_add3_u32 v6, v6, v3, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v10, v0, s4, 0x400000
-; GFX10-NEXT:    v_add3_u32 v7, v7, v0, 0x7fff
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v2, v8, vcc_lo
+; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v3
+; GFX10-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX10-NEXT:    v_or_b32_e32 v10, 0x400000, v0
+; GFX10-NEXT:    v_add3_u32 v8, v8, v0, 0x7fff
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v2, v6, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v6, v9, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v7, v9, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX10-NEXT:    v_add_f32_e32 v1, v1, v3
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v7, v10, vcc_lo
-; GFX10-NEXT:    v_and_or_b32 v6, v1, s4, 0x400000
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v8, v10, vcc_lo
+; GFX10-NEXT:    v_or_b32_e32 v6, 0x400000, v1
 ; GFX10-NEXT:    v_add_f32_e32 v2, v2, v5
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX10-NEXT:    v_bfe_u32 v3, v2, 16, 1
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v4
-; GFX10-NEXT:    v_and_or_b32 v7, v2, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v7, 0x400000, v2
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
 ; GFX10-NEXT:    v_bfe_u32 v4, v1, 16, 1
 ; GFX10-NEXT:    v_add3_u32 v3, v3, v2, 0x7fff
 ; GFX10-NEXT:    v_bfe_u32 v5, v0, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v8, v0, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v0
 ; GFX10-NEXT:    v_add3_u32 v4, v4, v1, 0x7fff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v3, v7, vcc_lo
 ; GFX10-NEXT:    v_add3_u32 v5, v5, v0, 0x7fff
@@ -41492,8 +40606,7 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl
 ; GFX8-NEXT:    v_bfe_u32 v7, v6, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v6
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0x7fff, v7
-; GFX8-NEXT:    v_and_b32_e32 v8, 0xff800000, v6
-; GFX8-NEXT:    v_or_b32_e32 v8, 0x400000, v8
+; GFX8-NEXT:    v_or_b32_e32 v8, 0x400000, v6
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX8-NEXT:    v_cndmask_b32_e32 v6, v7, v8, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
@@ -41505,16 +40618,14 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, s4, v7
-; GFX8-NEXT:    v_and_b32_e32 v8, 0xff800000, v6
 ; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX8-NEXT:    v_or_b32_e32 v8, 0x400000, v8
+; GFX8-NEXT:    v_or_b32_e32 v8, 0x400000, v6
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX8-NEXT:    v_bfe_u32 v3, v1, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v6, v7, v8, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v1
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, s4, v3
-; GFX8-NEXT:    v_and_b32_e32 v7, 0xff800000, v1
-; GFX8-NEXT:    v_or_b32_e32 v7, 0x400000, v7
+; GFX8-NEXT:    v_or_b32_e32 v7, 0x400000, v1
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v7, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
@@ -41523,8 +40634,7 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl
 ; GFX8-NEXT:    v_bfe_u32 v3, v1, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v1
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, s4, v3
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xff800000, v1
-; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v1
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
@@ -41533,8 +40643,7 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl
 ; GFX8-NEXT:    v_bfe_u32 v5, v3, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v3
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s4, v5
-; GFX8-NEXT:    v_and_b32_e32 v7, 0xff800000, v3
-; GFX8-NEXT:    v_or_b32_e32 v7, 0x400000, v7
+; GFX8-NEXT:    v_or_b32_e32 v7, 0x400000, v3
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v5, v7, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
@@ -41545,16 +40654,14 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s4, v5
-; GFX8-NEXT:    v_and_b32_e32 v7, 0xff800000, v3
 ; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX8-NEXT:    v_or_b32_e32 v7, 0x400000, v7
+; GFX8-NEXT:    v_or_b32_e32 v7, 0x400000, v3
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v5, v7, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s4, v2
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v5, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
@@ -41563,8 +40670,7 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl
 ; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
@@ -41581,72 +40687,64 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl
 ; GFX9-NEXT:    v_mul_f32_e32 v6, v7, v6
 ; GFX9-NEXT:    v_bfe_u32 v7, v6, 16, 1
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_and_b32_e32 v8, 0xff800000, v6
 ; GFX9-NEXT:    v_add3_u32 v7, v7, v6, s4
-; GFX9-NEXT:    v_or_b32_e32 v8, 0x400000, v8
+; GFX9-NEXT:    v_or_b32_e32 v8, 0x400000, v6
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v7, v8, vcc
 ; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
 ; GFX9-NEXT:    v_add_f32_e32 v6, v6, v7
-; GFX9-NEXT:    v_bfe_u32 v7, v6, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v8, 0xff800000, v6
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_bfe_u32 v7, v6, 16, 1
+; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v3
 ; GFX9-NEXT:    v_add3_u32 v7, v7, v6, s4
-; GFX9-NEXT:    v_or_b32_e32 v8, 0x400000, v8
+; GFX9-NEXT:    v_or_b32_e32 v8, 0x400000, v6
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
-; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v7, v8, vcc
 ; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v7, 0xff800000, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v7, v8, vcc
 ; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT:    v_or_b32_e32 v7, 0x400000, v7
+; GFX9-NEXT:    v_or_b32_e32 v7, 0x400000, v1
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v7, vcc
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
 ; GFX9-NEXT:    v_add_f32_e32 v1, v1, v3
 ; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xff800000, v1
 ; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v1
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v3, v5, v3
 ; GFX9-NEXT:    v_bfe_u32 v5, v3, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v7, 0xff800000, v3
 ; GFX9-NEXT:    v_add3_u32 v5, v5, v3, s4
-; GFX9-NEXT:    v_or_b32_e32 v7, 0x400000, v7
+; GFX9-NEXT:    v_or_b32_e32 v7, 0x400000, v3
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v7, vcc
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX9-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX9-NEXT:    v_bfe_u32 v5, v3, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v7, 0xff800000, v3
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_bfe_u32 v5, v3, 16, 1
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX9-NEXT:    v_add3_u32 v5, v5, v3, s4
-; GFX9-NEXT:    v_or_b32_e32 v7, 0x400000, v7
+; GFX9-NEXT:    v_or_b32_e32 v7, 0x400000, v3
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v7, vcc
 ; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xff800000, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v7, vcc
 ; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v5, vcc
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v4
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v2
 ; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xff800000, v0
 ; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v4
+; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
@@ -41667,45 +40765,44 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX10-NEXT:    s_mov_b32 s4, 0xff800000
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
 ; GFX10-NEXT:    v_bfe_u32 v10, v6, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v3, v6, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v3, 0x400000, v6
 ; GFX10-NEXT:    v_mul_f32_e32 v7, v9, v7
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_bfe_u32 v2, v1, 16, 1
 ; GFX10-NEXT:    v_add3_u32 v10, v10, v6, 0x7fff
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX10-NEXT:    v_and_or_b32 v6, v1, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v6, 0x400000, v1
 ; GFX10-NEXT:    v_bfe_u32 v9, v7, 16, 1
 ; GFX10-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
 ; GFX10-NEXT:    v_bfe_u32 v11, v0, 16, 1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v10, v3, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT:    v_and_or_b32 v10, v7, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v10, 0x400000, v7
 ; GFX10-NEXT:    v_add3_u32 v9, v9, v7, 0x7fff
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
+; GFX10-NEXT:    v_or_b32_e32 v12, 0x400000, v0
 ; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v2, v6, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX10-NEXT:    v_and_or_b32 v12, v0, s4, 0x400000
 ; GFX10-NEXT:    v_add3_u32 v11, v11, v0, 0x7fff
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; GFX10-NEXT:    v_add_f32_e32 v3, v3, v8
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v9, v10, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX10-NEXT:    v_bfe_u32 v7, v3, 16, 1
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v11, v12, vcc_lo
-; GFX10-NEXT:    v_bfe_u32 v7, v3, 16, 1
 ; GFX10-NEXT:    v_add_f32_e32 v1, v1, v5
-; GFX10-NEXT:    v_and_or_b32 v5, v3, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v3
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX10-NEXT:    v_add_f32_e32 v2, v2, v6
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX10-NEXT:    v_bfe_u32 v6, v1, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v9, v1, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v1
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v4
 ; GFX10-NEXT:    v_add3_u32 v4, v7, v3, 0x7fff
 ; GFX10-NEXT:    v_bfe_u32 v7, v2, 16, 1
@@ -41713,10 +40810,10 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc_lo
 ; GFX10-NEXT:    v_add3_u32 v4, v6, v1, 0x7fff
 ; GFX10-NEXT:    v_add3_u32 v5, v7, v2, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v6, v2, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v6, 0x400000, v2
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
 ; GFX10-NEXT:    v_add3_u32 v7, v8, v0, 0x7fff
-; GFX10-NEXT:    v_and_or_b32 v8, v0, s4, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v7, v8, vcc_lo
@@ -41736,7 +40833,6 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
 ; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT:    s_mov_b32 s0, 0xff800000
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_dual_mul_f32 v6, v7, v6 :: v_dual_and_b32 v5, 0xffff0000, v5
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
@@ -41744,20 +40840,20 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_bfe_u32 v10, v6, 16, 1
 ; GFX11-NEXT:    v_mul_f32_e32 v7, v9, v7
-; GFX11-NEXT:    v_and_or_b32 v3, v6, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v3, 0x400000, v6
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_add3_u32 v10, v10, v6, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v6, v1, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v6, 0x400000, v1
 ; GFX11-NEXT:    v_bfe_u32 v9, v7, 16, 1
 ; GFX11-NEXT:    v_dual_cndmask_b32 v3, v10, v3 :: v_dual_mul_f32 v0, v0, v2
 ; GFX11-NEXT:    v_bfe_u32 v2, v1, 16, 1
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT:    v_and_or_b32 v10, v7, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v7
 ; GFX11-NEXT:    v_add3_u32 v9, v9, v7, 0x7fff
 ; GFX11-NEXT:    v_bfe_u32 v11, v0, 16, 1
 ; GFX11-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v12, v0, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v12, 0x400000, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_add3_u32 v11, v11, v0, 0x7fff
 ; GFX11-NEXT:    v_dual_cndmask_b32 v1, v2, v6 :: v_dual_lshlrev_b32 v6, 16, v4
@@ -41769,7 +40865,7 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl
 ; GFX11-NEXT:    v_dual_add_f32 v1, v1, v5 :: v_dual_and_b32 v2, 0xffff0000, v2
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v11, v12, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_and_or_b32 v9, v1, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v1
 ; GFX11-NEXT:    v_add_f32_e32 v2, v2, v6
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
@@ -41781,7 +40877,7 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl
 ; GFX11-NEXT:    v_bfe_u32 v8, v0, 16, 1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v5, v3, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v3
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX11-NEXT:    v_add3_u32 v4, v7, v3, 0x7fff
 ; GFX11-NEXT:    v_bfe_u32 v7, v2, 16, 1
@@ -41789,10 +40885,10 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl
 ; GFX11-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc_lo
 ; GFX11-NEXT:    v_add3_u32 v4, v6, v1, 0x7fff
 ; GFX11-NEXT:    v_add3_u32 v5, v7, v2, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v6, v2, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v6, 0x400000, v2
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
 ; GFX11-NEXT:    v_add3_u32 v7, v8, v0, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v8, v0, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc_lo
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
diff --git a/llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll b/llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll
index d35871e..99b163d 100644
--- a/llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll
@@ -790,8 +790,7 @@ define bfloat @fmed3_f32_fpext_f16_fptrunc_bf16(half %arg0, half %arg1, half %ar
 ; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xff800000, v0
-; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -806,9 +805,8 @@ define bfloat @fmed3_f32_fpext_f16_fptrunc_bf16(half %arg0, half %arg1, half %ar
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX9-NEXT:    v_med3_f32 v0, v0, v1, v2
 ; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xff800000, v0
 ; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll
index 9142858..5889de7 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll
@@ -1524,9 +1524,8 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p
 ; GFX900-NEXT:    v_lshrrev_b32_sdwa v1, s5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX900-NEXT:    v_add_f32_e32 v1, 4.0, v1
 ; GFX900-NEXT:    v_bfe_u32 v3, v1, 16, 1
-; GFX900-NEXT:    v_and_b32_e32 v4, 0xff800000, v1
+; GFX900-NEXT:    v_or_b32_e32 v4, 0x400000, v1
 ; GFX900-NEXT:    v_add3_u32 v3, v3, v1, s4
-; GFX900-NEXT:    v_or_b32_e32 v4, 0x400000, v4
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX900-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
@@ -1566,9 +1565,8 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p
 ; GFX908-NEXT:    v_lshrrev_b32_sdwa v1, s5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX908-NEXT:    v_add_f32_e32 v1, 4.0, v1
 ; GFX908-NEXT:    v_bfe_u32 v3, v1, 16, 1
-; GFX908-NEXT:    v_and_b32_e32 v4, 0xff800000, v1
+; GFX908-NEXT:    v_or_b32_e32 v4, 0x400000, v1
 ; GFX908-NEXT:    v_add3_u32 v3, v3, v1, s4
-; GFX908-NEXT:    v_or_b32_e32 v4, 0x400000, v4
 ; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX908-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
 ; GFX908-NEXT:    v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
@@ -1608,9 +1606,8 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p
 ; GFX90A-NEXT:    v_lshrrev_b32_sdwa v1, s5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX90A-NEXT:    v_add_f32_e32 v1, 4.0, v1
 ; GFX90A-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX90A-NEXT:    v_and_b32_e32 v4, 0xff800000, v1
+; GFX90A-NEXT:    v_or_b32_e32 v4, 0x400000, v1
 ; GFX90A-NEXT:    v_add3_u32 v2, v2, v1, s4
-; GFX90A-NEXT:    v_or_b32_e32 v4, 0x400000, v4
 ; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc
 ; GFX90A-NEXT:    v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
@@ -1632,7 +1629,6 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10-NEXT:    s_mov_b32 s5, 0xff800000
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_and_b32 s0, s2, -4
 ; GFX10-NEXT:    s_mov_b32 s1, s3
@@ -1650,7 +1646,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p
 ; GFX10-NEXT:    v_lshrrev_b32_sdwa v1, s2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT:    v_add_f32_e32 v1, 4.0, v1
 ; GFX10-NEXT:    v_bfe_u32 v3, v1, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v4, v1, s5, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v4, 0x400000, v1
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
 ; GFX10-NEXT:    v_add3_u32 v3, v3, v1, 0x7fff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc_lo
@@ -1673,7 +1669,6 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p
 ; GFX11-LABEL: global_atomic_fadd_ret_bf16_agent:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
-; GFX11-NEXT:    s_mov_b32 s5, 0xff800000
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_and_b32 s0, s2, -4
@@ -1694,7 +1689,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX11-NEXT:    v_add_f32_e32 v1, 4.0, v1
 ; GFX11-NEXT:    v_bfe_u32 v3, v1, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v4, v1, s5, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v4, 0x400000, v1
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
 ; GFX11-NEXT:    v_add3_u32 v3, v3, v1, 0x7fff
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc_lo
@@ -1744,9 +1739,8 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) %
 ; GFX900-NEXT:    v_lshrrev_b32_sdwa v1, s5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX900-NEXT:    v_add_f32_e32 v1, 4.0, v1
 ; GFX900-NEXT:    v_bfe_u32 v3, v1, 16, 1
-; GFX900-NEXT:    v_and_b32_e32 v4, 0xff800000, v1
+; GFX900-NEXT:    v_or_b32_e32 v4, 0x400000, v1
 ; GFX900-NEXT:    v_add3_u32 v3, v3, v1, s4
-; GFX900-NEXT:    v_or_b32_e32 v4, 0x400000, v4
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX900-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
@@ -1786,9 +1780,8 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) %
 ; GFX908-NEXT:    v_lshrrev_b32_sdwa v1, s5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX908-NEXT:    v_add_f32_e32 v1, 4.0, v1
 ; GFX908-NEXT:    v_bfe_u32 v3, v1, 16, 1
-; GFX908-NEXT:    v_and_b32_e32 v4, 0xff800000, v1
+; GFX908-NEXT:    v_or_b32_e32 v4, 0x400000, v1
 ; GFX908-NEXT:    v_add3_u32 v3, v3, v1, s4
-; GFX908-NEXT:    v_or_b32_e32 v4, 0x400000, v4
 ; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX908-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
 ; GFX908-NEXT:    v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
@@ -1828,9 +1821,8 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) %
 ; GFX90A-NEXT:    v_lshrrev_b32_sdwa v1, s5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX90A-NEXT:    v_add_f32_e32 v1, 4.0, v1
 ; GFX90A-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX90A-NEXT:    v_and_b32_e32 v4, 0xff800000, v1
+; GFX90A-NEXT:    v_or_b32_e32 v4, 0x400000, v1
 ; GFX90A-NEXT:    v_add3_u32 v2, v2, v1, s4
-; GFX90A-NEXT:    v_or_b32_e32 v4, 0x400000, v4
 ; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc
 ; GFX90A-NEXT:    v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
@@ -1854,7 +1846,6 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) %
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10-NEXT:    s_mov_b32 s5, 0xff800000
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_and_b32 s0, s2, -4
 ; GFX10-NEXT:    s_mov_b32 s1, s3
@@ -1872,7 +1863,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) %
 ; GFX10-NEXT:    v_lshrrev_b32_sdwa v1, s2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT:    v_add_f32_e32 v1, 4.0, v1
 ; GFX10-NEXT:    v_bfe_u32 v3, v1, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v4, v1, s5, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v4, 0x400000, v1
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
 ; GFX10-NEXT:    v_add3_u32 v3, v3, v1, 0x7fff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc_lo
@@ -1895,7 +1886,6 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) %
 ; GFX11-LABEL: global_atomic_fadd_ret_bf16_system:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
-; GFX11-NEXT:    s_mov_b32 s5, 0xff800000
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_and_b32 s0, s2, -4
@@ -1916,7 +1906,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) %
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX11-NEXT:    v_add_f32_e32 v1, 4.0, v1
 ; GFX11-NEXT:    v_bfe_u32 v3, v1, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v4, v1, s5, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v4, 0x400000, v1
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
 ; GFX11-NEXT:    v_add3_u32 v3, v3, v1, 0x7fff
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc_lo
diff --git a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll
index 6a7fb71..ba946fe 100644
--- a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll
+++ b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll
@@ -912,10 +912,9 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_cc_bfloat(bfloat inreg %a,
   ; DAGISEL-GFX11-WF32-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32767
   ; DAGISEL-GFX11-WF32-NEXT:   [[V_ADD3_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 killed [[V_BFE_U32_e64_]], [[V_ADD_F32_e64_]], killed [[S_MOV_B32_]], implicit $exec
   ; DAGISEL-GFX11-WF32-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 4194304
-  ; DAGISEL-GFX11-WF32-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -8388608
-  ; DAGISEL-GFX11-WF32-NEXT:   [[V_AND_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_OR_B32_e64 [[V_ADD_F32_e64_]], killed [[S_MOV_B32_2]], killed [[S_MOV_B32_1]], implicit $exec
+  ; DAGISEL-GFX11-WF32-NEXT:   [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_ADD_F32_e64_]], killed [[S_MOV_B32_1]], implicit $exec
   ; DAGISEL-GFX11-WF32-NEXT:   [[V_CMP_U_F32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_U_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_ADD_F32_e64_]], 0, implicit $mode, implicit $exec
-  ; DAGISEL-GFX11-WF32-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD3_U32_e64_]], 0, killed [[V_AND_OR_B32_e64_]], killed [[V_CMP_U_F32_e64_]], implicit $exec
+  ; DAGISEL-GFX11-WF32-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD3_U32_e64_]], 0, killed [[V_OR_B32_e64_]], killed [[V_CMP_U_F32_e64_]], implicit $exec
   ; DAGISEL-GFX11-WF32-NEXT:   [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
   ; DAGISEL-GFX11-WF32-NEXT:   [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
   ; DAGISEL-GFX11-WF32-NEXT:   FLAT_STORE_SHORT_D16_HI killed [[COPY2]], killed [[V_CNDMASK_B32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
@@ -934,10 +933,9 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_cc_bfloat(bfloat inreg %a,
   ; DAGISEL-GFX11-WF64-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32767
   ; DAGISEL-GFX11-WF64-NEXT:   [[V_ADD3_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 killed [[V_BFE_U32_e64_]], [[V_ADD_F32_e64_]], killed [[S_MOV_B32_]], implicit $exec
   ; DAGISEL-GFX11-WF64-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 4194304
-  ; DAGISEL-GFX11-WF64-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -8388608
-  ; DAGISEL-GFX11-WF64-NEXT:   [[V_AND_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_OR_B32_e64 [[V_ADD_F32_e64_]], killed [[S_MOV_B32_2]], killed [[S_MOV_B32_1]], implicit $exec
+  ; DAGISEL-GFX11-WF64-NEXT:   [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_ADD_F32_e64_]], killed [[S_MOV_B32_1]], implicit $exec
   ; DAGISEL-GFX11-WF64-NEXT:   [[V_CMP_U_F32_e64_:%[0-9]+]]:sreg_64_xexec = nofpexcept V_CMP_U_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_ADD_F32_e64_]], 0, implicit $mode, implicit $exec
-  ; DAGISEL-GFX11-WF64-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD3_U32_e64_]], 0, killed [[V_AND_OR_B32_e64_]], killed [[V_CMP_U_F32_e64_]], implicit $exec
+  ; DAGISEL-GFX11-WF64-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD3_U32_e64_]], 0, killed [[V_OR_B32_e64_]], killed [[V_CMP_U_F32_e64_]], implicit $exec
   ; DAGISEL-GFX11-WF64-NEXT:   [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
   ; DAGISEL-GFX11-WF64-NEXT:   [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
   ; DAGISEL-GFX11-WF64-NEXT:   FLAT_STORE_SHORT_D16_HI killed [[COPY2]], killed [[V_CNDMASK_B32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
@@ -956,10 +954,9 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_cc_bfloat(bfloat inreg %a,
   ; DAGISEL-GFX10-WF32-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32767
   ; DAGISEL-GFX10-WF32-NEXT:   [[V_ADD3_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 killed [[V_BFE_U32_e64_]], [[V_ADD_F32_e64_]], killed [[S_MOV_B32_]], implicit $exec
   ; DAGISEL-GFX10-WF32-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 4194304
-  ; DAGISEL-GFX10-WF32-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -8388608
-  ; DAGISEL-GFX10-WF32-NEXT:   [[V_AND_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_OR_B32_e64 [[V_ADD_F32_e64_]], killed [[S_MOV_B32_2]], killed [[S_MOV_B32_1]], implicit $exec
+  ; DAGISEL-GFX10-WF32-NEXT:   [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_ADD_F32_e64_]], killed [[S_MOV_B32_1]], implicit $exec
   ; DAGISEL-GFX10-WF32-NEXT:   [[V_CMP_U_F32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_U_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_ADD_F32_e64_]], 0, implicit $mode, implicit $exec
-  ; DAGISEL-GFX10-WF32-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD3_U32_e64_]], 0, killed [[V_AND_OR_B32_e64_]], killed [[V_CMP_U_F32_e64_]], implicit $exec
+  ; DAGISEL-GFX10-WF32-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD3_U32_e64_]], 0, killed [[V_OR_B32_e64_]], killed [[V_CMP_U_F32_e64_]], implicit $exec
   ; DAGISEL-GFX10-WF32-NEXT:   [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
   ; DAGISEL-GFX10-WF32-NEXT:   [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
   ; DAGISEL-GFX10-WF32-NEXT:   FLAT_STORE_SHORT_D16_HI killed [[COPY2]], killed [[V_CNDMASK_B32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
@@ -978,10 +975,9 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_cc_bfloat(bfloat inreg %a,
   ; DAGISEL-GFX10-WF64-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32767
   ; DAGISEL-GFX10-WF64-NEXT:   [[V_ADD3_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 killed [[V_BFE_U32_e64_]], [[V_ADD_F32_e64_]], killed [[S_MOV_B32_]], implicit $exec
   ; DAGISEL-GFX10-WF64-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 4194304
-  ; DAGISEL-GFX10-WF64-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -8388608
-  ; DAGISEL-GFX10-WF64-NEXT:   [[V_AND_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_OR_B32_e64 [[V_ADD_F32_e64_]], killed [[S_MOV_B32_2]], killed [[S_MOV_B32_1]], implicit $exec
+  ; DAGISEL-GFX10-WF64-NEXT:   [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_ADD_F32_e64_]], killed [[S_MOV_B32_1]], implicit $exec
   ; DAGISEL-GFX10-WF64-NEXT:   [[V_CMP_U_F32_e64_:%[0-9]+]]:sreg_64_xexec = nofpexcept V_CMP_U_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_ADD_F32_e64_]], 0, implicit $mode, implicit $exec
-  ; DAGISEL-GFX10-WF64-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD3_U32_e64_]], 0, killed [[V_AND_OR_B32_e64_]], killed [[V_CMP_U_F32_e64_]], implicit $exec
+  ; DAGISEL-GFX10-WF64-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD3_U32_e64_]], 0, killed [[V_OR_B32_e64_]], killed [[V_CMP_U_F32_e64_]], implicit $exec
   ; DAGISEL-GFX10-WF64-NEXT:   [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
   ; DAGISEL-GFX10-WF64-NEXT:   [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
   ; DAGISEL-GFX10-WF64-NEXT:   FLAT_STORE_SHORT_D16_HI killed [[COPY2]], killed [[V_CNDMASK_B32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll
index 48ae98f..5e76dfd 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll
@@ -1413,9 +1413,8 @@ define bfloat @lds_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind {
 ; VI-NEXT:    v_add_f32_e32 v3, 4.0, v3
 ; VI-NEXT:    v_bfe_u32 v6, v3, 16, 1
 ; VI-NEXT:    v_add_u32_e32 v6, vcc, v6, v3
-; VI-NEXT:    v_and_b32_e32 v7, 0xff800000, v3
 ; VI-NEXT:    v_add_u32_e32 v6, vcc, 0x7fff, v6
-; VI-NEXT:    v_or_b32_e32 v7, 0x400000, v7
+; VI-NEXT:    v_or_b32_e32 v7, 0x400000, v3
 ; VI-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; VI-NEXT:    v_cndmask_b32_e32 v3, v6, v7, vcc
 ; VI-NEXT:    v_and_b32_e32 v5, v4, v2
@@ -1451,9 +1450,8 @@ define bfloat @lds_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind {
 ; GFX9-NEXT:    v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_add_f32_e32 v3, 4.0, v3
 ; GFX9-NEXT:    v_bfe_u32 v5, v3, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v6, 0xff800000, v3
+; GFX9-NEXT:    v_or_b32_e32 v6, 0x400000, v3
 ; GFX9-NEXT:    v_add3_u32 v5, v5, v3, s6
-; GFX9-NEXT:    v_or_b32_e32 v6, 0x400000, v6
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
@@ -1560,9 +1558,8 @@ define void @lds_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind {
 ; VI-NEXT:    v_add_f32_e32 v4, 4.0, v4
 ; VI-NEXT:    v_bfe_u32 v6, v4, 16, 1
 ; VI-NEXT:    v_add_u32_e32 v6, vcc, v6, v4
-; VI-NEXT:    v_and_b32_e32 v7, 0xff800000, v4
 ; VI-NEXT:    v_add_u32_e32 v6, vcc, 0x7fff, v6
-; VI-NEXT:    v_or_b32_e32 v7, 0x400000, v7
+; VI-NEXT:    v_or_b32_e32 v7, 0x400000, v4
 ; VI-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; VI-NEXT:    v_cndmask_b32_e32 v4, v6, v7, vcc
 ; VI-NEXT:    v_and_b32_e32 v5, v3, v2
@@ -1597,9 +1594,8 @@ define void @lds_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind {
 ; GFX9-NEXT:    v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_add_f32_e32 v4, 4.0, v4
 ; GFX9-NEXT:    v_bfe_u32 v5, v4, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v6, 0xff800000, v4
+; GFX9-NEXT:    v_or_b32_e32 v6, 0x400000, v4
 ; GFX9-NEXT:    v_add3_u32 v5, v5, v4, s6
-; GFX9-NEXT:    v_or_b32_e32 v6, 0x400000, v6
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v6, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
index 672c93b..66c49ba 100644
--- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
@@ -4259,65 +4259,57 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_fma_f32 v7, v8, v9, v7
 ; GFX9-NEXT:    v_fma_f32 v1, v8, v5, v1
-; GFX9-NEXT:    v_fma_f32 v8, v12, v9, v11
 ; GFX9-NEXT:    v_fma_f32 v2, v12, v5, v2
 ; GFX9-NEXT:    v_bfe_u32 v5, v7, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v9, 0xff800000, v7
+; GFX9-NEXT:    v_fma_f32 v8, v12, v9, v11
+; GFX9-NEXT:    v_or_b32_e32 v9, 0x400000, v7
 ; GFX9-NEXT:    v_bfe_u32 v11, v1, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v12, 0xff800000, v1
 ; GFX9-NEXT:    v_add3_u32 v5, v5, v7, s2
-; GFX9-NEXT:    v_or_b32_e32 v9, 0x400000, v9
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
+; GFX9-NEXT:    v_or_b32_e32 v12, 0x400000, v1
 ; GFX9-NEXT:    v_bfe_u32 v13, v8, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v14, 0xff800000, v8
 ; GFX9-NEXT:    v_add3_u32 v11, v11, v1, s2
-; GFX9-NEXT:    v_or_b32_e32 v12, 0x400000, v12
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v9, vcc
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
+; GFX9-NEXT:    v_or_b32_e32 v14, 0x400000, v8
 ; GFX9-NEXT:    v_bfe_u32 v15, v2, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v16, 0xff800000, v2
 ; GFX9-NEXT:    v_add3_u32 v13, v13, v8, s2
-; GFX9-NEXT:    v_or_b32_e32 v14, 0x400000, v14
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v11, v12, vcc
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 16, v6
+; GFX9-NEXT:    v_or_b32_e32 v16, 0x400000, v2
 ; GFX9-NEXT:    v_add3_u32 v15, v15, v2, s2
-; GFX9-NEXT:    v_or_b32_e32 v16, 0x400000, v16
 ; GFX9-NEXT:    v_cndmask_b32_e32 v7, v13, v14, vcc
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 16, v6
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v15, v16, vcc
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v15, v16, vcc
 ; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX9-NEXT:    v_fma_f32 v1, v3, v10, v1
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX9-NEXT:    v_fma_f32 v1, v3, v10, v1
 ; GFX9-NEXT:    v_fma_f32 v3, v3, v6, v5
+; GFX9-NEXT:    v_bfe_u32 v5, v1, 16, 1
 ; GFX9-NEXT:    v_fma_f32 v2, v4, v10, v2
 ; GFX9-NEXT:    v_fma_f32 v4, v4, v6, v7
-; GFX9-NEXT:    v_bfe_u32 v5, v1, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v6, 0xff800000, v1
+; GFX9-NEXT:    v_or_b32_e32 v6, 0x400000, v1
 ; GFX9-NEXT:    v_bfe_u32 v7, v3, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v8, 0xff800000, v3
 ; GFX9-NEXT:    v_add3_u32 v5, v5, v1, s2
-; GFX9-NEXT:    v_or_b32_e32 v6, 0x400000, v6
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
+; GFX9-NEXT:    v_or_b32_e32 v8, 0x400000, v3
 ; GFX9-NEXT:    v_bfe_u32 v9, v2, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v10, 0xff800000, v2
 ; GFX9-NEXT:    v_add3_u32 v7, v7, v3, s2
-; GFX9-NEXT:    v_or_b32_e32 v8, 0x400000, v8
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
+; GFX9-NEXT:    v_or_b32_e32 v10, 0x400000, v2
 ; GFX9-NEXT:    v_bfe_u32 v11, v4, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v12, 0xff800000, v4
 ; GFX9-NEXT:    v_add3_u32 v9, v9, v2, s2
-; GFX9-NEXT:    v_or_b32_e32 v10, 0x400000, v10
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v8, vcc
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT:    v_or_b32_e32 v12, 0x400000, v4
 ; GFX9-NEXT:    v_add3_u32 v11, v11, v4, s2
-; GFX9-NEXT:    v_or_b32_e32 v12, 0x400000, v12
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v9, v10, vcc
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v11, v12, vcc
@@ -4332,7 +4324,6 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x10
 ; GFX10-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
-; GFX10-NEXT:    s_mov_b32 s2, 0xff800000
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x2
 ; GFX10-NEXT:    global_load_dwordx2 v[0:1], v6, s[0:1]
@@ -4355,20 +4346,20 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl
 ; GFX10-NEXT:    v_fmac_f32_e32 v11, v12, v9
 ; GFX10-NEXT:    v_fmac_f32_e32 v1, v12, v4
 ; GFX10-NEXT:    v_bfe_u32 v4, v7, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v8, v7, s2, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v7
 ; GFX10-NEXT:    v_bfe_u32 v9, v0, 16, 1
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX10-NEXT:    v_and_or_b32 v12, v0, s2, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v12, 0x400000, v0
 ; GFX10-NEXT:    v_add3_u32 v4, v4, v7, 0x7fff
 ; GFX10-NEXT:    v_bfe_u32 v15, v1, 16, 1
 ; GFX10-NEXT:    v_add3_u32 v9, v9, v0, 0x7fff
 ; GFX10-NEXT:    v_bfe_u32 v13, v11, 16, 1
-; GFX10-NEXT:    v_and_or_b32 v16, v1, s2, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v16, 0x400000, v1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX10-NEXT:    v_add3_u32 v15, v15, v1, 0x7fff
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
-; GFX10-NEXT:    v_and_or_b32 v14, v11, s2, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v14, 0x400000, v11
 ; GFX10-NEXT:    v_add3_u32 v13, v13, v11, 0x7fff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v9, v12, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
@@ -4382,7 +4373,7 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl
 ; GFX10-NEXT:    v_fmac_f32_e32 v0, v2, v10
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v7, v13, v14, vcc_lo
-; GFX10-NEXT:    v_and_or_b32 v8, v4, s2, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v4
 ; GFX10-NEXT:    v_bfe_u32 v2, v0, 16, 1
 ; GFX10-NEXT:    v_fmac_f32_e32 v1, v3, v10
 ; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
@@ -4390,14 +4381,14 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl
 ; GFX10-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
 ; GFX10-NEXT:    v_bfe_u32 v9, v1, 16, 1
 ; GFX10-NEXT:    v_fmac_f32_e32 v7, v3, v5
-; GFX10-NEXT:    v_and_or_b32 v3, v0, s2, 0x400000
-; GFX10-NEXT:    v_and_or_b32 v10, v1, s2, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v3, 0x400000, v0
+; GFX10-NEXT:    v_or_b32_e32 v10, 0x400000, v1
 ; GFX10-NEXT:    v_bfe_u32 v5, v4, 16, 1
 ; GFX10-NEXT:    v_add3_u32 v9, v9, v1, 0x7fff
 ; GFX10-NEXT:    v_bfe_u32 v11, v7, 16, 1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT:    v_and_or_b32 v12, v7, s2, 0x400000
+; GFX10-NEXT:    v_or_b32_e32 v12, 0x400000, v7
 ; GFX10-NEXT:    v_add3_u32 v5, v5, v4, 0x7fff
 ; GFX10-NEXT:    v_add3_u32 v11, v11, v7, 0x7fff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v9, v10, vcc_lo
@@ -4416,7 +4407,6 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl
 ; GFX11-NEXT:    s_load_b64 s[2:3], s[0:1], 0x10
 ; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
-; GFX11-NEXT:    s_mov_b32 s0, 0xff800000
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x2
 ; GFX11-NEXT:    global_load_b64 v[0:1], v6, s[2:3]
@@ -4438,11 +4428,11 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl
 ; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_bfe_u32 v13, v11, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v14, v11, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v14, 0x400000, v11
 ; GFX11-NEXT:    v_bfe_u32 v15, v1, 16, 1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX11-NEXT:    v_fmac_f32_e32 v7, v8, v9
-; GFX11-NEXT:    v_and_or_b32 v16, v1, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v16, 0x400000, v1
 ; GFX11-NEXT:    v_add3_u32 v13, v13, v11, 0x7fff
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    v_add3_u32 v15, v15, v1, 0x7fff
@@ -4450,11 +4440,11 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_fmac_f32_e32 v0, v8, v4
 ; GFX11-NEXT:    v_bfe_u32 v4, v7, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v8, v7, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v7
 ; GFX11-NEXT:    v_bfe_u32 v9, v0, 16, 1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_add3_u32 v4, v4, v7, 0x7fff
-; GFX11-NEXT:    v_and_or_b32 v12, v0, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v12, 0x400000, v0
 ; GFX11-NEXT:    v_add3_u32 v9, v9, v0, 0x7fff
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc_lo
@@ -4466,7 +4456,7 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl
 ; GFX11-NEXT:    v_dual_fmac_f32 v4, v2, v5 :: v_dual_cndmask_b32 v1, v15, v16
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_and_or_b32 v8, v4, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v4
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX11-NEXT:    v_cndmask_b32_e32 v7, v13, v14, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -4480,14 +4470,14 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl
 ; GFX11-NEXT:    v_add3_u32 v9, v9, v1, 0x7fff
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX11-NEXT:    v_bfe_u32 v11, v7, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v12, v7, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v12, 0x400000, v7
 ; GFX11-NEXT:    v_add3_u32 v5, v5, v4, 0x7fff
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_fmac_f32_e32 v0, v2, v10
-; GFX11-NEXT:    v_and_or_b32 v10, v1, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v1
 ; GFX11-NEXT:    v_add3_u32 v11, v11, v7, 0x7fff
 ; GFX11-NEXT:    v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT:    v_and_or_b32 v3, v0, s0, 0x400000
+; GFX11-NEXT:    v_or_b32_e32 v3, 0x400000, v0
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
diff --git a/llvm/test/CodeGen/Hexagon/atomicrmw-uinc-udec-wrap.ll b/llvm/test/CodeGen/Hexagon/atomicrmw-uinc-udec-wrap.ll
index 9d7570b..d51c955 100644
--- a/llvm/test/CodeGen/Hexagon/atomicrmw-uinc-udec-wrap.ll
+++ b/llvm/test/CodeGen/Hexagon/atomicrmw-uinc-udec-wrap.ll
@@ -160,10 +160,8 @@ define i64 @atomicrmw_uinc_wrap_i64(ptr %ptr, i64 %val) {
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     p0 = cmp.gtu(r3:2,r5:4)
-; CHECK-NEXT:    }
-; CHECK-NEXT:    {
-; CHECK-NEXT:     r8 = mux(p0,r8,r1)
-; CHECK-NEXT:     r9 = mux(p0,r9,r1)
+; CHECK-NEXT:     if (!p0.new) r8 = add(r1,#0)
+; CHECK-NEXT:     if (!p0.new) r9 = add(r1,#0)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     memd_locked(r0,p0) = r9:8
diff --git a/llvm/test/CodeGen/Hexagon/bit-store-upper-sub-hi.mir b/llvm/test/CodeGen/Hexagon/bit-store-upper-sub-hi.mir
new file mode 100644
index 0000000..ef84043
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/bit-store-upper-sub-hi.mir
@@ -0,0 +1,21 @@
+# RUN: llc -march=hexagon -run-pass=hexagon-bit-simplify -o - %s | FileCheck %s
+
+# This test checks if the HexagonBitSimplify pass correctly replaces a
+# S2_storerh_io with a S2_storerf_io that stores the upper halfword
+# of a high subregister using appropriate subregister boundaries.
+
+# CHECK: S2_storerf_io %0, 28, %{{[0-9]+}}.isub_hi
+# CHECK-NOT: S2_storerf_io %0, 28, %{{[0-9]+}}.isub_lo
+
+---
+name: test_store
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $r0
+    %0:intregs = COPY $r0
+    %1:doubleregs = IMPLICIT_DEF
+    %2:doubleregs = IMPLICIT_DEF
+    %3:doubleregs = S2_shuffoh %2, %1
+    S2_storerh_io %0, 28, %3.isub_hi
+...
diff --git a/llvm/test/CodeGen/Hexagon/isel/select-vec.ll b/llvm/test/CodeGen/Hexagon/isel/select-vec.ll
index 4e54aa4..7073c1a 100644
--- a/llvm/test/CodeGen/Hexagon/isel/select-vec.ll
+++ b/llvm/test/CodeGen/Hexagon/isel/select-vec.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -march=hexagon < %s | FileCheck %s
+; RUN: llc -march=hexagon -hexagon-expand-condsets=0 < %s | FileCheck %s
 
 define <4 x i8> @f0(<4 x i8> %a0, <4 x i8> %a1, i32 %a2) #0 {
 ; CHECK-LABEL: f0:
diff --git a/llvm/test/CodeGen/Hexagon/load-const-extend-opt.ll b/llvm/test/CodeGen/Hexagon/load-const-extend-opt.ll
new file mode 100644
index 0000000..6f9e83c23
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/load-const-extend-opt.ll
@@ -0,0 +1,68 @@
+; RUN: llc -march=hexagon -O3 -hexagon-small-data-threshold=0 < %s | FileCheck %s
+; This test checks the case if there are more than 2 uses of a constan address, move the
+; value in to a register and replace all instances of constant with the register.
+; The GenMemAbsolute pass generates a absolute-set instruction if there are more
+; than 2 uses of this register.
+
+; CHECK: loadi32_3
+; CHECK-NOT: r{{[0-9]+}} = memw(##441652)
+; CHECK-NOT: r{{[0-9]+}} = memw(r{{[0-9]+}}+#0)
+; CHECK:r{{[0-9]+}} = memw(r[[REG:[0-9]+]]=##441652)
+; CHECK-NOT: r{{[0-9]+}} = {emw(##441652)
+; CHECK:r{{[0-9]+}} = memw(r[[REG]]+#0)
+; CHECK-NOT: r{{[0-9]+}} = memw(##441652)
+; CHECK:r{{[0-9]+}} = memw(r[[REG]]+#0)
+; CHECK-NOT: r{{[0-9]+}} = memw(##441652)
+
+define void @loadi32_3() #0 {
+entry:
+  %0 = load volatile i32, ptr inttoptr (i32 441652 to ptr), align 4
+  %1 = load volatile i32, ptr inttoptr (i32 441652 to ptr), align 4
+  %2 = load volatile i32, ptr inttoptr (i32 441652 to ptr), align 4
+  ret void
+}
+
+; CHECK: loadi32_2
+; CHECK-NOT: r{{[0-9]+}} = ##441652
+; CHECK: r{{[0-9]+}} = memw(##441652)
+; CHECK: r{{[0-9]+}} = memw(##441652)
+
+define void @loadi32_2() #0 {
+entry:
+  %0 = load volatile i32, ptr inttoptr (i32 441652 to ptr), align 4
+  %1 = load volatile i32, ptr inttoptr (i32 441652 to ptr), align 4
+  ret void
+}
+
+; CHECK: loadi32_abs_global_3
+; CHECK-NOT: r{{[0-9]+}} = memw(##globalInt)
+; CHECK-NOT: r{{[0-9]+}} = memw(r{{[0-9]+}}+#0)
+; CHECK:r{{[0-9]+}} = memw(r[[REG:[0-9]+]]=##globalInt)
+; CHECK-NOT: r{{[0-9]+}} = memw(##globalInt)
+; CHECK:r{{[0-9]+}} = memw(r[[REG]]+#0)
+; CHECK-NOT: r{{[0-9]+}} = memw(##globalInt)
+; CHECK:r{{[0-9]+}} = memw(r[[REG]]+#0)
+; CHECK-NOT: r{{[0-9]+}} = memw(##globalInt)
+
+@globalInt = external global i32, align 8
+define void @loadi32_abs_global_3() #0 {
+entry:
+  %0 = load volatile i32, ptr @globalInt, align 4
+  %1 = load volatile i32, ptr @globalInt, align 4
+  %2 = load volatile i32, ptr @globalInt, align 4
+  ret void
+}
+
+; CHECK: loadi32_abs_global_2
+; CHECK-NOT:r[[REG:[0-9]+]] = ##globalInt
+; CHECK:r{{[0-9]+}} = memw(##globalInt)
+; CHECK:r{{[0-9]+}} = memw(##globalInt)
+
+define void @loadi32_abs_global_2() #0 {
+entry:
+  %0 = load volatile i32, ptr @globalInt, align 4
+  %1 = load volatile i32, ptr @globalInt, align 4
+  ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/Hexagon/post-inc-vec.mir b/llvm/test/CodeGen/Hexagon/post-inc-vec.mir
new file mode 100644
index 0000000..3788dc3
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/post-inc-vec.mir
@@ -0,0 +1,413 @@
+#RUN: llc -march=hexagon -run-pass hexagon-postincopt %s -o - | FileCheck  %s
+
+# Test that we do not generate two post-increment vector load/store
+# in the loop.
+# CHECK: J2_loop0r
+# CHECK: V6_vS32b_pi
+# CHECK-NOT: = V6_vL32b_pi
+# CHECK: V6_vL32b_ai
+# CHECK: V6_vL32b_ai
+# CHECK: V6_vS32b_ai
+# CHECK: ENDLOOP0
+
+--- |
+  ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+  declare <1024 x i1> @llvm.hexagon.V6.pred.scalar2v2.128B(i32) #0
+
+  ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(write)
+  declare void @llvm.hexagon.V6.vS32b.qpred.ai.128B(<1024 x i1>, ptr, <32 x i32>) #1
+
+  ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+  declare <64 x i32> @llvm.hexagon.V6.vcombine.128B(<32 x i32>, <32 x i32>) #0
+
+  ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+  declare <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32>) #0
+
+  ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+  declare <32 x i32> @llvm.hexagon.V6.lo.128B(<64 x i32>) #0
+
+  ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+  declare <32 x i32> @llvm.hexagon.V6.lvsplath.128B(i32) #0
+
+  ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+  declare <32 x i32> @llvm.hexagon.V6.vasrhbsat.128B(<32 x i32>, <32 x i32>, i32) #0
+
+  ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+  declare <32 x i32> @llvm.hexagon.V6.vaddhsat.128B(<32 x i32>, <32 x i32>) #0
+
+  ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+  declare <32 x i32> @llvm.hexagon.V6.vasrh.128B(<32 x i32>, i32) #0
+
+  ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+  declare <32 x i32> @llvm.hexagon.V6.vavgh.128B(<32 x i32>, <32 x i32>) #0
+
+  ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+  declare <64 x i32> @llvm.hexagon.V6.vmpabusv.128B(<64 x i32>, <64 x i32>) #0
+
+  ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write)
+  declare void @llvm.assume(i1 noundef) #2
+
+  ; Function Attrs: noinline nounwind
+  define void @blah(i32 %0, i32 %1, ptr noalias %2, ptr noalias nocapture readonly %3, ptr noalias nocapture readonly %4, ptr nocapture readnone %5, ptr nocapture readnone %6, i32 %7, i32 %8, ptr nocapture readonly %9, ptr nocapture readonly %10) local_unnamed_addr #3 {
+  entry:
+    %11 = call i32 @llvm.hexagon.S2.extractu(i32 %0, i32 23, i32 9)
+    %12 = shl i32 %11, 7
+    %mul16.i = mul nsw i32 %12, %1
+    %add.i = add nsw i32 %1, 1
+    %mul17.i = mul nsw i32 %add.i, %12
+    %cmp184.i = icmp slt i32 %mul16.i, %mul17.i
+    br i1 %cmp184.i, label %for.body.lr.ph.i, label %for.end.i
+
+  for.body.lr.ph.i:                                 ; preds = %entry
+    %13 = tail call <64 x i32> @llvm.hexagon.V6.vcombine.128B(<32 x i32> <i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576>, <32 x i32> <i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144>) #5
+    %14 = tail call <64 x i32> @llvm.hexagon.V6.vcombine.128B(<32 x i32> zeroinitializer, <32 x i32> zeroinitializer) #5
+    %15 = tail call <32 x i32> @llvm.hexagon.V6.lvsplath.128B(i32 32) #5
+    %cgep = getelementptr i8, ptr %2, i32 %mul16.i
+    %cgep8 = getelementptr i8, ptr %4, i32 %mul16.i
+    %cgep9 = getelementptr i8, ptr %3, i32 %mul16.i
+    br label %for.body.i
+
+  for.body.i:                                       ; preds = %for.body.i, %for.body.lr.ph.i
+    %lsr.iv6 = phi ptr [ %cgep12, %for.body.i ], [ %cgep9, %for.body.lr.ph.i ]
+    %lsr.iv3 = phi ptr [ %cgep11, %for.body.i ], [ %cgep8, %for.body.lr.ph.i ]
+    %lsr.iv = phi ptr [ %cgep10, %for.body.i ], [ %cgep, %for.body.lr.ph.i ]
+    %elemIdx.05.i = phi i32 [ %mul16.i, %for.body.lr.ph.i ], [ %add19.i, %for.body.i ]
+    %16 = load <32 x i32>, ptr %lsr.iv6, align 128
+    %17 = load <32 x i32>, ptr %lsr.iv3, align 128
+    %18 = tail call <64 x i32> @llvm.hexagon.V6.vcombine.128B(<32 x i32> %17, <32 x i32> %16) #5
+    %19 = tail call <64 x i32> @llvm.hexagon.V6.vmpabusv.128B(<64 x i32> %13, <64 x i32> %18) #5
+    %20 = tail call <64 x i32> @llvm.hexagon.V6.vmpabusv.128B(<64 x i32> %14, <64 x i32> %18) #5
+    %21 = tail call <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32> %19) #5
+    %22 = tail call <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32> %20) #5
+    %23 = tail call <32 x i32> @llvm.hexagon.V6.vasrh.128B(<32 x i32> %22, i32 7) #5
+    %24 = tail call <32 x i32> @llvm.hexagon.V6.vavgh.128B(<32 x i32> %21, <32 x i32> %23) #5
+    %25 = tail call <32 x i32> @llvm.hexagon.V6.lo.128B(<64 x i32> %19) #5
+    %26 = tail call <32 x i32> @llvm.hexagon.V6.lo.128B(<64 x i32> %20) #5
+    %27 = tail call <32 x i32> @llvm.hexagon.V6.vasrh.128B(<32 x i32> %26, i32 7) #5
+    %28 = tail call <32 x i32> @llvm.hexagon.V6.vavgh.128B(<32 x i32> %25, <32 x i32> %27) #5
+    %29 = tail call <32 x i32> @llvm.hexagon.V6.vaddhsat.128B(<32 x i32> %24, <32 x i32> %15) #5
+    %30 = tail call <32 x i32> @llvm.hexagon.V6.vaddhsat.128B(<32 x i32> %28, <32 x i32> %15) #5
+    %31 = tail call <32 x i32> @llvm.hexagon.V6.vasrhbsat.128B(<32 x i32> %29, <32 x i32> %30, i32 4) #5
+    store <32 x i32> %31, ptr %lsr.iv, align 128
+    %add19.i = add nsw i32 %elemIdx.05.i, 128
+    %cmp18.i = icmp slt i32 %add19.i, %mul17.i
+    %cgep10 = getelementptr i8, ptr %lsr.iv, i32 128
+    %cgep11 = getelementptr i8, ptr %lsr.iv3, i32 128
+    %cgep12 = getelementptr i8, ptr %lsr.iv6, i32 128
+    br i1 %cmp18.i, label %for.body.i, label %for.end.i
+
+  for.end.i:                                        ; preds = %for.body.i, %entry
+    ret void
+  }
+
+  ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
+  declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #4
+
+  ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+  declare i32 @llvm.hexagon.S2.extractu(i32, i32 immarg, i32 immarg) #0
+
+  attributes #0 = { nocallback nofree nosync nounwind willreturn memory(none) }
+  attributes #1 = { nocallback nofree nosync nounwind willreturn memory(write) }
+  attributes #2 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) }
+  attributes #3 = { noinline nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="hexagonv68" "target-features"="+hvx-length128b,+hvxv68,+v68,-long-calls,-small-data" "unsafe-fp-math"="false" "use-soft-float"="false" }
+  attributes #4 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
+  attributes #5 = { nounwind }
+
+...
+---
+name:            blah
+alignment:       16
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       false
+callsEHReturn:   false
+callsUnwindInit: false
+hasEHCatchret:   false
+hasEHScopes:     false
+hasEHFunclets:   false
+isOutlined:      false
+debugInstrRef:   false
+failsVerification: false
+tracksDebugUserValues: false
+registers:
+  - { id: 0, class: intregs, preferred-register: '' }
+  - { id: 1, class: intregs, preferred-register: '' }
+  - { id: 2, class: hvxwr, preferred-register: '' }
+  - { id: 3, class: hvxwr, preferred-register: '' }
+  - { id: 4, class: hvxvr, preferred-register: '' }
+  - { id: 5, class: intregs, preferred-register: '' }
+  - { id: 6, class: intregs, preferred-register: '' }
+  - { id: 7, class: intregs, preferred-register: '' }
+  - { id: 8, class: intregs, preferred-register: '' }
+  - { id: 9, class: intregs, preferred-register: '' }
+  - { id: 10, class: intregs, preferred-register: '' }
+  - { id: 11, class: intregs, preferred-register: '' }
+  - { id: 12, class: intregs, preferred-register: '' }
+  - { id: 13, class: intregs, preferred-register: '' }
+  - { id: 14, class: intregs, preferred-register: '' }
+  - { id: 15, class: intregs, preferred-register: '' }
+  - { id: 16, class: intregs, preferred-register: '' }
+  - { id: 17, class: intregs, preferred-register: '' }
+  - { id: 18, class: intregs, preferred-register: '' }
+  - { id: 19, class: intregs, preferred-register: '' }
+  - { id: 20, class: intregs, preferred-register: '' }
+  - { id: 21, class: intregs, preferred-register: '' }
+  - { id: 22, class: intregs, preferred-register: '' }
+  - { id: 23, class: intregs, preferred-register: '' }
+  - { id: 24, class: intregs, preferred-register: '' }
+  - { id: 25, class: predregs, preferred-register: '' }
+  - { id: 26, class: predregs, preferred-register: '' }
+  - { id: 27, class: hvxvr, preferred-register: '' }
+  - { id: 28, class: intregs, preferred-register: '' }
+  - { id: 29, class: hvxvr, preferred-register: '' }
+  - { id: 30, class: intregs, preferred-register: '' }
+  - { id: 31, class: hvxvr, preferred-register: '' }
+  - { id: 32, class: intregs, preferred-register: '' }
+  - { id: 33, class: hvxvr, preferred-register: '' }
+  - { id: 34, class: hvxvr, preferred-register: '' }
+  - { id: 35, class: hvxwr, preferred-register: '' }
+  - { id: 36, class: hvxwr, preferred-register: '' }
+  - { id: 37, class: hvxwr, preferred-register: '' }
+  - { id: 38, class: hvxvr, preferred-register: '' }
+  - { id: 39, class: hvxvr, preferred-register: '' }
+  - { id: 40, class: intregs, preferred-register: '' }
+  - { id: 41, class: hvxvr, preferred-register: '' }
+  - { id: 42, class: hvxvr, preferred-register: '' }
+  - { id: 43, class: hvxvr, preferred-register: '' }
+  - { id: 44, class: hvxvr, preferred-register: '' }
+  - { id: 45, class: hvxvr, preferred-register: '' }
+  - { id: 46, class: hvxvr, preferred-register: '' }
+  - { id: 47, class: hvxvr, preferred-register: '' }
+  - { id: 48, class: hvxvr, preferred-register: '' }
+  - { id: 49, class: intregslow8, preferred-register: '' }
+  - { id: 50, class: hvxvr, preferred-register: '' }
+  - { id: 51, class: predregs, preferred-register: '' }
+  - { id: 52, class: intregs, preferred-register: '' }
+  - { id: 53, class: intregs, preferred-register: '' }
+  - { id: 54, class: intregs, preferred-register: '' }
+  - { id: 55, class: intregs, preferred-register: '' }
+  - { id: 56, class: intregs, preferred-register: '' }
+  - { id: 57, class: intregs, preferred-register: '' }
+  - { id: 58, class: intregs, preferred-register: '' }
+  - { id: 59, class: intregs, preferred-register: '' }
+  - { id: 60, class: intregs, preferred-register: '' }
+  - { id: 61, class: hvxvr, preferred-register: '' }
+  - { id: 62, class: intregs, preferred-register: '' }
+  - { id: 63, class: hvxvr, preferred-register: '' }
+  - { id: 64, class: intregs, preferred-register: '' }
+  - { id: 65, class: hvxwr, preferred-register: '' }
+  - { id: 66, class: hvxwr, preferred-register: '' }
+  - { id: 67, class: hvxwr, preferred-register: '' }
+  - { id: 68, class: hvxvr, preferred-register: '' }
+  - { id: 69, class: hvxvr, preferred-register: '' }
+  - { id: 70, class: hvxvr, preferred-register: '' }
+  - { id: 71, class: hvxvr, preferred-register: '' }
+  - { id: 72, class: hvxvr, preferred-register: '' }
+  - { id: 73, class: hvxvr, preferred-register: '' }
+  - { id: 74, class: hvxvr, preferred-register: '' }
+  - { id: 75, class: intregs, preferred-register: '' }
+  - { id: 76, class: intregs, preferred-register: '' }
+  - { id: 77, class: intregs, preferred-register: '' }
+  - { id: 78, class: intregs, preferred-register: '' }
+  - { id: 79, class: hvxvr, preferred-register: '' }
+  - { id: 80, class: intregs, preferred-register: '' }
+  - { id: 81, class: hvxvr, preferred-register: '' }
+  - { id: 82, class: intregs, preferred-register: '' }
+  - { id: 83, class: hvxwr, preferred-register: '' }
+  - { id: 84, class: hvxwr, preferred-register: '' }
+  - { id: 85, class: hvxwr, preferred-register: '' }
+  - { id: 86, class: hvxvr, preferred-register: '' }
+  - { id: 87, class: hvxvr, preferred-register: '' }
+  - { id: 88, class: hvxvr, preferred-register: '' }
+  - { id: 89, class: hvxvr, preferred-register: '' }
+  - { id: 90, class: hvxvr, preferred-register: '' }
+  - { id: 91, class: hvxvr, preferred-register: '' }
+  - { id: 92, class: hvxvr, preferred-register: '' }
+  - { id: 93, class: intregs, preferred-register: '' }
+  - { id: 94, class: intregs, preferred-register: '' }
+  - { id: 95, class: intregs, preferred-register: '' }
+  - { id: 96, class: intregs, preferred-register: '' }
+  - { id: 97, class: predregs, preferred-register: '' }
+  - { id: 98, class: predregs, preferred-register: '' }
+liveins:
+  - { reg: '$r0', virtual-reg: '%16' }
+  - { reg: '$r1', virtual-reg: '%17' }
+  - { reg: '$r2', virtual-reg: '%18' }
+  - { reg: '$r3', virtual-reg: '%19' }
+  - { reg: '$r4', virtual-reg: '%20' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    4
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  functionContext: ''
+  maxCallFrameSize: 4294967295
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  hasTailCall:     false
+  localFrameSize:  0
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:
+  - { id: 0, type: default, offset: 24, size: 4, alignment: 8, stack-id: default,
+      isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 1, type: default, offset: 20, size: 4, alignment: 4, stack-id: default,
+      isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 2, type: default, offset: 16, size: 4, alignment: 8, stack-id: default,
+      isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 3, type: default, offset: 12, size: 4, alignment: 4, stack-id: default,
+      isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 4, type: default, offset: 8, size: 4, alignment: 8, stack-id: default,
+      isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+stack:           []
+entry_values:    []
+callSites:       []
+debugValueSubstitutions: []
+constants:       []
+machineFunctionInfo: {}
+body:             |
+  bb.0.entry:
+    successors: %bb.1(0x40000000), %bb.3(0x40000000)
+    liveins: $r0, $r1, $r2, $r3, $r4
+
+    %20:intregs = COPY $r4
+    %19:intregs = COPY $r3
+    %18:intregs = COPY $r2
+    %17:intregs = COPY $r1
+    %16:intregs = COPY $r0
+    %22:intregs = S2_extractu %16, 23, 9
+    %23:intregs = S2_asl_i_r %22, 7
+    %0:intregs = nsw M2_mpyi %23, %17
+    %24:intregs = nsw A2_addi %17, 1
+    %1:intregs = nsw M2_mpyi %24, %23
+    %25:predregs = C2_cmpgt %1, %0
+    J2_jumpf %25, %bb.3, implicit-def dead $pc
+    J2_jump %bb.1, implicit-def dead $pc
+
+  bb.1.for.body.lr.ph.i:
+    successors: %bb.4(0x40000000), %bb.6(0x40000000)
+
+    %28:intregs = A2_tfrsi 269488144
+    %27:hvxvr = V6_lvsplatw %28
+    %30:intregs = A2_tfrsi 1077952576
+    %29:hvxvr = V6_lvsplatw %30
+    %2:hvxwr = REG_SEQUENCE %29, %subreg.vsub_hi, %27, %subreg.vsub_lo
+    %31:hvxvr = V6_vd0
+    %3:hvxwr = REG_SEQUENCE %31, %subreg.vsub_hi, %31, %subreg.vsub_lo
+    %32:intregs = A2_tfrsi 32
+    %4:hvxvr = V6_lvsplath %32
+    %5:intregs = A2_add %18, %0
+    %6:intregs = A2_add %20, %0
+    %7:intregs = A2_add %19, %0
+    %40:intregs = A2_tfrsi 7
+    %49:intregslow8 = A2_tfrsi 4
+    %52:intregs = A2_sub %1, %0
+    %53:intregs = A2_addi %52, 127
+    %54:intregs = S2_lsr_i_r %53, 7
+    %55:intregs = COPY %54
+    %56:intregs = S2_lsr_i_r %55, 1
+    %57:intregs = A2_andir %55, 1
+    %97:predregs = C2_cmpgtui %56, 0
+    J2_jumpf %97, %bb.6, implicit-def $pc
+    J2_jump %bb.4, implicit-def $pc
+
+  bb.4:
+    successors: %bb.5(0x80000000)
+
+    J2_loop0r %bb.5, %56, implicit-def $lc0, implicit-def $sa0, implicit-def $usr
+    J2_jump %bb.5, implicit-def $pc
+
+  bb.5:
+    successors: %bb.5(0x40000000), %bb.6(0x40000000)
+
+    %58:intregs = PHI %7, %bb.4, %80, %bb.5
+    %59:intregs = PHI %6, %bb.4, %82, %bb.5
+    %60:intregs = PHI %5, %bb.4, %93, %bb.5
+    %61:hvxvr, %62:intregs = V6_vL32b_pi %58, 128 :: (load (s1024) from %ir.lsr.iv6)
+    %63:hvxvr, %64:intregs = V6_vL32b_pi %59, 128 :: (load (s1024) from %ir.lsr.iv3)
+    %65:hvxwr = REG_SEQUENCE %63, %subreg.vsub_hi, %61, %subreg.vsub_lo
+    %66:hvxwr = V6_vmpabusv %2, %65
+    %67:hvxwr = V6_vmpabusv %3, %65
+    %68:hvxvr = V6_vasrh %67.vsub_hi, %40
+    %69:hvxvr = V6_vavgh %66.vsub_hi, %68
+    %70:hvxvr = V6_vasrh %67.vsub_lo, %40
+    %71:hvxvr = V6_vavgh %66.vsub_lo, %70
+    %72:hvxvr = V6_vaddhsat %69, %4
+    %73:hvxvr = V6_vaddhsat %71, %4
+    %74:hvxvr = V6_vasrhbsat %72, %73, %49
+    %75:intregs = V6_vS32b_pi %60, 128, %74 :: (store (s1024) into %ir.lsr.iv)
+    %79:hvxvr, %80:intregs = V6_vL32b_pi %62, 128 :: (load (s1024) from %ir.lsr.iv6 + 128)
+    %81:hvxvr, %82:intregs = V6_vL32b_pi %64, 128 :: (load (s1024) from %ir.lsr.iv3 + 128)
+    %83:hvxwr = REG_SEQUENCE %81, %subreg.vsub_hi, %79, %subreg.vsub_lo
+    %84:hvxwr = V6_vmpabusv %2, %83
+    %85:hvxwr = V6_vmpabusv %3, %83
+    %86:hvxvr = V6_vasrh %85.vsub_hi, %40
+    %87:hvxvr = V6_vavgh %84.vsub_hi, %86
+    %88:hvxvr = V6_vasrh %85.vsub_lo, %40
+    %89:hvxvr = V6_vavgh %84.vsub_lo, %88
+    %90:hvxvr = V6_vaddhsat %87, %4
+    %91:hvxvr = V6_vaddhsat %89, %4
+    %92:hvxvr = V6_vasrhbsat %90, %91, %49
+    %93:intregs = V6_vS32b_pi %75, 128, %92 :: (store (s1024) into %ir.lsr.iv + 128)
+    ENDLOOP0 %bb.5, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0
+    J2_jump %bb.6, implicit-def $pc
+
+  bb.6:
+    successors: %bb.7(0x40000000), %bb.8(0x40000000)
+
+    %94:intregs = PHI %7, %bb.1, %80, %bb.5
+    %95:intregs = PHI %6, %bb.1, %82, %bb.5
+    %96:intregs = PHI %5, %bb.1, %93, %bb.5
+    %98:predregs = C2_cmpgtui %57, 0
+    J2_jumpf %98, %bb.8, implicit-def $pc
+    J2_jump %bb.7, implicit-def $pc
+
+  bb.7:
+    successors: %bb.2(0x80000000)
+
+    J2_jump %bb.2, implicit-def $pc
+
+  bb.2.for.body.i (machine-block-address-taken):
+    successors: %bb.8(0x04000000)
+
+    %33:hvxvr, %15:intregs = V6_vL32b_pi %94, 128 :: (load (s1024) from %ir.lsr.iv6)
+    %34:hvxvr, %14:intregs = V6_vL32b_pi %95, 128 :: (load (s1024) from %ir.lsr.iv3)
+    %35:hvxwr = REG_SEQUENCE %34, %subreg.vsub_hi, %33, %subreg.vsub_lo
+    %36:hvxwr = V6_vmpabusv %2, %35
+    %37:hvxwr = V6_vmpabusv %3, %35
+    %41:hvxvr = V6_vasrh %37.vsub_hi, %40
+    %42:hvxvr = V6_vavgh %36.vsub_hi, %41
+    %45:hvxvr = V6_vasrh %37.vsub_lo, %40
+    %46:hvxvr = V6_vavgh %36.vsub_lo, %45
+    %47:hvxvr = V6_vaddhsat %42, %4
+    %48:hvxvr = V6_vaddhsat %46, %4
+    %50:hvxvr = V6_vasrhbsat %47, %48, %49
+    %13:intregs = V6_vS32b_pi %96, 128, %50 :: (store (s1024) into %ir.lsr.iv)
+    J2_jump %bb.8, implicit-def $pc
+
+  bb.8:
+    successors: %bb.3(0x80000000)
+
+    J2_jump %bb.3, implicit-def $pc
+
+  bb.3.for.end.i:
+    PS_jmpret $r31, implicit-def dead $pc
+
+...
diff --git a/llvm/test/CodeGen/Hexagon/post_inc_store.mir b/llvm/test/CodeGen/Hexagon/post_inc_store.mir
new file mode 100644
index 0000000..3e3f51a
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/post_inc_store.mir
@@ -0,0 +1,168 @@
+#RUN: llc -march=hexagon -run-pass hexagon-postincopt %s -o - | FileCheck  %s
+
+# Test that we convert a post-inc load and store to a regular load and post-inc
+# store.
+# CHECK: J2_loop0r
+# CHECK-NOT: = L2_loadruh_pi
+# CHECK: L2_loadruh_io
+# CHECK: S2_storerh_pi
+# CHECK: ENDLOOP0
+
+--- |
+  ; Function Attrs: nofree norecurse nounwind
+  define dso_local void @blam(i32 %arg, ptr nocapture %arg1, i16 signext %arg2) local_unnamed_addr #0 {
+  bb:
+    %icmp = icmp eq i32 %arg, 0
+    br i1 %icmp, label %bb13, label %bb3
+
+  bb3:                                              ; preds = %bb, %bb10
+    %phi = phi i32 [ %add11, %bb10 ], [ 0, %bb ]
+    %mul = mul i32 %phi, %arg
+    %cgep = getelementptr i16, ptr %arg1, i32 %mul
+    br label %bb4
+
+  bb4:                                              ; preds = %bb4, %bb3
+    %lsr.iv = phi i32 [ %lsr.iv.next, %bb4 ], [ %arg, %bb3 ]
+    %phi5 = phi ptr [ %cgep, %bb3 ], [ %cgep1, %bb4 ]
+    %load = load i16, ptr %phi5, align 2
+    %add = add i16 %load, %arg2
+    store i16 %add, ptr %phi5, align 2
+    %lsr.iv.next = add i32 %lsr.iv, -1
+    %icmp8 = icmp eq i32 %lsr.iv.next, 0
+    %cgep1 = getelementptr i16, ptr %phi5, i32 1
+    br i1 %icmp8, label %bb10, label %bb4
+
+  bb10:                                             ; preds = %bb4
+    %add11 = add nuw i32 %phi, 1
+    %icmp12 = icmp eq i32 %add11, %arg
+    br i1 %icmp12, label %bb13, label %bb3
+
+  bb13:                                             ; preds = %bb10, %bb
+    ret void
+  }
+
+  attributes #0 = { nofree norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="hexagonv68" "target-features"="+v68,-long-calls" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+...
+---
+name:            blam
+alignment:       16
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       false
+callsEHReturn:   false
+callsUnwindInit: false
+hasEHCatchret:   false
+hasEHScopes:     false
+hasEHFunclets:   false
+isOutlined:      false
+debugInstrRef:   false
+failsVerification: false
+tracksDebugUserValues: false
+registers:
+  - { id: 0, class: intregs, preferred-register: '' }
+  - { id: 1, class: intregs, preferred-register: '' }
+  - { id: 2, class: intregs, preferred-register: '' }
+  - { id: 3, class: intregs, preferred-register: '' }
+  - { id: 4, class: intregs, preferred-register: '' }
+  - { id: 5, class: intregs, preferred-register: '' }
+  - { id: 6, class: intregs, preferred-register: '' }
+  - { id: 7, class: intregs, preferred-register: '' }
+  - { id: 8, class: intregs, preferred-register: '' }
+  - { id: 9, class: intregs, preferred-register: '' }
+  - { id: 10, class: intregs, preferred-register: '' }
+  - { id: 11, class: intregs, preferred-register: '' }
+  - { id: 12, class: predregs, preferred-register: '' }
+  - { id: 13, class: intregs, preferred-register: '' }
+  - { id: 14, class: intregs, preferred-register: '' }
+  - { id: 15, class: intregs, preferred-register: '' }
+  - { id: 16, class: predregs, preferred-register: '' }
+  - { id: 17, class: predregs, preferred-register: '' }
+  - { id: 18, class: predregs, preferred-register: '' }
+  - { id: 19, class: predregs, preferred-register: '' }
+  - { id: 20, class: intregs, preferred-register: '' }
+  - { id: 21, class: intregs, preferred-register: '' }
+liveins:
+  - { reg: '$r0', virtual-reg: '%7' }
+  - { reg: '$r1', virtual-reg: '%8' }
+  - { reg: '$r2', virtual-reg: '%9' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    1
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  functionContext: ''
+  maxCallFrameSize: 4294967295
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  hasTailCall:     false
+  localFrameSize:  0
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:      []
+stack:           []
+entry_values:    []
+callSites:       []
+debugValueSubstitutions: []
+constants:       []
+machineFunctionInfo: {}
+body:             |
+  bb.0.bb:
+    successors: %bb.4(0x30000000), %bb.5(0x50000000)
+    liveins: $r0, $r1, $r2
+
+    %9:intregs = COPY $r2
+    %8:intregs = COPY $r1
+    %7:intregs = COPY $r0
+    %21:intregs = COPY %7
+    %20:intregs = COPY %7
+    %12:predregs = C2_cmpeqi %7, 0
+    J2_jumpt %12, %bb.4, implicit-def $pc
+
+  bb.5:
+    successors: %bb.1(0x80000000)
+
+    %11:intregs = A2_tfrsi 0
+    J2_loop1r %bb.1, %21, implicit-def $lc1, implicit-def $sa1
+
+  bb.1.bb3 (machine-block-address-taken):
+    successors: %bb.2(0x80000000)
+
+    %0:intregs = PHI %11, %bb.5, %6, %bb.3
+    %13:intregs = M2_mpyi %0, %7
+    %1:intregs = S2_addasl_rrri %8, %13, 1
+    J2_loop0r %bb.2, %20, implicit-def $lc0, implicit-def $sa0, implicit-def $usr
+
+  bb.2.bb4 (machine-block-address-taken):
+    successors: %bb.3(0x04000000), %bb.2(0x7c000000)
+
+    %3:intregs = PHI %1, %bb.1, %5, %bb.2
+    %14:intregs = L2_loadruh_io %3, 0 :: (load (s16) from %ir.phi5)
+    %15:intregs = A2_add %14, %9
+    %5:intregs = S2_storerh_pi %3, 2, %15 :: (store (s16) into %ir.phi5)
+    ENDLOOP0 %bb.2, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0
+    J2_jump %bb.3, implicit-def dead $pc
+
+  bb.3.bb10:
+    successors: %bb.4(0x04000000), %bb.1(0x7c000000)
+
+    %6:intregs = nuw A2_addi %0, 1
+    ENDLOOP1 %bb.1, implicit-def $pc, implicit-def $lc1, implicit $sa1, implicit $lc1
+    J2_jump %bb.4, implicit-def dead $pc
+
+  bb.4.bb13:
+    PS_jmpret $r31, implicit-def dead $pc
+
+...
diff --git a/llvm/test/CodeGen/Hexagon/postincopt-crash.mir b/llvm/test/CodeGen/Hexagon/postincopt-crash.mir
new file mode 100644
index 0000000..e220534
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/postincopt-crash.mir
@@ -0,0 +1,58 @@
+# RUN: llc -march=hexagon -run-pass=hexagon-postincopt %s -o /dev/null
+# REQUIRES: asserts
+# Test that we do not hit unreachable code dealt with L4_ior_memoph_io.
+
+...
+---
+name:            foo
+alignment:       4
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    successors: %bb.4(0x30000000), %bb.5(0x50000000)
+    liveins: $r0, $r1, $r2
+
+    %9:intregs = COPY $r2
+    %8:intregs = COPY $r1
+    %7:intregs = COPY $r0
+    %21:intregs = COPY %7
+    %20:intregs = COPY %7
+    %12:predregs = C2_cmpeqi %7, 0
+    J2_jumpt %12, %bb.4, implicit-def $pc
+
+  bb.5:
+    successors: %bb.1(0x80000000)
+
+    %11:intregs = A2_tfrsi 0
+    J2_loop1r %bb.1, %21, implicit-def $lc1, implicit-def $sa1
+
+  bb.1:
+    successors: %bb.2(0x80000000)
+
+    %0:intregs = PHI %11, %bb.5, %6, %bb.3
+    %13:intregs = M2_mpyi %0, %7
+    %1:intregs = S2_addasl_rrri %8, %13, 1
+    J2_loop0r %bb.2, %20, implicit-def $lc0, implicit-def $sa0, implicit-def $usr
+
+  bb.2:
+    successors: %bb.3(0x04000000), %bb.2(0x7c000000)
+
+    %3:intregs = PHI %1, %bb.1, %5, %bb.2
+    %14:intregs = L2_loadruh_io %3, 0
+    L4_ior_memoph_io %3:intregs, 0, 21
+    %15:intregs = A2_add %14, %9
+    %5:intregs = S2_storerh_pi %3, 2, %15
+    ENDLOOP0 %bb.2, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0
+    J2_jump %bb.3, implicit-def dead $pc
+
+  bb.3:
+    successors: %bb.4(0x04000000), %bb.1(0x7c000000)
+
+    %6:intregs = nuw A2_addi %0, 1
+    ENDLOOP1 %bb.1, implicit-def $pc, implicit-def $lc1, implicit $sa1, implicit $lc1
+    J2_jump %bb.4, implicit-def dead $pc
+
+  bb.4:
+    PS_jmpret $r31, implicit-def dead $pc
+
+...
diff --git a/llvm/test/CodeGen/Hexagon/postincopt-dcfetch.mir b/llvm/test/CodeGen/Hexagon/postincopt-dcfetch.mir
new file mode 100644
index 0000000..27d653c9
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/postincopt-dcfetch.mir
@@ -0,0 +1,19 @@
+# RUN: llc -march=hexagon -run-pass hexagon-postincopt %s -o - | FileCheck %s
+# Check that this doesn't crash.
+# CHECK: Y2_dcfetchbo
+
+name: fred
+tracksRegLiveness: true
+body: |
+  bb.0:
+    successors: %bb.1
+    %0:intregs = IMPLICIT_DEF
+
+  bb.1:
+    successors: %bb.1
+
+    %1:intregs = PHI %0:intregs, %bb.0, %2:intregs, %bb.1
+    Y2_dcfetchbo %1:intregs, 0
+    %2:intregs = A2_addi %1:intregs, 1
+    J2_jump %bb.1, implicit-def dead $pc
+...
diff --git a/llvm/test/CodeGen/Hexagon/reg-by-name.ll b/llvm/test/CodeGen/Hexagon/reg-by-name.ll
index 4abea83..cc8807e 100644
--- a/llvm/test/CodeGen/Hexagon/reg-by-name.ll
+++ b/llvm/test/CodeGen/Hexagon/reg-by-name.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=hexagon < %s | FileCheck %s
+; RUN: llc -march=hexagon -hexagon-tfr-cleanup=0 < %s | FileCheck %s
 
 target triple = "hexagon"
 
@@ -647,7 +647,7 @@ entry:
   ret i32 %1
 }
 
-attributes #0 = { noinline nounwind optnone "target-cpu"="hexagonv62" }
+attributes #0 = { noinline nounwind optnone "target-cpu"="hexagonv73" }
 attributes #1 = { nounwind }
 attributes #2 = { nounwind readonly }
 
diff --git a/llvm/test/CodeGen/Hexagon/store-const-extend-opt.ll b/llvm/test/CodeGen/Hexagon/store-const-extend-opt.ll
new file mode 100644
index 0000000..dccf176
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/store-const-extend-opt.ll
@@ -0,0 +1,72 @@
+; RUN: llc -march=hexagon -O3 -hexagon-small-data-threshold=0 < %s | FileCheck %s
+; This test checks the case if there are more than 2 uses of a constan address, move the
+; value in to a register and replace all instances of constant with the register.
+; The GenMemAbsolute pass generates a absolute-set instruction if there are more
+; than 2 uses of this register.
+
+; CHECK: storetrunci32_3
+; CHECK-NOT: memw(##441652) = r{{[0-9]+}}
+; CHECK-NOT: memw(r{{[0-9]+}}+#0) = r{{[0-9]+}}
+; CHECK:memw(r[[REG:[0-9]+]]=##441652) = r{{[0-9]+}}
+; CHECK-NOT: memw(##441652) = r{{[0-9]+}}
+; CHECK:memw(r[[REG]]+#0) = r{{[0-9]+}}
+; CHECK-NOT: memw(##441652) = r{{[0-9]+}}
+; CHECK:memw(r[[REG]]+#0) = r{{[0-9]+}}
+; CHECK-NOT: memw(##441652) = r{{[0-9]+}}
+
+define void @storetrunci32_3(i64 %descr_addr, i32 %rpm_or_sys, i32 %kkr) #0 {
+entry:
+  %conv = trunc i64 %descr_addr to i32
+  store volatile i32 %conv, ptr inttoptr (i32 441652 to ptr), align 4
+  store volatile i32 %rpm_or_sys, ptr inttoptr (i32 441652 to ptr), align 4
+  store volatile i32 %kkr, ptr inttoptr (i32 441652 to ptr), align 4
+  ret void
+}
+
+; CHECK: storetrunci32_2
+; CHECK-NOT: r{{[0-9]+}} = ##441652
+; CHECK: memw(##441652) = r{{[0-9]+}}
+; CHECK: memw(##441652) = r{{[0-9]+}}
+
+define void @storetrunci32_2(i64 %descr_addr, i32 %rpm_or_sys) #0 {
+entry:
+  %conv = trunc i64 %descr_addr to i32
+  store volatile i32 %conv, ptr inttoptr (i32 441652 to ptr), align 4
+  store volatile i32 %rpm_or_sys, ptr inttoptr (i32 441652 to ptr), align 4
+  ret void
+}
+
+; CHECK: storetrunci32_abs_global_3
+; CHECK-NOT: memw(##globalInt) = r{{[0-9]+}}
+; CHECK-NOT: memw(r{{[0-9]+}}+#0) = r{{[0-9]+}}
+; CHECK:memw(r[[REG:[0-9]+]]=##globalInt) = r{{[0-9]+}}
+; CHECK-NOT: memw(##globalInt) = r{{[0-9]+}}
+; CHECK:memw(r[[REG]]+#0) = r{{[0-9]+}}
+; CHECK-NOT: memw(##globalInt) = r{{[0-9]+}}
+; CHECK:memw(r[[REG]]+#0) = r{{[0-9]+}}
+; CHECK-NOT: memw(##globalInt) = r{{[0-9]+}}
+
+@globalInt = external global i32, align 8
+define void @storetrunci32_abs_global_3(i64 %descr_addr, i32 %rpm_or_sys, i32 %kkr) #0 {
+entry:
+  %conv = trunc i64 %descr_addr to i32
+  store volatile i32 %conv, ptr @globalInt, align 4
+  store volatile i32 %rpm_or_sys, ptr @globalInt, align 4
+  store volatile i32 %kkr, ptr @globalInt, align 4
+  ret void
+}
+
+; CHECK: storetrunci32_abs_global_2
+; CHECK-NOT:r[[REG:[0-9]+]] = ##globalInt
+; CHECK:memw(##globalInt) = r{{[0-9]+}}
+; CHECK:memw(##globalInt) = r{{[0-9]+}}
+
+define void @storetrunci32_abs_global_2(i64 %descr_addr, i32 %rpm_or_sys) #0 {
+entry:
+  %conv = trunc i64 %descr_addr to i32
+  store volatile i32 %conv, ptr @globalInt, align 4
+  store volatile i32 %rpm_or_sys, ptr @globalInt, align 4
+  ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/Hexagon/tfr-slotindex.ll b/llvm/test/CodeGen/Hexagon/tfr-slotindex.ll
new file mode 100644
index 0000000..cebba94
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/tfr-slotindex.ll
@@ -0,0 +1,26 @@
+; Check that after tfr-cleanup COPY to $r0 is converted to tfrsi instruction
+; The tfrst instruction must use the same slot index as the COPY instruction
+; to avoid breaking live interval information.
+; Check that there is no machine verifier crash
+
+; RUN: llc -stop-after=tfr-cleanup -verify-machineinstrs %s -o - | FileCheck %s
+
+; CHECK: $r0 = A2_tfrsi 34767
+
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+target triple = "hexagon"
+
+; Function Attrs: nounwind optsize
+define dso_local i32 @foo() local_unnamed_addr #0 {
+entry:
+  call void @bar(i32 34767) #1
+  call void @baz(i32 34767) #1
+  ret i32 15
+}
+
+declare void @bar(i32) local_unnamed_addr
+
+declare void @baz(i32) local_unnamed_addr
+
+attributes #0 = { nounwind optsize "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="hexagonv68" "target-features"="+v68,-long-calls" }
+attributes #1 = { noduplicate nomerge nounwind }
diff --git a/llvm/test/CodeGen/Hexagon/valid-offset-loadbsw4.mir b/llvm/test/CodeGen/Hexagon/valid-offset-loadbsw4.mir
new file mode 100644
index 0000000..fca42d5
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/valid-offset-loadbsw4.mir
@@ -0,0 +1,32 @@
+# RUN: llc -march=hexagon -run-pass hexagon-postincopt -o - %s | FileCheck %s
+# REQUIRES: asserts
+
+# Check that this doesn't crash:
+# CHECK: L2_loadbsw4_io
+
+---
+name: fred
+tracksRegLiveness: true
+liveins:
+  - { reg: '$r0', virtual-reg: '%0' }
+body: |
+  bb.0:
+    successors: %bb.1(0x80000000)
+    liveins: $r0
+
+    %0:intregs = COPY $r0
+    %1:intregs = A2_tfrsi 240
+    %2:doubleregs = IMPLICIT_DEF
+    %3:doubleregs = IMPLICIT_DEF
+
+  bb.1:
+    successors: %bb.1(0x80000000)
+
+    %4:intregs = PHI %1, %bb.0, %5, %bb.1
+    %6:doubleregs = L2_loadbsw4_io %4, 0
+    %7:doubleregs = M2_vrmac_s0 %2, %6, %3
+    S2_storeri_io %0, 0, %7.isub_lo
+    %5:intregs = nuw A2_addi %4, 256
+    J2_jump %bb.1, implicit-def dead $pc
+
+...
diff --git a/llvm/test/CodeGen/LoongArch/intrinsic-memcpy.ll b/llvm/test/CodeGen/LoongArch/intrinsic-memcpy.ll
new file mode 100644
index 0000000..0945300
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/intrinsic-memcpy.ll
@@ -0,0 +1,34 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc --mtriple=loongarch64 < %s | FileCheck %s
+
+%Box = type [6 x i64]
+
+define void @box(ptr noalias nocapture noundef writeonly sret(%Box) align 16 dereferenceable(48) %b, i64 %i) {
+; CHECK-LABEL: box:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi.d $sp, $sp, -96
+; CHECK-NEXT:    .cfi_def_cfa_offset 96
+; CHECK-NEXT:    slli.d $a2, $a1, 5
+; CHECK-NEXT:    alsl.d $a1, $a1, $a2, 4
+; CHECK-NEXT:    addi.d $a2, $sp, 0
+; CHECK-NEXT:    add.d $a3, $a2, $a1
+; CHECK-NEXT:    ldx.d $a1, $a1, $a2
+; CHECK-NEXT:    st.d $a1, $a0, 0
+; CHECK-NEXT:    ld.d $a1, $a3, 40
+; CHECK-NEXT:    st.d $a1, $a0, 40
+; CHECK-NEXT:    ld.d $a1, $a3, 32
+; CHECK-NEXT:    st.d $a1, $a0, 32
+; CHECK-NEXT:    ld.d $a1, $a3, 24
+; CHECK-NEXT:    st.d $a1, $a0, 24
+; CHECK-NEXT:    ld.d $a1, $a3, 16
+; CHECK-NEXT:    st.d $a1, $a0, 16
+; CHECK-NEXT:    ori $a1, $a3, 8
+; CHECK-NEXT:    ld.d $a1, $a1, 0
+; CHECK-NEXT:    st.d $a1, $a0, 8
+; CHECK-NEXT:    addi.d $sp, $sp, 96
+; CHECK-NEXT:    ret
+  %1 = alloca [2 x %Box], align 16
+  %2 = getelementptr inbounds [2 x %Box], ptr %1, i64 0, i64 %i
+  call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) %b, ptr noundef nonnull align 16 dereferenceable(48) %2, i64 48, i1 false)
+  ret void
+}
diff --git a/llvm/test/CodeGen/NVPTX/param-load-store.ll b/llvm/test/CodeGen/NVPTX/param-load-store.ll
index c14dc88..a29d4e1 100644
--- a/llvm/test/CodeGen/NVPTX/param-load-store.ll
+++ b/llvm/test/CodeGen/NVPTX/param-load-store.ll
@@ -1135,31 +1135,86 @@ define %s_i8i32x4 @test_s_i1i32x4(%s_i8i32x4 %a) {
 ; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+2];
 ; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+1];
 ; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0];
-; --- TODO
-; --- Unaligned parameter store/ return value load is broken in both nvcc
-; --- and llvm and needs to be fixed.
 ; CHECK:        .param .align 1 .b8 param0[25];
-; CHECK-DAG:        st.param.b32    [param0+0],
-; CHECK-DAG:        st.param.b32    [param0+4],
+; CHECK-DAG:        st.param.b8     [param0+0],
+; CHECK-DAG:        st.param.b8     [param0+1],
+; CHECK-DAG:        st.param.b8     [param0+2],
+; CHECK-DAG:        st.param.b8     [param0+3],
+; CHECK-DAG:        st.param.b8     [param0+4],
+; CHECK-DAG:        st.param.b8     [param0+5],
+; CHECK-DAG:        st.param.b8     [param0+6],
+; CHECK-DAG:        st.param.b8     [param0+7],
 ; CHECK-DAG:        st.param.b8     [param0+8],
-; CHECK-DAG:        st.param.b32    [param0+9],
-; CHECK-DAG:        st.param.b32    [param0+13],
-; CHECK-DAG:        st.param.b64    [param0+17],
+; CHECK-DAG:        st.param.b8     [param0+9],
+; CHECK-DAG:        st.param.b8     [param0+10],
+; CHECK-DAG:        st.param.b8     [param0+11],
+; CHECK-DAG:        st.param.b8     [param0+12],
+; CHECK-DAG:        st.param.b8     [param0+13],
+; CHECK-DAG:        st.param.b8     [param0+14],
+; CHECK-DAG:        st.param.b8     [param0+15],
+; CHECK-DAG:        st.param.b8     [param0+16],
+; CHECK-DAG:        st.param.b8     [param0+17],
+; CHECK-DAG:        st.param.b8     [param0+18],
+; CHECK-DAG:        st.param.b8     [param0+19],
+; CHECK-DAG:        st.param.b8     [param0+20],
+; CHECK-DAG:        st.param.b8     [param0+21],
+; CHECK-DAG:        st.param.b8     [param0+22],
+; CHECK-DAG:        st.param.b8     [param0+23],
+; CHECK-DAG:        st.param.b8     [param0+24],
 ; CHECK:            .param .align 1 .b8 retval0[25];
 ; CHECK:            call.uni (retval0),
 ; CHECK-NEXT:       test_s_i1i32x4p,
-; CHECK-DAG:        ld.param.b32    %r41, [retval0+0];
-; CHECK-DAG:        ld.param.b32    %r42, [retval0+4];
-; CHECK-DAG:        ld.param.b8     %rs2, [retval0+8];
-; CHECK-DAG:        ld.param.b32    %r43, [retval0+9];
-; CHECK-DAG:        ld.param.b32    %r44, [retval0+13];
-; CHECK-DAG:        ld.param.b64    %rd23, [retval0+17];
-; CHECK-DAG:        st.param.b32    [func_retval0+0],
-; CHECK-DAG:        st.param.b32    [func_retval0+4],
+; CHECK-DAG:        ld.param.b8 %rs{{[0-9]+}}, [retval0+0];
+; CHECK-DAG:        ld.param.b8 %rs{{[0-9]+}}, [retval0+1];
+; CHECK-DAG:        ld.param.b8 %rs{{[0-9]+}}, [retval0+2];
+; CHECK-DAG:        ld.param.b8 %rs{{[0-9]+}}, [retval0+3];
+; CHECK-DAG:        ld.param.b8 %rs{{[0-9]+}}, [retval0+4];
+; CHECK-DAG:        ld.param.b8 %rs{{[0-9]+}}, [retval0+5];
+; CHECK-DAG:        ld.param.b8 %rs{{[0-9]+}}, [retval0+6];
+; CHECK-DAG:        ld.param.b8 %rs{{[0-9]+}}, [retval0+7];
+; CHECK-DAG:        ld.param.b8 %rs{{[0-9]+}}, [retval0+8];
+; CHECK-DAG:        ld.param.b8 %rs{{[0-9]+}}, [retval0+9];
+; CHECK-DAG:        ld.param.b8 %rs{{[0-9]+}}, [retval0+10];
+; CHECK-DAG:        ld.param.b8 %rs{{[0-9]+}}, [retval0+11];
+; CHECK-DAG:        ld.param.b8 %rs{{[0-9]+}}, [retval0+12];
+; CHECK-DAG:        ld.param.b8 %rs{{[0-9]+}}, [retval0+13];
+; CHECK-DAG:        ld.param.b8 %rs{{[0-9]+}}, [retval0+14];
+; CHECK-DAG:        ld.param.b8 %rs{{[0-9]+}}, [retval0+15];
+; CHECK-DAG:        ld.param.b8 %rs{{[0-9]+}}, [retval0+16];
+; CHECK-DAG:        ld.param.b8 %rs{{[0-9]+}}, [retval0+17];
+; CHECK-DAG:        ld.param.b8 %rs{{[0-9]+}}, [retval0+18];
+; CHECK-DAG:        ld.param.b8 %rs{{[0-9]+}}, [retval0+19];
+; CHECK-DAG:        ld.param.b8 %rs{{[0-9]+}}, [retval0+20];
+; CHECK-DAG:        ld.param.b8 %rs{{[0-9]+}}, [retval0+21];
+; CHECK-DAG:        ld.param.b8 %rs{{[0-9]+}}, [retval0+22];
+; CHECK-DAG:        ld.param.b8 %rs{{[0-9]+}}, [retval0+23];
+; CHECK-DAG:        ld.param.b8 %rs{{[0-9]+}}, [retval0+24];
+; CHECK:            } // callseq
+; CHECK-DAG:        st.param.b8     [func_retval0+0],
+; CHECK-DAG:        st.param.b8     [func_retval0+1],
+; CHECK-DAG:        st.param.b8     [func_retval0+2],
+; CHECK-DAG:        st.param.b8     [func_retval0+3],
+; CHECK-DAG:        st.param.b8     [func_retval0+4],
+; CHECK-DAG:        st.param.b8     [func_retval0+5],
+; CHECK-DAG:        st.param.b8     [func_retval0+6],
+; CHECK-DAG:        st.param.b8     [func_retval0+7],
 ; CHECK-DAG:        st.param.b8     [func_retval0+8],
-; CHECK-DAG:        st.param.b32    [func_retval0+9],
-; CHECK-DAG:        st.param.b32    [func_retval0+13],
-; CHECK-DAG:        st.param.b64    [func_retval0+17],
+; CHECK-DAG:        st.param.b8     [func_retval0+9],
+; CHECK-DAG:        st.param.b8     [func_retval0+10],
+; CHECK-DAG:        st.param.b8     [func_retval0+11],
+; CHECK-DAG:        st.param.b8     [func_retval0+12],
+; CHECK-DAG:        st.param.b8     [func_retval0+13],
+; CHECK-DAG:        st.param.b8     [func_retval0+14],
+; CHECK-DAG:        st.param.b8     [func_retval0+15],
+; CHECK-DAG:        st.param.b8     [func_retval0+16],
+; CHECK-DAG:        st.param.b8     [func_retval0+17],
+; CHECK-DAG:        st.param.b8     [func_retval0+18],
+; CHECK-DAG:        st.param.b8     [func_retval0+19],
+; CHECK-DAG:        st.param.b8     [func_retval0+20],
+; CHECK-DAG:        st.param.b8     [func_retval0+21],
+; CHECK-DAG:        st.param.b8     [func_retval0+22],
+; CHECK-DAG:        st.param.b8     [func_retval0+23],
+; CHECK-DAG:        st.param.b8     [func_retval0+24],
 
 define %s_i8i32x4p @test_s_i1i32x4p(%s_i8i32x4p %a) {
        %r = tail call %s_i8i32x4p @test_s_i1i32x4p(%s_i8i32x4p %a);
diff --git a/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll b/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll
new file mode 100644
index 0000000..40a3e9e
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll
@@ -0,0 +1,385 @@
+; Verifies correctness of load/store of parameters and return values.
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 -O0 -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_35 -O0 -verify-machineinstrs | %ptxas-verify %}
+
+%s_i8i16p = type { <{ i16, i8, i16 }>, i64 }
+%s_i8i32p = type { <{ i32, i8, i32 }>, i64 }
+%s_i8i64p = type { <{ i64, i8, i64 }>, i64 }
+%s_i8f16p = type { <{ half, i8, half }>, i64 }
+%s_i8f16x2p = type { <{ <2 x half>, i8, <2 x half> }>, i64 }
+%s_i8f32p = type { <{ float, i8, float }>, i64 }
+%s_i8f64p = type { <{ double, i8, double }>, i64 }
+
+; -- All loads/stores from parameters aligned by one must be done one
+;    byte at a time.
+; -- Notes:
+;   -- There are two fields of interest in the packed part of the struct, one
+;      with a proper offset and one without. The former should be loaded or
+;      stored as a whole, and the latter by bytes.
+;   -- Only loading and storing the said fields are checked in the following
+;      series of tests so that they are more concise.
+
+; CHECK:       .visible .func (.param .align 8 .b8 func_retval0[16])
+; CHECK-LABEL: test_s_i8i16p(
+; CHECK:        .param .align 8 .b8 test_s_i8i16p_param_0[16]
+; CHECK-DAG:    ld.param.u16 [[P0:%rs[0-9]+]],   [test_s_i8i16p_param_0];
+; CHECK-DAG:    ld.param.u8 [[P2_0:%rs[0-9]+]],   [test_s_i8i16p_param_0+3];
+; CHECK-DAG:    ld.param.u8 [[P2_1:%rs[0-9]+]],   [test_s_i8i16p_param_0+4];
+; CHECK-DAG:    shl.b16     [[P2_1_shl:%rs[0-9]+]], [[P2_1]], 8;
+; CHECK-DAG:    or.b16      [[P2_1_or:%rs[0-9]+]], [[P2_1_shl]], [[P2_0]];
+; CHECK:        { // callseq
+; CHECK:        .param .align 8 .b8 param0[16];
+; CHECK-DAG:    st.param.b16 [param0+0], [[P0]];
+; CHECK-DAG:    st.param.b8  [param0+3], [[P2_1_or]];
+; CHECK-DAG:    st.param.b8  [param0+4], [[P2_1]];
+; CHECK:        .param .align 8 .b8 retval0[16];
+; CHECK-NEXT:   call.uni (retval0),
+; CHECK-NEXT:   test_s_i8i16p,
+; CHECK-NEXT:   (
+; CHECK-NEXT:   param0
+; CHECK-NEXT:   );
+; CHECK-DAG:    ld.param.b16 [[R0:%rs[0-9]+]],   [retval0+0];
+; CHECK-DAG:    ld.param.b8  [[R2_0:%rs[0-9]+]], [retval0+3];
+; CHECK-DAG:    ld.param.b8  [[R2_1:%rs[0-9]+]], [retval0+4];
+; CHECK:        } // callseq
+; CHECK-DAG:    st.param.b16 [func_retval0+0], [[R0]];
+; CHECK-DAG:    shl.b16      [[R2_1_shl:%rs[0-9]+]], [[R2_1]], 8;
+; CHECK-DAG:    and.b16      [[R2_0_and:%rs[0-9]+]], [[R2_0]], 255;
+; CHECK-DAG:    or.b16       [[R2:%rs[0-9]+]], [[R2_0_and]], [[R2_1_shl]];
+; CHECK-DAG:    st.param.b8  [func_retval0+3], [[R2]];
+; CHECK-DAG:    and.b16      [[R2_1_and:%rs[0-9]+]], [[R2_1]], 255;
+; CHECK-DAG:    st.param.b8  [func_retval0+4], [[R2_1_and]];
+; CHECK:        ret;
+
+define %s_i8i16p @test_s_i8i16p(%s_i8i16p %a) {
+       %r = tail call %s_i8i16p @test_s_i8i16p(%s_i8i16p %a)
+       ret %s_i8i16p %r
+}
+
+; CHECK:       .visible .func (.param .align 8 .b8 func_retval0[24])
+; CHECK-LABEL: test_s_i8i32p(
+; CHECK:        .param .align 8 .b8 test_s_i8i32p_param_0[24]
+; CHECK-DAG:    ld.param.u32 [[P0:%r[0-9]+]],   [test_s_i8i32p_param_0];
+; CHECK-DAG:    ld.param.u8 [[P2_0:%r[0-9]+]],   [test_s_i8i32p_param_0+5];
+; CHECK-DAG:    ld.param.u8 [[P2_1:%r[0-9]+]],   [test_s_i8i32p_param_0+6];
+; CHECK-DAG:    ld.param.u8 [[P2_2:%r[0-9]+]],   [test_s_i8i32p_param_0+7];
+; CHECK-DAG:    ld.param.u8 [[P2_3:%r[0-9]+]],   [test_s_i8i32p_param_0+8];
+; CHECK-DAG:    shl.b32     [[P2_1_shl:%r[0-9]+]], [[P2_1]], 8;
+; CHECK-DAG:    shl.b32     [[P2_2_shl:%r[0-9]+]], [[P2_2]], 16;
+; CHECK-DAG:    shl.b32     [[P2_3_shl:%r[0-9]+]], [[P2_3]], 24;
+; CHECK-DAG:    or.b32      [[P2_or:%r[0-9]+]], [[P2_1_shl]], [[P2_0]];
+; CHECK-DAG:    or.b32      [[P2_or_1:%r[0-9]+]], [[P2_3_shl]], [[P2_2_shl]];
+; CHECK-DAG:    or.b32      [[P2:%r[0-9]+]], [[P2_or_1]], [[P2_or]];
+; CHECK-DAG:    shr.u32     [[P2_1_shr:%r[0-9]+]], [[P2]], 8;
+; CHECK-DAG:    shr.u32     [[P2_2_shr:%r[0-9]+]], [[P2_or_1]], 16;
+; CHECK:        { // callseq
+; CHECK-DAG:    .param .align 8 .b8 param0[24];
+; CHECK-DAG:    st.param.b32 [param0+0], [[P0]];
+; CHECK-DAG:    st.param.b8  [param0+5], [[P2]];
+; CHECK-DAG:    st.param.b8  [param0+6], [[P2_1_shr]];
+; CHECK-DAG:    st.param.b8  [param0+7], [[P2_2_shr]];
+; CHECK-DAG:    st.param.b8  [param0+8], [[P2_3]];
+; CHECK:        .param .align 8 .b8 retval0[24];
+; CHECK-NEXT:   call.uni (retval0),
+; CHECK-NEXT:   test_s_i8i32p,
+; CHECK-NEXT:   (
+; CHECK-NEXT:   param0
+; CHECK-NEXT:   );
+; CHECK-DAG:    ld.param.b32 [[R0:%r[0-9]+]],   [retval0+0];
+; CHECK-DAG:    ld.param.b8  [[R2_0:%rs[0-9]+]], [retval0+5];
+; CHECK-DAG:    ld.param.b8  [[R2_1:%rs[0-9]+]], [retval0+6];
+; CHECK-DAG:    ld.param.b8  [[R2_2:%rs[0-9]+]], [retval0+7];
+; CHECK-DAG:    ld.param.b8  [[R2_3:%rs[0-9]+]], [retval0+8];
+; CHECK:        } // callseq
+; CHECK-DAG:    st.param.b32 [func_retval0+0], [[R0]];
+; CHECK-DAG:    st.param.b8  [func_retval0+5],
+; CHECK-DAG:    st.param.b8  [func_retval0+6],
+; CHECK-DAG:    st.param.b8  [func_retval0+7],
+; CHECK-DAG:    st.param.b8  [func_retval0+8],
+; CHECK:        ret;
+
+define %s_i8i32p @test_s_i8i32p(%s_i8i32p %a) {
+       %r = tail call %s_i8i32p @test_s_i8i32p(%s_i8i32p %a)
+       ret %s_i8i32p %r
+}
+
+; CHECK:       .visible .func (.param .align 8 .b8 func_retval0[32])
+; CHECK-LABEL: test_s_i8i64p(
+; CHECK:        .param .align 8 .b8 test_s_i8i64p_param_0[32]
+; CHECK-DAG:    ld.param.u64 [[P0:%rd[0-9]+]],   [test_s_i8i64p_param_0];
+; CHECK-DAG:    ld.param.u8 [[P2_0:%rd[0-9]+]],   [test_s_i8i64p_param_0+9];
+; CHECK-DAG:    ld.param.u8 [[P2_1:%rd[0-9]+]],   [test_s_i8i64p_param_0+10];
+; CHECK-DAG:    ld.param.u8 [[P2_2:%rd[0-9]+]],   [test_s_i8i64p_param_0+11];
+; CHECK-DAG:    ld.param.u8 [[P2_3:%rd[0-9]+]],   [test_s_i8i64p_param_0+12];
+; CHECK-DAG:    ld.param.u8 [[P2_4:%rd[0-9]+]],   [test_s_i8i64p_param_0+13];
+; CHECK-DAG:    ld.param.u8 [[P2_5:%rd[0-9]+]],   [test_s_i8i64p_param_0+14];
+; CHECK-DAG:    ld.param.u8 [[P2_6:%rd[0-9]+]],   [test_s_i8i64p_param_0+15];
+; CHECK-DAG:    ld.param.u8 [[P2_7:%rd[0-9]+]],   [test_s_i8i64p_param_0+16];
+; CHECK-DAG:    shl.b64      [[P2_1_shl:%rd[0-9]+]], [[P2_1]], 8;
+; CHECK-DAG:    shl.b64      [[P2_2_shl:%rd[0-9]+]], [[P2_2]], 16;
+; CHECK-DAG:    shl.b64      [[P2_3_shl:%rd[0-9]+]], [[P2_3]], 24;
+; CHECK-DAG:    or.b64       [[P2_or_0:%rd[0-9]+]], [[P2_1_shl]], [[P2_0]];
+; CHECK-DAG:    or.b64       [[P2_or_1:%rd[0-9]+]], [[P2_3_shl]], [[P2_2_shl]];
+; CHECK-DAG:    or.b64       [[P2_or_2:%rd[0-9]+]], [[P2_or_1]], [[P2_or_0]];
+; CHECK-DAG:    shl.b64 	 [[P2_5_shl:%rd[0-9]+]], [[P2_5]], 8;
+; CHECK-DAG:    shl.b64      [[P2_6_shl:%rd[0-9]+]], [[P2_6]], 16;
+; CHECK-DAG:    shl.b64      [[P2_7_shl:%rd[0-9]+]], [[P2_7]], 24;
+; CHECK-DAG:    or.b64       [[P2_or_3:%rd[0-9]+]], [[P2_5_shl]], [[P2_4]];
+; CHECK-DAG:    or.b64       [[P2_or_4:%rd[0-9]+]], [[P2_7_shl]], [[P2_6_shl]];
+; CHECK-DAG:    or.b64       [[P2_or_5:%rd[0-9]+]], [[P2_or_4]], [[P2_or_3]];
+; CHECK-DAG:    shl.b64      [[P2_or_shl:%rd[0-9]+]], [[P2_or_5]], 32;
+; CHECK-DAG:    or.b64       [[P2:%rd[0-9]+]], [[P2_or_shl]], [[P2_or_2]];
+; CHECK-DAG:    shr.u64      [[P2_shr_1:%rd[0-9]+]], [[P2]], 8;
+; CHECK-DAG:    shr.u64      [[P2_shr_2:%rd[0-9]+]], [[P2]], 16;
+; CHECK-DAG:    shr.u64      [[P2_shr_3:%rd[0-9]+]], [[P2]], 24;
+; CHECK-DAG:    bfe.u64      [[P2_bfe_4:%rd[0-9]+]], [[P2_or_5]], 8, 24;
+; CHECK-DAG:    bfe.u64      [[P2_bfe_5:%rd[0-9]+]], [[P2_or_5]], 16, 16;
+; CHECK-DAG:    bfe.u64      [[P2_bfe_6:%rd[0-9]+]], [[P2_or_5]], 24, 8;
+; CHECK:        { // callseq
+; CHECK:        .param .align 8 .b8 param0[32];
+; CHECK-DAG:    st.param.b64 [param0+0],  [[P0]];
+; CHECK-DAG:    st.param.b8  [param0+9],  [[P2]];
+; CHECK-DAG:    st.param.b8  [param0+10], [[P2_shr_1]];
+; CHECK-DAG:    st.param.b8  [param0+11], [[P2_shr_2]];
+; CHECK-DAG:    st.param.b8  [param0+12], [[P2_shr_3]];
+; CHECK-DAG:    st.param.b8  [param0+13], [[P2_or_5]];
+; CHECK-DAG:    st.param.b8  [param0+14], [[P2_bfe_4]];
+; CHECK-DAG:    st.param.b8  [param0+15], [[P2_bfe_5]];
+; CHECK-DAG:    st.param.b8  [param0+16], [[P2_bfe_6]];
+; CHECK:        .param .align 8 .b8 retval0[32];
+; CHECK-NEXT:   call.uni (retval0),
+; CHECK-NEXT:   test_s_i8i64p,
+; CHECK-NEXT:   (
+; CHECK-NEXT:   param0
+; CHECK-NEXT:   );
+; CHECK-DAG:    ld.param.b64 [[R0:%rd[0-9]+]],   [retval0+0];
+; CHECK-DAG:    ld.param.b8  [[R2_0:%rs[0-9]+]], [retval0+9];
+; CHECK-DAG:    ld.param.b8  [[R2_1:%rs[0-9]+]], [retval0+10];
+; CHECK-DAG:    ld.param.b8  [[R2_2:%rs[0-9]+]], [retval0+11];
+; CHECK-DAG:    ld.param.b8  [[R2_3:%rs[0-9]+]], [retval0+12];
+; CHECK-DAG:    ld.param.b8  [[R2_4:%rs[0-9]+]], [retval0+13];
+; CHECK-DAG:    ld.param.b8  [[R2_5:%rs[0-9]+]], [retval0+14];
+; CHECK-DAG:    ld.param.b8  [[R2_6:%rs[0-9]+]], [retval0+15];
+; CHECK-DAG:    ld.param.b8  [[R2_7:%rs[0-9]+]], [retval0+16];
+; CHECK:        } // callseq
+; CHECK-DAG:    st.param.b64 [func_retval0+0], [[R0]];
+; CHECK-DAG:    st.param.b8  [func_retval0+9],
+; CHECK-DAG:    st.param.b8  [func_retval0+10],
+; CHECK-DAG:    st.param.b8  [func_retval0+11],
+; CHECK-DAG:    st.param.b8  [func_retval0+12],
+; CHECK-DAG:    st.param.b8  [func_retval0+13],
+; CHECK-DAG:    st.param.b8  [func_retval0+14],
+; CHECK-DAG:    st.param.b8  [func_retval0+15],
+; CHECK-DAG:    st.param.b8  [func_retval0+16],
+; CHECK:        ret;
+
+define %s_i8i64p @test_s_i8i64p(%s_i8i64p %a) {
+       %r = tail call %s_i8i64p @test_s_i8i64p(%s_i8i64p %a)
+       ret %s_i8i64p %r
+}
+
+; CHECK:       .visible .func (.param .align 8 .b8 func_retval0[16])
+; CHECK-LABEL: test_s_i8f16p(
+; CHECK:        .param .align 8 .b8 test_s_i8f16p_param_0[16]
+; CHECK-DAG:    ld.param.b16 [[P0:%rs[0-9]+]],     [test_s_i8f16p_param_0];
+; CHECK-DAG:    ld.param.u8  [[P2_0:%rs[0-9]+]],   [test_s_i8f16p_param_0+3];
+; CHECK-DAG:    ld.param.u8  [[P2_1:%rs[0-9]+]],   [test_s_i8f16p_param_0+4];
+; CHECK-DAG:    shl.b16      [[P2_1_shl:%rs[0-9]+]], [[P2_1]], 8;
+; CHECK-DAG:    or.b16       [[P2_1_or:%rs[0-9]+]], [[P2_1_shl]], [[P2_0]];
+; CHECK:        { // callseq
+; CHECK:        .param .align 8 .b8 param0[16];
+; CHECK-DAG:    st.param.b16 [param0+0], [[P0]];
+; CHECK-DAG:    st.param.b8  [param0+3], [[P2_1_or]];
+; CHECK-DAG:    st.param.b8  [param0+4], [[P2_1]];
+; CHECK:        .param .align 8 .b8 retval0[16];
+; CHECK-NEXT:   call.uni (retval0),
+; CHECK-NEXT:   test_s_i8f16p,
+; CHECK-NEXT:   (
+; CHECK-NEXT:   param0
+; CHECK-NEXT:   );
+; CHECK-DAG:    ld.param.b16 [[R0:%rs[0-9]+]],     [retval0+0];
+; CHECK-DAG:    ld.param.b8  [[R2I_0:%rs[0-9]+]], [retval0+3];
+; CHECK-DAG:    ld.param.b8  [[R2I_1:%rs[0-9]+]], [retval0+4];
+; CHECK:        } // callseq
+; CHECK-DAG:    st.param.b16 [func_retval0+0], [[R0]];
+; CHECK-DAG:    shl.b16      [[R2I_1_shl:%rs[0-9]+]], [[R2I_1]], 8;
+; CHECK-DAG:    and.b16      [[R2I_0_and:%rs[0-9]+]], [[R2I_0]], 255;
+; CHECK-DAG:    or.b16       [[R2I:%rs[0-9]+]], [[R2I_0_and]], [[R2I_1_shl]];
+; CHECK-DAG:    st.param.b8  [func_retval0+3],  [[R2I]];
+; CHECK-DAG:    and.b16      [[R2I_1_and:%rs[0-9]+]], [[R2I_1]], 255;
+; CHECK-DAG:    st.param.b8  [func_retval0+4],  [[R2I_1_and]];
+; CHECK:        ret;
+
+define %s_i8f16p @test_s_i8f16p(%s_i8f16p %a) {
+       %r = tail call %s_i8f16p @test_s_i8f16p(%s_i8f16p %a)
+       ret %s_i8f16p %r
+}
+
+; CHECK:       .visible .func (.param .align 8 .b8 func_retval0[24])
+; CHECK-LABEL: test_s_i8f16x2p(
+; CHECK:        .param .align 8 .b8 test_s_i8f16x2p_param_0[24]
+; CHECK-DAG:    ld.param.b32 [[P0:%r[0-9]+]],  [test_s_i8f16x2p_param_0];
+; CHECK-DAG:    ld.param.u8  [[P2_0:%r[0-9]+]],   [test_s_i8f16x2p_param_0+5];
+; CHECK-DAG:    ld.param.u8  [[P2_1:%r[0-9]+]],   [test_s_i8f16x2p_param_0+6];
+; CHECK-DAG:    ld.param.u8  [[P2_2:%r[0-9]+]],   [test_s_i8f16x2p_param_0+7];
+; CHECK-DAG:    ld.param.u8  [[P2_3:%r[0-9]+]],   [test_s_i8f16x2p_param_0+8];
+; CHECK-DAG:    shl.b32      [[P2_1_shl:%r[0-9]+]], [[P2_1]], 8;
+; CHECK-DAG:    shl.b32      [[P2_2_shl:%r[0-9]+]], [[P2_2]], 16;
+; CHECK-DAG:    shl.b32      [[P2_3_shl:%r[0-9]+]], [[P2_3]], 24;
+; CHECK-DAG:    or.b32       [[P2_or:%r[0-9]+]], [[P2_1_shl]], [[P2_0]];
+; CHECK-DAG:    or.b32       [[P2_or_1:%r[0-9]+]], [[P2_3_shl]], [[P2_2_shl]];
+; CHECK-DAG:    or.b32       [[P2:%r[0-9]+]], [[P2_or_1]], [[P2_or]];
+; CHECK-DAG:    shr.u32      [[P2_1_shr:%r[0-9]+]], [[P2]], 8;
+; CHECK-DAG:    shr.u32      [[P2_2_shr:%r[0-9]+]], [[P2_or_1]], 16;
+; CHECK:        { // callseq
+; CHECK-DAG:    .param .align 8 .b8 param0[24];
+; CHECK-DAG:    st.param.b32 [param0+0], [[P0]];
+; CHECK-DAG:    st.param.b8  [param0+5], [[P2]];
+; CHECK-DAG:    st.param.b8  [param0+6], [[P2_1_shr]];
+; CHECK-DAG:    st.param.b8  [param0+7], [[P2_2_shr]];
+; CHECK-DAG:    st.param.b8  [param0+8], [[P2_3]];
+; CHECK:        .param .align 8 .b8 retval0[24];
+; CHECK-NEXT:   call.uni (retval0),
+; CHECK-NEXT:   test_s_i8f16x2p,
+; CHECK-NEXT:   (
+; CHECK-NEXT:   param0
+; CHECK-NEXT:   );
+; CHECK-DAG:    ld.param.b32 [[R0:%r[0-9]+]],   [retval0+0];
+; CHECK-DAG:    ld.param.b8  [[R2_0:%rs[0-9]+]], [retval0+5];
+; CHECK-DAG:    ld.param.b8  [[R2_1:%rs[0-9]+]], [retval0+6];
+; CHECK-DAG:    ld.param.b8  [[R2_2:%rs[0-9]+]], [retval0+7];
+; CHECK-DAG:    ld.param.b8  [[R2_3:%rs[0-9]+]], [retval0+8];
+; CHECK:        } // callseq
+; CHECK-DAG:    st.param.b32 [func_retval0+0], [[R0]];
+; CHECK-DAG:    st.param.b8  [func_retval0+5],
+; CHECK-DAG:    st.param.b8  [func_retval0+6],
+; CHECK-DAG:    st.param.b8  [func_retval0+7],
+; CHECK-DAG:    st.param.b8  [func_retval0+8],
+; CHECK:        ret;
+
+define %s_i8f16x2p @test_s_i8f16x2p(%s_i8f16x2p %a) {
+       %r = tail call %s_i8f16x2p @test_s_i8f16x2p(%s_i8f16x2p %a)
+       ret %s_i8f16x2p %r
+}
+
+; CHECK:       .visible .func (.param .align 8 .b8 func_retval0[24])
+; CHECK-LABEL: test_s_i8f32p(
+; CHECK:        .param .align 8 .b8 test_s_i8f32p_param_0[24]
+; CHECK-DAG:    ld.param.f32 [[P0:%f[0-9]+]],    [test_s_i8f32p_param_0];
+; CHECK-DAG:    ld.param.u8  [[P2_0:%r[0-9]+]],   [test_s_i8f32p_param_0+5];
+; CHECK-DAG:    ld.param.u8  [[P2_1:%r[0-9]+]],   [test_s_i8f32p_param_0+6];
+; CHECK-DAG:    ld.param.u8  [[P2_2:%r[0-9]+]],   [test_s_i8f32p_param_0+7];
+; CHECK-DAG:    ld.param.u8  [[P2_3:%r[0-9]+]],   [test_s_i8f32p_param_0+8];
+; CHECK-DAG:    shl.b32      [[P2_1_shl:%r[0-9]+]], [[P2_1]], 8;
+; CHECK-DAG:    shl.b32      [[P2_2_shl:%r[0-9]+]], [[P2_2]], 16;
+; CHECK-DAG:    shl.b32      [[P2_3_shl:%r[0-9]+]], [[P2_3]], 24;
+; CHECK-DAG:    or.b32       [[P2_or:%r[0-9]+]], [[P2_1_shl]], [[P2_0]];
+; CHECK-DAG:    or.b32       [[P2_or_1:%r[0-9]+]], [[P2_3_shl]], [[P2_2_shl]];
+; CHECK-DAG:    or.b32       [[P2:%r[0-9]+]], [[P2_or_1]], [[P2_or]];
+; CHECK-DAG:    shr.u32      [[P2_1_shr:%r[0-9]+]], [[P2]], 8;
+; CHECK-DAG:    shr.u32      [[P2_2_shr:%r[0-9]+]], [[P2_or_1]], 16;
+; CHECK:        { // callseq
+; CHECK-DAG:    .param .align 8 .b8 param0[24];
+; CHECK-DAG:    st.param.f32 [param0+0], [[P0]];
+; CHECK-DAG:    st.param.b8  [param0+5], [[P2]];
+; CHECK-DAG:    st.param.b8  [param0+6], [[P2_1_shr]];
+; CHECK-DAG:    st.param.b8  [param0+7], [[P2_2_shr]];
+; CHECK-DAG:    st.param.b8  [param0+8], [[P2_3]];
+; CHECK:        .param .align 8 .b8 retval0[24];
+; CHECK-NEXT:   call.uni (retval0),
+; CHECK-NEXT:   test_s_i8f32p,
+; CHECK-NEXT:   (
+; CHECK-NEXT:   param0
+; CHECK-NEXT:   );
+; CHECK-DAG:    ld.param.f32 [[R0:%f[0-9]+]],    [retval0+0];
+; CHECK-DAG:    ld.param.b8  [[R2_0:%rs[0-9]+]], [retval0+5];
+; CHECK-DAG:    ld.param.b8  [[R2_1:%rs[0-9]+]], [retval0+6];
+; CHECK-DAG:    ld.param.b8  [[R2_2:%rs[0-9]+]], [retval0+7];
+; CHECK-DAG:    ld.param.b8  [[R2_3:%rs[0-9]+]], [retval0+8];
+; CHECK:        } // callseq
+; CHECK-DAG:    st.param.f32 [func_retval0+0], [[R0]];
+; CHECK-DAG:    st.param.b8  [func_retval0+5],
+; CHECK-DAG:    st.param.b8  [func_retval0+6],
+; CHECK-DAG:    st.param.b8  [func_retval0+7],
+; CHECK-DAG:    st.param.b8  [func_retval0+8],
+; CHECK:        ret;
+
+define %s_i8f32p @test_s_i8f32p(%s_i8f32p %a) {
+       %r = tail call %s_i8f32p @test_s_i8f32p(%s_i8f32p %a)
+       ret %s_i8f32p %r
+}
+
+; CHECK:       .visible .func (.param .align 8 .b8 func_retval0[32])
+; CHECK-LABEL: test_s_i8f64p(
+; CHECK:        .param .align 8 .b8 test_s_i8f64p_param_0[32]
+; CHECK-DAG:    ld.param.f64 [[P0:%fd[0-9]+]],    [test_s_i8f64p_param_0];
+; CHECK-DAG:    ld.param.u8  [[P2_0:%rd[0-9]+]],   [test_s_i8f64p_param_0+9];
+; CHECK-DAG:    ld.param.u8  [[P2_1:%rd[0-9]+]],   [test_s_i8f64p_param_0+10];
+; CHECK-DAG:    ld.param.u8  [[P2_2:%rd[0-9]+]],   [test_s_i8f64p_param_0+11];
+; CHECK-DAG:    ld.param.u8  [[P2_3:%rd[0-9]+]],   [test_s_i8f64p_param_0+12];
+; CHECK-DAG:    ld.param.u8  [[P2_4:%rd[0-9]+]],   [test_s_i8f64p_param_0+13];
+; CHECK-DAG:    ld.param.u8  [[P2_5:%rd[0-9]+]],   [test_s_i8f64p_param_0+14];
+; CHECK-DAG:    ld.param.u8  [[P2_6:%rd[0-9]+]],   [test_s_i8f64p_param_0+15];
+; CHECK-DAG:    ld.param.u8  [[P2_7:%rd[0-9]+]],   [test_s_i8f64p_param_0+16];
+; CHECK-DAG:    shl.b64      [[P2_1_shl:%rd[0-9]+]], [[P2_1]], 8;
+; CHECK-DAG:    shl.b64      [[P2_2_shl:%rd[0-9]+]], [[P2_2]], 16;
+; CHECK-DAG:    shl.b64      [[P2_3_shl:%rd[0-9]+]], [[P2_3]], 24;
+; CHECK-DAG:    or.b64       [[P2_or_0:%rd[0-9]+]], [[P2_1_shl]], [[P2_0]];
+; CHECK-DAG:    or.b64       [[P2_or_1:%rd[0-9]+]], [[P2_3_shl]], [[P2_2_shl]];
+; CHECK-DAG:    or.b64       [[P2_or_2:%rd[0-9]+]], [[P2_or_1]], [[P2_or_0]];
+; CHECK-DAG:    shl.b64 	 [[P2_5_shl:%rd[0-9]+]], [[P2_5]], 8;
+; CHECK-DAG:    shl.b64      [[P2_6_shl:%rd[0-9]+]], [[P2_6]], 16;
+; CHECK-DAG:    shl.b64      [[P2_7_shl:%rd[0-9]+]], [[P2_7]], 24;
+; CHECK-DAG:    or.b64       [[P2_or_3:%rd[0-9]+]], [[P2_5_shl]], [[P2_4]];
+; CHECK-DAG:    or.b64       [[P2_or_4:%rd[0-9]+]], [[P2_7_shl]], [[P2_6_shl]];
+; CHECK-DAG:    or.b64       [[P2_or_5:%rd[0-9]+]], [[P2_or_4]], [[P2_or_3]];
+; CHECK-DAG:    shl.b64      [[P2_or_shl:%rd[0-9]+]], [[P2_or_5]], 32;
+; CHECK-DAG:    or.b64       [[P2:%rd[0-9]+]], [[P2_or_shl]], [[P2_or_2]];
+; CHECK-DAG:    shr.u64      [[P2_shr_1:%rd[0-9]+]], [[P2]], 8;
+; CHECK-DAG:    shr.u64      [[P2_shr_2:%rd[0-9]+]], [[P2]], 16;
+; CHECK-DAG:    shr.u64      [[P2_shr_3:%rd[0-9]+]], [[P2]], 24;
+; CHECK-DAG:    bfe.u64      [[P2_bfe_4:%rd[0-9]+]], [[P2_or_5]], 8, 24;
+; CHECK-DAG:    bfe.u64      [[P2_bfe_5:%rd[0-9]+]], [[P2_or_5]], 16, 16;
+; CHECK-DAG:    bfe.u64      [[P2_bfe_6:%rd[0-9]+]], [[P2_or_5]], 24, 8;
+; CHECK:        { // callseq
+; CHECK:        .param .align 8 .b8 param0[32];
+; CHECK-DAG:    st.param.f64 [param0+0],  [[P0]];
+; CHECK-DAG:    st.param.b8  [param0+9],  [[P2]];
+; CHECK-DAG:    st.param.b8  [param0+10], [[P2_shr_1]];
+; CHECK-DAG:    st.param.b8  [param0+11], [[P2_shr_2]];
+; CHECK-DAG:    st.param.b8  [param0+12], [[P2_shr_3]];
+; CHECK-DAG:    st.param.b8  [param0+13], [[P2_or_5]];
+; CHECK-DAG:    st.param.b8  [param0+14], [[P2_bfe_4]];
+; CHECK-DAG:    st.param.b8  [param0+15], [[P2_bfe_5]];
+; CHECK-DAG:    st.param.b8  [param0+16], [[P2_bfe_6]];
+; CHECK:        .param .align 8 .b8 retval0[32];
+; CHECK-NEXT:   call.uni (retval0),
+; CHECK-NEXT:   test_s_i8f64p,
+; CHECK-NEXT:   (
+; CHECK-NEXT:   param0
+; CHECK-NEXT:   );
+; CHECK-DAG:    ld.param.f64 [[R0:%fd[0-9]+]],   [retval0+0];
+; CHECK-DAG:    ld.param.b8  [[R2_0:%rs[0-9]+]], [retval0+9];
+; CHECK-DAG:    ld.param.b8  [[R2_1:%rs[0-9]+]], [retval0+10];
+; CHECK-DAG:    ld.param.b8  [[R2_2:%rs[0-9]+]], [retval0+11];
+; CHECK-DAG:    ld.param.b8  [[R2_3:%rs[0-9]+]], [retval0+12];
+; CHECK-DAG:    ld.param.b8  [[R2_4:%rs[0-9]+]], [retval0+13];
+; CHECK-DAG:    ld.param.b8  [[R2_5:%rs[0-9]+]], [retval0+14];
+; CHECK-DAG:    ld.param.b8  [[R2_6:%rs[0-9]+]], [retval0+15];
+; CHECK-DAG:    ld.param.b8  [[R2_7:%rs[0-9]+]], [retval0+16];
+; CHECK:        } // callseq
+; CHECK-DAG:    st.param.f64 [func_retval0+0], [[R0]];
+; CHECK-DAG:    st.param.b8  [func_retval0+9],
+; CHECK-DAG:    st.param.b8  [func_retval0+10],
+; CHECK-DAG:    st.param.b8  [func_retval0+11],
+; CHECK-DAG:    st.param.b8  [func_retval0+12],
+; CHECK-DAG:    st.param.b8  [func_retval0+13],
+; CHECK-DAG:    st.param.b8  [func_retval0+14],
+; CHECK-DAG:    st.param.b8  [func_retval0+15],
+; CHECK-DAG:    st.param.b8  [func_retval0+16],
+; CHECK:        ret;
+
+define %s_i8f64p @test_s_i8f64p(%s_i8f64p %a) {
+       %r = tail call %s_i8f64p @test_s_i8f64p(%s_i8f64p %a)
+       ret %s_i8f64p %r
+}
diff --git a/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll b/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll
index 91e7399..3c2e846 100644
--- a/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll
+++ b/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll
@@ -1,12 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \
-; RUN:   -riscv-enable-sink-fold | FileCheck -check-prefix=RV32I %s
+; RUN:   | FileCheck -check-prefix=RV32I %s
 ; RUN: llc -mtriple=riscv32 -verify-machineinstrs -code-model=medium < %s \
-; RUN:   -riscv-enable-sink-fold | FileCheck -check-prefix=RV32I-MEDIUM %s
+; RUN:   | FileCheck -check-prefix=RV32I-MEDIUM %s
 ; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \
-; RUN:   -riscv-enable-sink-fold | FileCheck -check-prefix=RV64I %s
+; RUN:   | FileCheck -check-prefix=RV64I %s
 ; RUN: llc -mtriple=riscv64 -verify-machineinstrs -code-model=medium < %s \
-; RUN:   -riscv-enable-sink-fold | FileCheck -check-prefix=RV64I-MEDIUM %s
+; RUN:   | FileCheck -check-prefix=RV64I-MEDIUM %s
 
 ; We can often fold an ADDI into the offset of load/store instructions:
 ;   (load (addi base, off1), off2) -> (load base, off1+off2)
diff --git a/llvm/test/CodeGen/RISCV/make-compressible.mir b/llvm/test/CodeGen/RISCV/make-compressible.mir
index 2105a13..03da38a 100644
--- a/llvm/test/CodeGen/RISCV/make-compressible.mir
+++ b/llvm/test/CodeGen/RISCV/make-compressible.mir
@@ -1,8 +1,14 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc -o - %s -mtriple=riscv32 -mattr=+c,+f,+d -simplify-mir \
-# RUN:   -run-pass=riscv-make-compressible | FileCheck --check-prefix=RV32 %s
+# RUN:   -run-pass=riscv-make-compressible | FileCheck --check-prefixes=RV32,RV32C %s
 # RUN: llc -o - %s -mtriple=riscv64 -mattr=+c,+f,+d -simplify-mir \
-# RUN:   -run-pass=riscv-make-compressible | FileCheck --check-prefix=RV64 %s
+# RUN:   -run-pass=riscv-make-compressible | FileCheck --check-prefixes=RV64,RV64C %s
+# RUN: llc -o - %s -mtriple=riscv32 -mattr=+d,+zcf -simplify-mir \
+# RUN:   -run-pass=riscv-make-compressible | FileCheck --check-prefixes=RV32,RV32ZCF %s
+# RUN: llc -o - %s -mtriple=riscv32 -mattr=+d,+zca -simplify-mir \
+# RUN:   -run-pass=riscv-make-compressible | FileCheck --check-prefixes=RV32,RV32ZCA %s
+# RUN: llc -o - %s -mtriple=riscv64 -mattr=+d,+zca -simplify-mir \
+# RUN:   -run-pass=riscv-make-compressible | FileCheck --check-prefixes=RV64,RV64ZCA %s
 --- |
 
   define void @store_common_value(ptr %a, ptr %b, ptr %c) #0 {
@@ -288,7 +294,7 @@
     ret { double, double } %3
   }
 
-  attributes #0 = { minsize "target-features"="+c,+f,+d" }
+  attributes #0 = { minsize }
 
 ...
 ---
@@ -306,6 +312,7 @@ body:             |
     ; RV32-NEXT: SW $x13, killed renamable $x11, 0 :: (store (s32) into %ir.b)
     ; RV32-NEXT: SW $x13, killed renamable $x12, 0 :: (store (s32) into %ir.c)
     ; RV32-NEXT: PseudoRET
+    ;
     ; RV64-LABEL: name: store_common_value
     ; RV64: liveins: $x10, $x11, $x12
     ; RV64-NEXT: {{  $}}
@@ -327,14 +334,15 @@ body:             |
   bb.0.entry:
     liveins: $x10, $x11, $x12, $f16_f
 
-    ; RV32-LABEL: name: store_common_value_float
-    ; RV32: liveins: $x10, $x11, $x12, $f16_f
-    ; RV32-NEXT: {{  $}}
-    ; RV32-NEXT: $f15_f = FSGNJ_S $f16_f, $f16_f
-    ; RV32-NEXT: FSW $f15_f, killed renamable $x10, 0 :: (store (s32) into %ir.a)
-    ; RV32-NEXT: FSW $f15_f, killed renamable $x11, 0 :: (store (s32) into %ir.b)
-    ; RV32-NEXT: FSW killed $f15_f, killed renamable $x12, 0 :: (store (s32) into %ir.c)
-    ; RV32-NEXT: PseudoRET
+    ; RV32C-LABEL: name: store_common_value_float
+    ; RV32C: liveins: $x10, $x11, $x12, $f16_f
+    ; RV32C-NEXT: {{  $}}
+    ; RV32C-NEXT: $f15_f = FSGNJ_S $f16_f, $f16_f
+    ; RV32C-NEXT: FSW $f15_f, killed renamable $x10, 0 :: (store (s32) into %ir.a)
+    ; RV32C-NEXT: FSW $f15_f, killed renamable $x11, 0 :: (store (s32) into %ir.b)
+    ; RV32C-NEXT: FSW killed $f15_f, killed renamable $x12, 0 :: (store (s32) into %ir.c)
+    ; RV32C-NEXT: PseudoRET
+    ;
     ; RV64-LABEL: name: store_common_value_float
     ; RV64: liveins: $x10, $x11, $x12, $f16_f
     ; RV64-NEXT: {{  $}}
@@ -342,6 +350,23 @@ body:             |
     ; RV64-NEXT: FSW renamable $f16_f, killed renamable $x11, 0 :: (store (s32) into %ir.b)
     ; RV64-NEXT: FSW killed renamable $f16_f, killed renamable $x12, 0 :: (store (s32) into %ir.c)
     ; RV64-NEXT: PseudoRET
+    ;
+    ; RV32ZCF-LABEL: name: store_common_value_float
+    ; RV32ZCF: liveins: $x10, $x11, $x12, $f16_f
+    ; RV32ZCF-NEXT: {{  $}}
+    ; RV32ZCF-NEXT: $f15_f = FSGNJ_S $f16_f, $f16_f
+    ; RV32ZCF-NEXT: FSW $f15_f, killed renamable $x10, 0 :: (store (s32) into %ir.a)
+    ; RV32ZCF-NEXT: FSW $f15_f, killed renamable $x11, 0 :: (store (s32) into %ir.b)
+    ; RV32ZCF-NEXT: FSW killed $f15_f, killed renamable $x12, 0 :: (store (s32) into %ir.c)
+    ; RV32ZCF-NEXT: PseudoRET
+    ;
+    ; RV32ZCA-LABEL: name: store_common_value_float
+    ; RV32ZCA: liveins: $x10, $x11, $x12, $f16_f
+    ; RV32ZCA-NEXT: {{  $}}
+    ; RV32ZCA-NEXT: FSW renamable $f16_f, killed renamable $x10, 0 :: (store (s32) into %ir.a)
+    ; RV32ZCA-NEXT: FSW renamable $f16_f, killed renamable $x11, 0 :: (store (s32) into %ir.b)
+    ; RV32ZCA-NEXT: FSW killed renamable $f16_f, killed renamable $x12, 0 :: (store (s32) into %ir.c)
+    ; RV32ZCA-NEXT: PseudoRET
     FSW renamable $f16_f, killed renamable $x10, 0 :: (store (s32) into %ir.a)
     FSW renamable $f16_f, killed renamable $x11, 0 :: (store (s32) into %ir.b)
     FSW killed renamable $f16_f, killed renamable $x12, 0 :: (store (s32) into %ir.c)
@@ -355,22 +380,47 @@ body:             |
   bb.0.entry:
     liveins: $x10, $x11, $x12, $f16_d
 
-    ; RV32-LABEL: name: store_common_value_double
-    ; RV32: liveins: $x10, $x11, $x12, $f16_d
-    ; RV32-NEXT: {{  $}}
-    ; RV32-NEXT: $f15_d = FSGNJ_D $f16_d, $f16_d
-    ; RV32-NEXT: FSD $f15_d, killed renamable $x10, 0 :: (store (s64) into %ir.a)
-    ; RV32-NEXT: FSD $f15_d, killed renamable $x11, 0 :: (store (s64) into %ir.b)
-    ; RV32-NEXT: FSD killed $f15_d, killed renamable $x12, 0 :: (store (s64) into %ir.c)
-    ; RV32-NEXT: PseudoRET
-    ; RV64-LABEL: name: store_common_value_double
-    ; RV64: liveins: $x10, $x11, $x12, $f16_d
-    ; RV64-NEXT: {{  $}}
-    ; RV64-NEXT: $f15_d = FSGNJ_D $f16_d, $f16_d
-    ; RV64-NEXT: FSD $f15_d, killed renamable $x10, 0 :: (store (s64) into %ir.a)
-    ; RV64-NEXT: FSD $f15_d, killed renamable $x11, 0 :: (store (s64) into %ir.b)
-    ; RV64-NEXT: FSD killed $f15_d, killed renamable $x12, 0 :: (store (s64) into %ir.c)
-    ; RV64-NEXT: PseudoRET
+    ; RV32C-LABEL: name: store_common_value_double
+    ; RV32C: liveins: $x10, $x11, $x12, $f16_d
+    ; RV32C-NEXT: {{  $}}
+    ; RV32C-NEXT: $f15_d = FSGNJ_D $f16_d, $f16_d
+    ; RV32C-NEXT: FSD $f15_d, killed renamable $x10, 0 :: (store (s64) into %ir.a)
+    ; RV32C-NEXT: FSD $f15_d, killed renamable $x11, 0 :: (store (s64) into %ir.b)
+    ; RV32C-NEXT: FSD killed $f15_d, killed renamable $x12, 0 :: (store (s64) into %ir.c)
+    ; RV32C-NEXT: PseudoRET
+    ;
+    ; RV64C-LABEL: name: store_common_value_double
+    ; RV64C: liveins: $x10, $x11, $x12, $f16_d
+    ; RV64C-NEXT: {{  $}}
+    ; RV64C-NEXT: $f15_d = FSGNJ_D $f16_d, $f16_d
+    ; RV64C-NEXT: FSD $f15_d, killed renamable $x10, 0 :: (store (s64) into %ir.a)
+    ; RV64C-NEXT: FSD $f15_d, killed renamable $x11, 0 :: (store (s64) into %ir.b)
+    ; RV64C-NEXT: FSD killed $f15_d, killed renamable $x12, 0 :: (store (s64) into %ir.c)
+    ; RV64C-NEXT: PseudoRET
+    ;
+    ; RV32ZCF-LABEL: name: store_common_value_double
+    ; RV32ZCF: liveins: $x10, $x11, $x12, $f16_d
+    ; RV32ZCF-NEXT: {{  $}}
+    ; RV32ZCF-NEXT: FSD renamable $f16_d, killed renamable $x10, 0 :: (store (s64) into %ir.a)
+    ; RV32ZCF-NEXT: FSD renamable $f16_d, killed renamable $x11, 0 :: (store (s64) into %ir.b)
+    ; RV32ZCF-NEXT: FSD killed renamable $f16_d, killed renamable $x12, 0 :: (store (s64) into %ir.c)
+    ; RV32ZCF-NEXT: PseudoRET
+    ;
+    ; RV32ZCA-LABEL: name: store_common_value_double
+    ; RV32ZCA: liveins: $x10, $x11, $x12, $f16_d
+    ; RV32ZCA-NEXT: {{  $}}
+    ; RV32ZCA-NEXT: FSD renamable $f16_d, killed renamable $x10, 0 :: (store (s64) into %ir.a)
+    ; RV32ZCA-NEXT: FSD renamable $f16_d, killed renamable $x11, 0 :: (store (s64) into %ir.b)
+    ; RV32ZCA-NEXT: FSD killed renamable $f16_d, killed renamable $x12, 0 :: (store (s64) into %ir.c)
+    ; RV32ZCA-NEXT: PseudoRET
+    ;
+    ; RV64ZCA-LABEL: name: store_common_value_double
+    ; RV64ZCA: liveins: $x10, $x11, $x12, $f16_d
+    ; RV64ZCA-NEXT: {{  $}}
+    ; RV64ZCA-NEXT: FSD renamable $f16_d, killed renamable $x10, 0 :: (store (s64) into %ir.a)
+    ; RV64ZCA-NEXT: FSD renamable $f16_d, killed renamable $x11, 0 :: (store (s64) into %ir.b)
+    ; RV64ZCA-NEXT: FSD killed renamable $f16_d, killed renamable $x12, 0 :: (store (s64) into %ir.c)
+    ; RV64ZCA-NEXT: PseudoRET
     FSD renamable $f16_d, killed renamable $x10, 0 :: (store (s64) into %ir.a)
     FSD renamable $f16_d, killed renamable $x11, 0 :: (store (s64) into %ir.b)
     FSD killed renamable $f16_d, killed renamable $x12, 0 :: (store (s64) into %ir.c)
@@ -395,6 +445,7 @@ body:             |
     ; RV32-NEXT: renamable $x10 = ADDI $x0, 5
     ; RV32-NEXT: SW killed renamable $x10, killed $x11, 0 :: (volatile store (s32) into %ir.p)
     ; RV32-NEXT: PseudoRET
+    ;
     ; RV64-LABEL: name: store_common_ptr
     ; RV64: liveins: $x16
     ; RV64-NEXT: {{  $}}
@@ -432,6 +483,7 @@ body:             |
     ; RV32-NEXT: SW killed renamable $x10, $x11, 0 :: (volatile store (s32) into %ir.p)
     ; RV32-NEXT: SW killed $x11, $x11, 0 :: (volatile store (s32) into %ir.q)
     ; RV32-NEXT: PseudoRET
+    ;
     ; RV64-LABEL: name: store_common_ptr_self
     ; RV64: liveins: $x16
     ; RV64-NEXT: {{  $}}
@@ -457,14 +509,15 @@ body:             |
   bb.0.entry:
     liveins: $x16, $f10_f, $f11_f, $f12_f
 
-    ; RV32-LABEL: name: store_common_ptr_float
-    ; RV32: liveins: $x16, $f10_f, $f11_f, $f12_f
-    ; RV32-NEXT: {{  $}}
-    ; RV32-NEXT: $x10 = ADDI $x16, 0
-    ; RV32-NEXT: FSW killed renamable $f10_f, $x10, 0 :: (volatile store (s32) into %ir.p)
-    ; RV32-NEXT: FSW killed renamable $f11_f, $x10, 0 :: (volatile store (s32) into %ir.p)
-    ; RV32-NEXT: FSW killed renamable $f12_f, killed $x10, 0 :: (volatile store (s32) into %ir.p)
-    ; RV32-NEXT: PseudoRET
+    ; RV32C-LABEL: name: store_common_ptr_float
+    ; RV32C: liveins: $x16, $f10_f, $f11_f, $f12_f
+    ; RV32C-NEXT: {{  $}}
+    ; RV32C-NEXT: $x10 = ADDI $x16, 0
+    ; RV32C-NEXT: FSW killed renamable $f10_f, $x10, 0 :: (volatile store (s32) into %ir.p)
+    ; RV32C-NEXT: FSW killed renamable $f11_f, $x10, 0 :: (volatile store (s32) into %ir.p)
+    ; RV32C-NEXT: FSW killed renamable $f12_f, killed $x10, 0 :: (volatile store (s32) into %ir.p)
+    ; RV32C-NEXT: PseudoRET
+    ;
     ; RV64-LABEL: name: store_common_ptr_float
     ; RV64: liveins: $x16, $f10_f, $f11_f, $f12_f
     ; RV64-NEXT: {{  $}}
@@ -472,6 +525,23 @@ body:             |
     ; RV64-NEXT: FSW killed renamable $f11_f, renamable $x16, 0 :: (volatile store (s32) into %ir.p)
     ; RV64-NEXT: FSW killed renamable $f12_f, killed renamable $x16, 0 :: (volatile store (s32) into %ir.p)
     ; RV64-NEXT: PseudoRET
+    ;
+    ; RV32ZCF-LABEL: name: store_common_ptr_float
+    ; RV32ZCF: liveins: $x16, $f10_f, $f11_f, $f12_f
+    ; RV32ZCF-NEXT: {{  $}}
+    ; RV32ZCF-NEXT: $x10 = ADDI $x16, 0
+    ; RV32ZCF-NEXT: FSW killed renamable $f10_f, $x10, 0 :: (volatile store (s32) into %ir.p)
+    ; RV32ZCF-NEXT: FSW killed renamable $f11_f, $x10, 0 :: (volatile store (s32) into %ir.p)
+    ; RV32ZCF-NEXT: FSW killed renamable $f12_f, killed $x10, 0 :: (volatile store (s32) into %ir.p)
+    ; RV32ZCF-NEXT: PseudoRET
+    ;
+    ; RV32ZCA-LABEL: name: store_common_ptr_float
+    ; RV32ZCA: liveins: $x16, $f10_f, $f11_f, $f12_f
+    ; RV32ZCA-NEXT: {{  $}}
+    ; RV32ZCA-NEXT: FSW killed renamable $f10_f, renamable $x16, 0 :: (volatile store (s32) into %ir.p)
+    ; RV32ZCA-NEXT: FSW killed renamable $f11_f, renamable $x16, 0 :: (volatile store (s32) into %ir.p)
+    ; RV32ZCA-NEXT: FSW killed renamable $f12_f, killed renamable $x16, 0 :: (volatile store (s32) into %ir.p)
+    ; RV32ZCA-NEXT: PseudoRET
     FSW killed renamable $f10_f, renamable $x16, 0 :: (volatile store (s32) into %ir.p)
     FSW killed renamable $f11_f, renamable $x16, 0 :: (volatile store (s32) into %ir.p)
     FSW killed renamable $f12_f, killed renamable $x16, 0 :: (volatile store (s32) into %ir.p)
@@ -485,22 +555,47 @@ body:             |
   bb.0.entry:
     liveins: $x16, $f10_d, $f11_d, $f12_d
 
-    ; RV32-LABEL: name: store_common_ptr_double
-    ; RV32: liveins: $x16, $f10_d, $f11_d, $f12_d
-    ; RV32-NEXT: {{  $}}
-    ; RV32-NEXT: $x10 = ADDI $x16, 0
-    ; RV32-NEXT: FSD killed renamable $f10_d, $x10, 0 :: (volatile store (s64) into %ir.p)
-    ; RV32-NEXT: FSD killed renamable $f11_d, $x10, 0 :: (volatile store (s64) into %ir.p)
-    ; RV32-NEXT: FSD killed renamable $f12_d, killed $x10, 0 :: (volatile store (s64) into %ir.p)
-    ; RV32-NEXT: PseudoRET
-    ; RV64-LABEL: name: store_common_ptr_double
-    ; RV64: liveins: $x16, $f10_d, $f11_d, $f12_d
-    ; RV64-NEXT: {{  $}}
-    ; RV64-NEXT: $x10 = ADDI $x16, 0
-    ; RV64-NEXT: FSD killed renamable $f10_d, $x10, 0 :: (volatile store (s64) into %ir.p)
-    ; RV64-NEXT: FSD killed renamable $f11_d, $x10, 0 :: (volatile store (s64) into %ir.p)
-    ; RV64-NEXT: FSD killed renamable $f12_d, killed $x10, 0 :: (volatile store (s64) into %ir.p)
-    ; RV64-NEXT: PseudoRET
+    ; RV32C-LABEL: name: store_common_ptr_double
+    ; RV32C: liveins: $x16, $f10_d, $f11_d, $f12_d
+    ; RV32C-NEXT: {{  $}}
+    ; RV32C-NEXT: $x10 = ADDI $x16, 0
+    ; RV32C-NEXT: FSD killed renamable $f10_d, $x10, 0 :: (volatile store (s64) into %ir.p)
+    ; RV32C-NEXT: FSD killed renamable $f11_d, $x10, 0 :: (volatile store (s64) into %ir.p)
+    ; RV32C-NEXT: FSD killed renamable $f12_d, killed $x10, 0 :: (volatile store (s64) into %ir.p)
+    ; RV32C-NEXT: PseudoRET
+    ;
+    ; RV64C-LABEL: name: store_common_ptr_double
+    ; RV64C: liveins: $x16, $f10_d, $f11_d, $f12_d
+    ; RV64C-NEXT: {{  $}}
+    ; RV64C-NEXT: $x10 = ADDI $x16, 0
+    ; RV64C-NEXT: FSD killed renamable $f10_d, $x10, 0 :: (volatile store (s64) into %ir.p)
+    ; RV64C-NEXT: FSD killed renamable $f11_d, $x10, 0 :: (volatile store (s64) into %ir.p)
+    ; RV64C-NEXT: FSD killed renamable $f12_d, killed $x10, 0 :: (volatile store (s64) into %ir.p)
+    ; RV64C-NEXT: PseudoRET
+    ;
+    ; RV32ZCF-LABEL: name: store_common_ptr_double
+    ; RV32ZCF: liveins: $x16, $f10_d, $f11_d, $f12_d
+    ; RV32ZCF-NEXT: {{  $}}
+    ; RV32ZCF-NEXT: FSD killed renamable $f10_d, renamable $x16, 0 :: (volatile store (s64) into %ir.p)
+    ; RV32ZCF-NEXT: FSD killed renamable $f11_d, renamable $x16, 0 :: (volatile store (s64) into %ir.p)
+    ; RV32ZCF-NEXT: FSD killed renamable $f12_d, killed renamable $x16, 0 :: (volatile store (s64) into %ir.p)
+    ; RV32ZCF-NEXT: PseudoRET
+    ;
+    ; RV32ZCA-LABEL: name: store_common_ptr_double
+    ; RV32ZCA: liveins: $x16, $f10_d, $f11_d, $f12_d
+    ; RV32ZCA-NEXT: {{  $}}
+    ; RV32ZCA-NEXT: FSD killed renamable $f10_d, renamable $x16, 0 :: (volatile store (s64) into %ir.p)
+    ; RV32ZCA-NEXT: FSD killed renamable $f11_d, renamable $x16, 0 :: (volatile store (s64) into %ir.p)
+    ; RV32ZCA-NEXT: FSD killed renamable $f12_d, killed renamable $x16, 0 :: (volatile store (s64) into %ir.p)
+    ; RV32ZCA-NEXT: PseudoRET
+    ;
+    ; RV64ZCA-LABEL: name: store_common_ptr_double
+    ; RV64ZCA: liveins: $x16, $f10_d, $f11_d, $f12_d
+    ; RV64ZCA-NEXT: {{  $}}
+    ; RV64ZCA-NEXT: FSD killed renamable $f10_d, renamable $x16, 0 :: (volatile store (s64) into %ir.p)
+    ; RV64ZCA-NEXT: FSD killed renamable $f11_d, renamable $x16, 0 :: (volatile store (s64) into %ir.p)
+    ; RV64ZCA-NEXT: FSD killed renamable $f12_d, killed renamable $x16, 0 :: (volatile store (s64) into %ir.p)
+    ; RV64ZCA-NEXT: PseudoRET
     FSD killed renamable $f10_d, renamable $x16, 0 :: (volatile store (s64) into %ir.p)
     FSD killed renamable $f11_d, renamable $x16, 0 :: (volatile store (s64) into %ir.p)
     FSD killed renamable $f12_d, killed renamable $x16, 0 :: (volatile store (s64) into %ir.p)
@@ -522,6 +617,7 @@ body:             |
     ; RV32-NEXT: dead renamable $x10 = LW $x11, 0 :: (volatile load (s32) from %ir.p)
     ; RV32-NEXT: dead renamable $x10 = LW killed $x11, 0 :: (volatile load (s32) from %ir.p)
     ; RV32-NEXT: PseudoRET
+    ;
     ; RV64-LABEL: name: load_common_ptr
     ; RV64: liveins: $x16
     ; RV64-NEXT: {{  $}}
@@ -543,14 +639,15 @@ body:             |
   bb.0.entry:
     liveins: $x16
 
-    ; RV32-LABEL: name: load_common_ptr_float
-    ; RV32: liveins: $x16
-    ; RV32-NEXT: {{  $}}
-    ; RV32-NEXT: $x10 = ADDI $x16, 0
-    ; RV32-NEXT: renamable $f10_f = FLW $x10, 0 :: (load (s32) from %ir.g)
-    ; RV32-NEXT: renamable $f11_f = FLW $x10, 4 :: (load (s32) from %ir.arrayidx1)
-    ; RV32-NEXT: renamable $f12_f = FLW killed $x10, 8 :: (load (s32) from %ir.arrayidx2)
-    ; RV32-NEXT: PseudoTAIL target-flags(riscv-call) @load_common_ptr_float_1, implicit $x2, implicit $f10_f, implicit $f11_f, implicit $f12_f
+    ; RV32C-LABEL: name: load_common_ptr_float
+    ; RV32C: liveins: $x16
+    ; RV32C-NEXT: {{  $}}
+    ; RV32C-NEXT: $x10 = ADDI $x16, 0
+    ; RV32C-NEXT: renamable $f10_f = FLW $x10, 0 :: (load (s32) from %ir.g)
+    ; RV32C-NEXT: renamable $f11_f = FLW $x10, 4 :: (load (s32) from %ir.arrayidx1)
+    ; RV32C-NEXT: renamable $f12_f = FLW killed $x10, 8 :: (load (s32) from %ir.arrayidx2)
+    ; RV32C-NEXT: PseudoTAIL target-flags(riscv-call) @load_common_ptr_float_1, implicit $x2, implicit $f10_f, implicit $f11_f, implicit $f12_f
+    ;
     ; RV64-LABEL: name: load_common_ptr_float
     ; RV64: liveins: $x16
     ; RV64-NEXT: {{  $}}
@@ -558,6 +655,23 @@ body:             |
     ; RV64-NEXT: renamable $f11_f = FLW renamable $x16, 4 :: (load (s32) from %ir.arrayidx1)
     ; RV64-NEXT: renamable $f12_f = FLW killed renamable $x16, 8 :: (load (s32) from %ir.arrayidx2)
     ; RV64-NEXT: PseudoTAIL target-flags(riscv-call) @load_common_ptr_float_1, implicit $x2, implicit $f10_f, implicit $f11_f, implicit $f12_f
+    ;
+    ; RV32ZCF-LABEL: name: load_common_ptr_float
+    ; RV32ZCF: liveins: $x16
+    ; RV32ZCF-NEXT: {{  $}}
+    ; RV32ZCF-NEXT: $x10 = ADDI $x16, 0
+    ; RV32ZCF-NEXT: renamable $f10_f = FLW $x10, 0 :: (load (s32) from %ir.g)
+    ; RV32ZCF-NEXT: renamable $f11_f = FLW $x10, 4 :: (load (s32) from %ir.arrayidx1)
+    ; RV32ZCF-NEXT: renamable $f12_f = FLW killed $x10, 8 :: (load (s32) from %ir.arrayidx2)
+    ; RV32ZCF-NEXT: PseudoTAIL target-flags(riscv-call) @load_common_ptr_float_1, implicit $x2, implicit $f10_f, implicit $f11_f, implicit $f12_f
+    ;
+    ; RV32ZCA-LABEL: name: load_common_ptr_float
+    ; RV32ZCA: liveins: $x16
+    ; RV32ZCA-NEXT: {{  $}}
+    ; RV32ZCA-NEXT: renamable $f10_f = FLW renamable $x16, 0 :: (load (s32) from %ir.g)
+    ; RV32ZCA-NEXT: renamable $f11_f = FLW renamable $x16, 4 :: (load (s32) from %ir.arrayidx1)
+    ; RV32ZCA-NEXT: renamable $f12_f = FLW killed renamable $x16, 8 :: (load (s32) from %ir.arrayidx2)
+    ; RV32ZCA-NEXT: PseudoTAIL target-flags(riscv-call) @load_common_ptr_float_1, implicit $x2, implicit $f10_f, implicit $f11_f, implicit $f12_f
     renamable $f10_f = FLW renamable $x16, 0 :: (load (s32) from %ir.g)
     renamable $f11_f = FLW renamable $x16, 4 :: (load (s32) from %ir.arrayidx1)
     renamable $f12_f = FLW killed renamable $x16, 8 :: (load (s32) from %ir.arrayidx2)
@@ -571,22 +685,47 @@ body:             |
   bb.0.entry:
     liveins: $x16
 
-    ; RV32-LABEL: name: load_common_ptr_double
-    ; RV32: liveins: $x16
-    ; RV32-NEXT: {{  $}}
-    ; RV32-NEXT: $x10 = ADDI $x16, 0
-    ; RV32-NEXT: renamable $f10_d = FLD $x10, 0 :: (load (s64) from %ir.g)
-    ; RV32-NEXT: renamable $f11_d = FLD $x10, 8 :: (load (s64) from %ir.arrayidx1)
-    ; RV32-NEXT: renamable $f12_d = FLD killed $x10, 16 :: (load (s64) from %ir.arrayidx2)
-    ; RV32-NEXT: PseudoTAIL target-flags(riscv-call) @load_common_ptr_double_1, implicit $x2, implicit $f10_d, implicit $f11_d, implicit $f12_d
-    ; RV64-LABEL: name: load_common_ptr_double
-    ; RV64: liveins: $x16
-    ; RV64-NEXT: {{  $}}
-    ; RV64-NEXT: $x10 = ADDI $x16, 0
-    ; RV64-NEXT: renamable $f10_d = FLD $x10, 0 :: (load (s64) from %ir.g)
-    ; RV64-NEXT: renamable $f11_d = FLD $x10, 8 :: (load (s64) from %ir.arrayidx1)
-    ; RV64-NEXT: renamable $f12_d = FLD killed $x10, 16 :: (load (s64) from %ir.arrayidx2)
-    ; RV64-NEXT: PseudoTAIL target-flags(riscv-call) @load_common_ptr_double_1, implicit $x2, implicit $f10_d, implicit $f11_d, implicit $f12_d
+    ; RV32C-LABEL: name: load_common_ptr_double
+    ; RV32C: liveins: $x16
+    ; RV32C-NEXT: {{  $}}
+    ; RV32C-NEXT: $x10 = ADDI $x16, 0
+    ; RV32C-NEXT: renamable $f10_d = FLD $x10, 0 :: (load (s64) from %ir.g)
+    ; RV32C-NEXT: renamable $f11_d = FLD $x10, 8 :: (load (s64) from %ir.arrayidx1)
+    ; RV32C-NEXT: renamable $f12_d = FLD killed $x10, 16 :: (load (s64) from %ir.arrayidx2)
+    ; RV32C-NEXT: PseudoTAIL target-flags(riscv-call) @load_common_ptr_double_1, implicit $x2, implicit $f10_d, implicit $f11_d, implicit $f12_d
+    ;
+    ; RV64C-LABEL: name: load_common_ptr_double
+    ; RV64C: liveins: $x16
+    ; RV64C-NEXT: {{  $}}
+    ; RV64C-NEXT: $x10 = ADDI $x16, 0
+    ; RV64C-NEXT: renamable $f10_d = FLD $x10, 0 :: (load (s64) from %ir.g)
+    ; RV64C-NEXT: renamable $f11_d = FLD $x10, 8 :: (load (s64) from %ir.arrayidx1)
+    ; RV64C-NEXT: renamable $f12_d = FLD killed $x10, 16 :: (load (s64) from %ir.arrayidx2)
+    ; RV64C-NEXT: PseudoTAIL target-flags(riscv-call) @load_common_ptr_double_1, implicit $x2, implicit $f10_d, implicit $f11_d, implicit $f12_d
+    ;
+    ; RV32ZCF-LABEL: name: load_common_ptr_double
+    ; RV32ZCF: liveins: $x16
+    ; RV32ZCF-NEXT: {{  $}}
+    ; RV32ZCF-NEXT: renamable $f10_d = FLD renamable $x16, 0 :: (load (s64) from %ir.g)
+    ; RV32ZCF-NEXT: renamable $f11_d = FLD renamable $x16, 8 :: (load (s64) from %ir.arrayidx1)
+    ; RV32ZCF-NEXT: renamable $f12_d = FLD killed renamable $x16, 16 :: (load (s64) from %ir.arrayidx2)
+    ; RV32ZCF-NEXT: PseudoTAIL target-flags(riscv-call) @load_common_ptr_double_1, implicit $x2, implicit $f10_d, implicit $f11_d, implicit $f12_d
+    ;
+    ; RV32ZCA-LABEL: name: load_common_ptr_double
+    ; RV32ZCA: liveins: $x16
+    ; RV32ZCA-NEXT: {{  $}}
+    ; RV32ZCA-NEXT: renamable $f10_d = FLD renamable $x16, 0 :: (load (s64) from %ir.g)
+    ; RV32ZCA-NEXT: renamable $f11_d = FLD renamable $x16, 8 :: (load (s64) from %ir.arrayidx1)
+    ; RV32ZCA-NEXT: renamable $f12_d = FLD killed renamable $x16, 16 :: (load (s64) from %ir.arrayidx2)
+    ; RV32ZCA-NEXT: PseudoTAIL target-flags(riscv-call) @load_common_ptr_double_1, implicit $x2, implicit $f10_d, implicit $f11_d, implicit $f12_d
+    ;
+    ; RV64ZCA-LABEL: name: load_common_ptr_double
+    ; RV64ZCA: liveins: $x16
+    ; RV64ZCA-NEXT: {{  $}}
+    ; RV64ZCA-NEXT: renamable $f10_d = FLD renamable $x16, 0 :: (load (s64) from %ir.g)
+    ; RV64ZCA-NEXT: renamable $f11_d = FLD renamable $x16, 8 :: (load (s64) from %ir.arrayidx1)
+    ; RV64ZCA-NEXT: renamable $f12_d = FLD killed renamable $x16, 16 :: (load (s64) from %ir.arrayidx2)
+    ; RV64ZCA-NEXT: PseudoTAIL target-flags(riscv-call) @load_common_ptr_double_1, implicit $x2, implicit $f10_d, implicit $f11_d, implicit $f12_d
     renamable $f10_d = FLD renamable $x16, 0 :: (load (s64) from %ir.g)
     renamable $f11_d = FLD renamable $x16, 8 :: (load (s64) from %ir.arrayidx1)
     renamable $f12_d = FLD killed renamable $x16, 16 :: (load (s64) from %ir.arrayidx2)
@@ -613,6 +752,7 @@ body:             |
     ; RV32-NEXT: renamable $x11 = ADDI $x0, 7
     ; RV32-NEXT: SW killed renamable $x11, killed $x12, 28 :: (volatile store (s32) into %ir.3)
     ; RV32-NEXT: PseudoRET
+    ;
     ; RV64-LABEL: name: store_large_offset
     ; RV64: liveins: $x10
     ; RV64-NEXT: {{  $}}
@@ -644,15 +784,16 @@ body:             |
   bb.0.entry:
     liveins: $x10, $f10_f, $f11_f, $f12_f, $f13_f
 
-    ; RV32-LABEL: name: store_large_offset_float
-    ; RV32: liveins: $x10, $f10_f, $f11_f, $f12_f, $f13_f
-    ; RV32-NEXT: {{  $}}
-    ; RV32-NEXT: $x11 = ADDI $x10, 384
-    ; RV32-NEXT: FSW killed renamable $f10_f, $x11, 16 :: (volatile store (s32) into %ir.0)
-    ; RV32-NEXT: FSW killed renamable $f11_f, $x11, 20 :: (volatile store (s32) into %ir.1)
-    ; RV32-NEXT: FSW killed renamable $f12_f, $x11, 24 :: (volatile store (s32) into %ir.2)
-    ; RV32-NEXT: FSW killed renamable $f13_f, killed $x11, 28 :: (volatile store (s32) into %ir.3)
-    ; RV32-NEXT: PseudoRET
+    ; RV32C-LABEL: name: store_large_offset_float
+    ; RV32C: liveins: $x10, $f10_f, $f11_f, $f12_f, $f13_f
+    ; RV32C-NEXT: {{  $}}
+    ; RV32C-NEXT: $x11 = ADDI $x10, 384
+    ; RV32C-NEXT: FSW killed renamable $f10_f, $x11, 16 :: (volatile store (s32) into %ir.0)
+    ; RV32C-NEXT: FSW killed renamable $f11_f, $x11, 20 :: (volatile store (s32) into %ir.1)
+    ; RV32C-NEXT: FSW killed renamable $f12_f, $x11, 24 :: (volatile store (s32) into %ir.2)
+    ; RV32C-NEXT: FSW killed renamable $f13_f, killed $x11, 28 :: (volatile store (s32) into %ir.3)
+    ; RV32C-NEXT: PseudoRET
+    ;
     ; RV64-LABEL: name: store_large_offset_float
     ; RV64: liveins: $x10, $f10_f, $f11_f, $f12_f, $f13_f
     ; RV64-NEXT: {{  $}}
@@ -661,6 +802,25 @@ body:             |
     ; RV64-NEXT: FSW killed renamable $f12_f, renamable $x10, 408 :: (volatile store (s32) into %ir.2)
     ; RV64-NEXT: FSW killed renamable $f13_f, killed renamable $x10, 412 :: (volatile store (s32) into %ir.3)
     ; RV64-NEXT: PseudoRET
+    ;
+    ; RV32ZCF-LABEL: name: store_large_offset_float
+    ; RV32ZCF: liveins: $x10, $f10_f, $f11_f, $f12_f, $f13_f
+    ; RV32ZCF-NEXT: {{  $}}
+    ; RV32ZCF-NEXT: $x11 = ADDI $x10, 384
+    ; RV32ZCF-NEXT: FSW killed renamable $f10_f, $x11, 16 :: (volatile store (s32) into %ir.0)
+    ; RV32ZCF-NEXT: FSW killed renamable $f11_f, $x11, 20 :: (volatile store (s32) into %ir.1)
+    ; RV32ZCF-NEXT: FSW killed renamable $f12_f, $x11, 24 :: (volatile store (s32) into %ir.2)
+    ; RV32ZCF-NEXT: FSW killed renamable $f13_f, killed $x11, 28 :: (volatile store (s32) into %ir.3)
+    ; RV32ZCF-NEXT: PseudoRET
+    ;
+    ; RV32ZCA-LABEL: name: store_large_offset_float
+    ; RV32ZCA: liveins: $x10, $f10_f, $f11_f, $f12_f, $f13_f
+    ; RV32ZCA-NEXT: {{  $}}
+    ; RV32ZCA-NEXT: FSW killed renamable $f10_f, renamable $x10, 400 :: (volatile store (s32) into %ir.0)
+    ; RV32ZCA-NEXT: FSW killed renamable $f11_f, renamable $x10, 404 :: (volatile store (s32) into %ir.1)
+    ; RV32ZCA-NEXT: FSW killed renamable $f12_f, renamable $x10, 408 :: (volatile store (s32) into %ir.2)
+    ; RV32ZCA-NEXT: FSW killed renamable $f13_f, killed renamable $x10, 412 :: (volatile store (s32) into %ir.3)
+    ; RV32ZCA-NEXT: PseudoRET
     FSW killed renamable $f10_f, renamable $x10, 400 :: (volatile store (s32) into %ir.0)
     FSW killed renamable $f11_f, renamable $x10, 404 :: (volatile store (s32) into %ir.1)
     FSW killed renamable $f12_f, renamable $x10, 408 :: (volatile store (s32) into %ir.2)
@@ -675,24 +835,52 @@ body:             |
   bb.0.entry:
     liveins: $x10, $f10_d, $f11_d, $f12_d, $f13_d
 
-    ; RV32-LABEL: name: store_large_offset_double
-    ; RV32: liveins: $x10, $f10_d, $f11_d, $f12_d, $f13_d
-    ; RV32-NEXT: {{  $}}
-    ; RV32-NEXT: $x11 = ADDI $x10, 768
-    ; RV32-NEXT: FSD killed renamable $f10_d, $x11, 32 :: (volatile store (s64) into %ir.0)
-    ; RV32-NEXT: FSD killed renamable $f11_d, $x11, 40 :: (volatile store (s64) into %ir.1)
-    ; RV32-NEXT: FSD killed renamable $f12_d, $x11, 48 :: (volatile store (s64) into %ir.2)
-    ; RV32-NEXT: FSD killed renamable $f13_d, killed $x11, 56 :: (volatile store (s64) into %ir.3)
-    ; RV32-NEXT: PseudoRET
-    ; RV64-LABEL: name: store_large_offset_double
-    ; RV64: liveins: $x10, $f10_d, $f11_d, $f12_d, $f13_d
-    ; RV64-NEXT: {{  $}}
-    ; RV64-NEXT: $x11 = ADDI $x10, 768
-    ; RV64-NEXT: FSD killed renamable $f10_d, $x11, 32 :: (volatile store (s64) into %ir.0)
-    ; RV64-NEXT: FSD killed renamable $f11_d, $x11, 40 :: (volatile store (s64) into %ir.1)
-    ; RV64-NEXT: FSD killed renamable $f12_d, $x11, 48 :: (volatile store (s64) into %ir.2)
-    ; RV64-NEXT: FSD killed renamable $f13_d, killed $x11, 56 :: (volatile store (s64) into %ir.3)
-    ; RV64-NEXT: PseudoRET
+    ; RV32C-LABEL: name: store_large_offset_double
+    ; RV32C: liveins: $x10, $f10_d, $f11_d, $f12_d, $f13_d
+    ; RV32C-NEXT: {{  $}}
+    ; RV32C-NEXT: $x11 = ADDI $x10, 768
+    ; RV32C-NEXT: FSD killed renamable $f10_d, $x11, 32 :: (volatile store (s64) into %ir.0)
+    ; RV32C-NEXT: FSD killed renamable $f11_d, $x11, 40 :: (volatile store (s64) into %ir.1)
+    ; RV32C-NEXT: FSD killed renamable $f12_d, $x11, 48 :: (volatile store (s64) into %ir.2)
+    ; RV32C-NEXT: FSD killed renamable $f13_d, killed $x11, 56 :: (volatile store (s64) into %ir.3)
+    ; RV32C-NEXT: PseudoRET
+    ;
+    ; RV64C-LABEL: name: store_large_offset_double
+    ; RV64C: liveins: $x10, $f10_d, $f11_d, $f12_d, $f13_d
+    ; RV64C-NEXT: {{  $}}
+    ; RV64C-NEXT: $x11 = ADDI $x10, 768
+    ; RV64C-NEXT: FSD killed renamable $f10_d, $x11, 32 :: (volatile store (s64) into %ir.0)
+    ; RV64C-NEXT: FSD killed renamable $f11_d, $x11, 40 :: (volatile store (s64) into %ir.1)
+    ; RV64C-NEXT: FSD killed renamable $f12_d, $x11, 48 :: (volatile store (s64) into %ir.2)
+    ; RV64C-NEXT: FSD killed renamable $f13_d, killed $x11, 56 :: (volatile store (s64) into %ir.3)
+    ; RV64C-NEXT: PseudoRET
+    ;
+    ; RV32ZCF-LABEL: name: store_large_offset_double
+    ; RV32ZCF: liveins: $x10, $f10_d, $f11_d, $f12_d, $f13_d
+    ; RV32ZCF-NEXT: {{  $}}
+    ; RV32ZCF-NEXT: FSD killed renamable $f10_d, renamable $x10, 800 :: (volatile store (s64) into %ir.0)
+    ; RV32ZCF-NEXT: FSD killed renamable $f11_d, renamable $x10, 808 :: (volatile store (s64) into %ir.1)
+    ; RV32ZCF-NEXT: FSD killed renamable $f12_d, renamable $x10, 816 :: (volatile store (s64) into %ir.2)
+    ; RV32ZCF-NEXT: FSD killed renamable $f13_d, killed renamable $x10, 824 :: (volatile store (s64) into %ir.3)
+    ; RV32ZCF-NEXT: PseudoRET
+    ;
+    ; RV32ZCA-LABEL: name: store_large_offset_double
+    ; RV32ZCA: liveins: $x10, $f10_d, $f11_d, $f12_d, $f13_d
+    ; RV32ZCA-NEXT: {{  $}}
+    ; RV32ZCA-NEXT: FSD killed renamable $f10_d, renamable $x10, 800 :: (volatile store (s64) into %ir.0)
+    ; RV32ZCA-NEXT: FSD killed renamable $f11_d, renamable $x10, 808 :: (volatile store (s64) into %ir.1)
+    ; RV32ZCA-NEXT: FSD killed renamable $f12_d, renamable $x10, 816 :: (volatile store (s64) into %ir.2)
+    ; RV32ZCA-NEXT: FSD killed renamable $f13_d, killed renamable $x10, 824 :: (volatile store (s64) into %ir.3)
+    ; RV32ZCA-NEXT: PseudoRET
+    ;
+    ; RV64ZCA-LABEL: name: store_large_offset_double
+    ; RV64ZCA: liveins: $x10, $f10_d, $f11_d, $f12_d, $f13_d
+    ; RV64ZCA-NEXT: {{  $}}
+    ; RV64ZCA-NEXT: FSD killed renamable $f10_d, renamable $x10, 800 :: (volatile store (s64) into %ir.0)
+    ; RV64ZCA-NEXT: FSD killed renamable $f11_d, renamable $x10, 808 :: (volatile store (s64) into %ir.1)
+    ; RV64ZCA-NEXT: FSD killed renamable $f12_d, renamable $x10, 816 :: (volatile store (s64) into %ir.2)
+    ; RV64ZCA-NEXT: FSD killed renamable $f13_d, killed renamable $x10, 824 :: (volatile store (s64) into %ir.3)
+    ; RV64ZCA-NEXT: PseudoRET
     FSD killed renamable $f10_d, renamable $x10, 800 :: (volatile store (s64) into %ir.0)
     FSD killed renamable $f11_d, renamable $x10, 808 :: (volatile store (s64) into %ir.1)
     FSD killed renamable $f12_d, renamable $x10, 816 :: (volatile store (s64) into %ir.2)
@@ -716,6 +904,7 @@ body:             |
     ; RV32-NEXT: dead renamable $x11 = LW $x12, 24 :: (volatile load (s32) from %ir.2)
     ; RV32-NEXT: dead renamable $x10 = LW killed $x12, 28 :: (volatile load (s32) from %ir.3)
     ; RV32-NEXT: PseudoRET
+    ;
     ; RV64-LABEL: name: load_large_offset
     ; RV64: liveins: $x10
     ; RV64-NEXT: {{  $}}
@@ -739,14 +928,15 @@ body:             |
   bb.0.entry:
     liveins: $x10
 
-    ; RV32-LABEL: name: load_large_offset_float
-    ; RV32: liveins: $x10
-    ; RV32-NEXT: {{  $}}
-    ; RV32-NEXT: $x11 = ADDI $x10, 384
-    ; RV32-NEXT: renamable $f10_f = FLW $x11, 16 :: (load (s32) from %ir.arrayidx)
-    ; RV32-NEXT: renamable $f11_f = FLW $x11, 20 :: (load (s32) from %ir.arrayidx1)
-    ; RV32-NEXT: renamable $f12_f = FLW killed $x11, 24 :: (load (s32) from %ir.arrayidx2)
-    ; RV32-NEXT: PseudoTAIL target-flags(riscv-call) @load_large_offset_float_1, implicit $x2, implicit $f10_f, implicit $f11_f, implicit $f12_f
+    ; RV32C-LABEL: name: load_large_offset_float
+    ; RV32C: liveins: $x10
+    ; RV32C-NEXT: {{  $}}
+    ; RV32C-NEXT: $x11 = ADDI $x10, 384
+    ; RV32C-NEXT: renamable $f10_f = FLW $x11, 16 :: (load (s32) from %ir.arrayidx)
+    ; RV32C-NEXT: renamable $f11_f = FLW $x11, 20 :: (load (s32) from %ir.arrayidx1)
+    ; RV32C-NEXT: renamable $f12_f = FLW killed $x11, 24 :: (load (s32) from %ir.arrayidx2)
+    ; RV32C-NEXT: PseudoTAIL target-flags(riscv-call) @load_large_offset_float_1, implicit $x2, implicit $f10_f, implicit $f11_f, implicit $f12_f
+    ;
     ; RV64-LABEL: name: load_large_offset_float
     ; RV64: liveins: $x10
     ; RV64-NEXT: {{  $}}
@@ -754,6 +944,23 @@ body:             |
     ; RV64-NEXT: renamable $f11_f = FLW renamable $x10, 404 :: (load (s32) from %ir.arrayidx1)
     ; RV64-NEXT: renamable $f12_f = FLW killed renamable $x10, 408 :: (load (s32) from %ir.arrayidx2)
     ; RV64-NEXT: PseudoTAIL target-flags(riscv-call) @load_large_offset_float_1, implicit $x2, implicit $f10_f, implicit $f11_f, implicit $f12_f
+    ;
+    ; RV32ZCF-LABEL: name: load_large_offset_float
+    ; RV32ZCF: liveins: $x10
+    ; RV32ZCF-NEXT: {{  $}}
+    ; RV32ZCF-NEXT: $x11 = ADDI $x10, 384
+    ; RV32ZCF-NEXT: renamable $f10_f = FLW $x11, 16 :: (load (s32) from %ir.arrayidx)
+    ; RV32ZCF-NEXT: renamable $f11_f = FLW $x11, 20 :: (load (s32) from %ir.arrayidx1)
+    ; RV32ZCF-NEXT: renamable $f12_f = FLW killed $x11, 24 :: (load (s32) from %ir.arrayidx2)
+    ; RV32ZCF-NEXT: PseudoTAIL target-flags(riscv-call) @load_large_offset_float_1, implicit $x2, implicit $f10_f, implicit $f11_f, implicit $f12_f
+    ;
+    ; RV32ZCA-LABEL: name: load_large_offset_float
+    ; RV32ZCA: liveins: $x10
+    ; RV32ZCA-NEXT: {{  $}}
+    ; RV32ZCA-NEXT: renamable $f10_f = FLW renamable $x10, 400 :: (load (s32) from %ir.arrayidx)
+    ; RV32ZCA-NEXT: renamable $f11_f = FLW renamable $x10, 404 :: (load (s32) from %ir.arrayidx1)
+    ; RV32ZCA-NEXT: renamable $f12_f = FLW killed renamable $x10, 408 :: (load (s32) from %ir.arrayidx2)
+    ; RV32ZCA-NEXT: PseudoTAIL target-flags(riscv-call) @load_large_offset_float_1, implicit $x2, implicit $f10_f, implicit $f11_f, implicit $f12_f
     renamable $f10_f = FLW renamable $x10, 400 :: (load (s32) from %ir.arrayidx)
     renamable $f11_f = FLW renamable $x10, 404 :: (load (s32) from %ir.arrayidx1)
     renamable $f12_f = FLW killed renamable $x10, 408 :: (load (s32) from %ir.arrayidx2)
@@ -767,22 +974,47 @@ body:             |
   bb.0.entry:
     liveins: $x10
 
-    ; RV32-LABEL: name: load_large_offset_double
-    ; RV32: liveins: $x10
-    ; RV32-NEXT: {{  $}}
-    ; RV32-NEXT: $x11 = ADDI $x10, 768
-    ; RV32-NEXT: renamable $f10_d = FLD $x11, 32 :: (load (s64) from %ir.arrayidx)
-    ; RV32-NEXT: renamable $f11_d = FLD $x11, 40 :: (load (s64) from %ir.arrayidx1)
-    ; RV32-NEXT: renamable $f12_d = FLD killed $x11, 48 :: (load (s64) from %ir.arrayidx2)
-    ; RV32-NEXT: PseudoTAIL target-flags(riscv-call) @load_large_offset_double_1, implicit $x2, implicit $f10_d, implicit $f11_d, implicit $f12_d
-    ; RV64-LABEL: name: load_large_offset_double
-    ; RV64: liveins: $x10
-    ; RV64-NEXT: {{  $}}
-    ; RV64-NEXT: $x11 = ADDI $x10, 768
-    ; RV64-NEXT: renamable $f10_d = FLD $x11, 32 :: (load (s64) from %ir.arrayidx)
-    ; RV64-NEXT: renamable $f11_d = FLD $x11, 40 :: (load (s64) from %ir.arrayidx1)
-    ; RV64-NEXT: renamable $f12_d = FLD killed $x11, 48 :: (load (s64) from %ir.arrayidx2)
-    ; RV64-NEXT: PseudoTAIL target-flags(riscv-call) @load_large_offset_double_1, implicit $x2, implicit $f10_d, implicit $f11_d, implicit $f12_d
+    ; RV32C-LABEL: name: load_large_offset_double
+    ; RV32C: liveins: $x10
+    ; RV32C-NEXT: {{  $}}
+    ; RV32C-NEXT: $x11 = ADDI $x10, 768
+    ; RV32C-NEXT: renamable $f10_d = FLD $x11, 32 :: (load (s64) from %ir.arrayidx)
+    ; RV32C-NEXT: renamable $f11_d = FLD $x11, 40 :: (load (s64) from %ir.arrayidx1)
+    ; RV32C-NEXT: renamable $f12_d = FLD killed $x11, 48 :: (load (s64) from %ir.arrayidx2)
+    ; RV32C-NEXT: PseudoTAIL target-flags(riscv-call) @load_large_offset_double_1, implicit $x2, implicit $f10_d, implicit $f11_d, implicit $f12_d
+    ;
+    ; RV64C-LABEL: name: load_large_offset_double
+    ; RV64C: liveins: $x10
+    ; RV64C-NEXT: {{  $}}
+    ; RV64C-NEXT: $x11 = ADDI $x10, 768
+    ; RV64C-NEXT: renamable $f10_d = FLD $x11, 32 :: (load (s64) from %ir.arrayidx)
+    ; RV64C-NEXT: renamable $f11_d = FLD $x11, 40 :: (load (s64) from %ir.arrayidx1)
+    ; RV64C-NEXT: renamable $f12_d = FLD killed $x11, 48 :: (load (s64) from %ir.arrayidx2)
+    ; RV64C-NEXT: PseudoTAIL target-flags(riscv-call) @load_large_offset_double_1, implicit $x2, implicit $f10_d, implicit $f11_d, implicit $f12_d
+    ;
+    ; RV32ZCF-LABEL: name: load_large_offset_double
+    ; RV32ZCF: liveins: $x10
+    ; RV32ZCF-NEXT: {{  $}}
+    ; RV32ZCF-NEXT: renamable $f10_d = FLD renamable $x10, 800 :: (load (s64) from %ir.arrayidx)
+    ; RV32ZCF-NEXT: renamable $f11_d = FLD renamable $x10, 808 :: (load (s64) from %ir.arrayidx1)
+    ; RV32ZCF-NEXT: renamable $f12_d = FLD killed renamable $x10, 816 :: (load (s64) from %ir.arrayidx2)
+    ; RV32ZCF-NEXT: PseudoTAIL target-flags(riscv-call) @load_large_offset_double_1, implicit $x2, implicit $f10_d, implicit $f11_d, implicit $f12_d
+    ;
+    ; RV32ZCA-LABEL: name: load_large_offset_double
+    ; RV32ZCA: liveins: $x10
+    ; RV32ZCA-NEXT: {{  $}}
+    ; RV32ZCA-NEXT: renamable $f10_d = FLD renamable $x10, 800 :: (load (s64) from %ir.arrayidx)
+    ; RV32ZCA-NEXT: renamable $f11_d = FLD renamable $x10, 808 :: (load (s64) from %ir.arrayidx1)
+    ; RV32ZCA-NEXT: renamable $f12_d = FLD killed renamable $x10, 816 :: (load (s64) from %ir.arrayidx2)
+    ; RV32ZCA-NEXT: PseudoTAIL target-flags(riscv-call) @load_large_offset_double_1, implicit $x2, implicit $f10_d, implicit $f11_d, implicit $f12_d
+    ;
+    ; RV64ZCA-LABEL: name: load_large_offset_double
+    ; RV64ZCA: liveins: $x10
+    ; RV64ZCA-NEXT: {{  $}}
+    ; RV64ZCA-NEXT: renamable $f10_d = FLD renamable $x10, 800 :: (load (s64) from %ir.arrayidx)
+    ; RV64ZCA-NEXT: renamable $f11_d = FLD renamable $x10, 808 :: (load (s64) from %ir.arrayidx1)
+    ; RV64ZCA-NEXT: renamable $f12_d = FLD killed renamable $x10, 816 :: (load (s64) from %ir.arrayidx2)
+    ; RV64ZCA-NEXT: PseudoTAIL target-flags(riscv-call) @load_large_offset_double_1, implicit $x2, implicit $f10_d, implicit $f11_d, implicit $f12_d
     renamable $f10_d = FLD renamable $x10, 800 :: (load (s64) from %ir.arrayidx)
     renamable $f11_d = FLD renamable $x10, 808 :: (load (s64) from %ir.arrayidx1)
     renamable $f12_d = FLD killed renamable $x10, 816 :: (load (s64) from %ir.arrayidx2)
@@ -801,6 +1033,7 @@ body:             |
     ; RV32-NEXT: {{  $}}
     ; RV32-NEXT: SW $x0, killed renamable $x10, 0 :: (store (s32) into %ir.a)
     ; RV32-NEXT: PseudoRET
+    ;
     ; RV64-LABEL: name: store_common_value_no_opt
     ; RV64: liveins: $x10
     ; RV64-NEXT: {{  $}}
@@ -822,6 +1055,7 @@ body:             |
     ; RV32-NEXT: {{  $}}
     ; RV32-NEXT: FSW killed renamable $f16_f, killed renamable $x10, 0 :: (store (s32) into %ir.a)
     ; RV32-NEXT: PseudoRET
+    ;
     ; RV64-LABEL: name: store_common_value_float_no_opt
     ; RV64: liveins: $x10, $f16_f
     ; RV64-NEXT: {{  $}}
@@ -843,6 +1077,7 @@ body:             |
     ; RV32-NEXT: {{  $}}
     ; RV32-NEXT: FSD killed renamable $f16_d, killed renamable $x10, 0 :: (store (s64) into %ir.a)
     ; RV32-NEXT: PseudoRET
+    ;
     ; RV64-LABEL: name: store_common_value_double_no_opt
     ; RV64: liveins: $x10, $f16_d
     ; RV64-NEXT: {{  $}}
@@ -865,6 +1100,7 @@ body:             |
     ; RV32-NEXT: renamable $x10 = ADDI $x0, 1
     ; RV32-NEXT: SW killed renamable $x10, killed renamable $x16, 0 :: (volatile store (s32) into %ir.p)
     ; RV32-NEXT: PseudoRET
+    ;
     ; RV64-LABEL: name: store_common_ptr_no_opt
     ; RV64: liveins: $x16
     ; RV64-NEXT: {{  $}}
@@ -888,6 +1124,7 @@ body:             |
     ; RV32-NEXT: {{  $}}
     ; RV32-NEXT: FSW killed renamable $f10_f, killed renamable $x16, 0 :: (volatile store (s32) into %ir.p)
     ; RV32-NEXT: PseudoRET
+    ;
     ; RV64-LABEL: name: store_common_ptr_float_no_opt
     ; RV64: liveins: $x16, $f10_f
     ; RV64-NEXT: {{  $}}
@@ -909,6 +1146,7 @@ body:             |
     ; RV32-NEXT: {{  $}}
     ; RV32-NEXT: FSD killed renamable $f10_d, killed renamable $x16, 0 :: (volatile store (s64) into %ir.p)
     ; RV32-NEXT: PseudoRET
+    ;
     ; RV64-LABEL: name: store_common_ptr_double_no_opt
     ; RV64: liveins: $x16, $f10_d
     ; RV64-NEXT: {{  $}}
@@ -930,6 +1168,7 @@ body:             |
     ; RV32-NEXT: {{  $}}
     ; RV32-NEXT: dead renamable $x10 = LW killed renamable $x16, 0 :: (volatile load (s32) from %ir.p)
     ; RV32-NEXT: PseudoRET
+    ;
     ; RV64-LABEL: name: load_common_ptr_no_opt
     ; RV64: liveins: $x16
     ; RV64-NEXT: {{  $}}
@@ -951,6 +1190,7 @@ body:             |
     ; RV32-NEXT: {{  $}}
     ; RV32-NEXT: renamable $f10_f = FLW killed renamable $x16, 0 :: (load (s32) from %ir.g)
     ; RV32-NEXT: PseudoRET implicit $f10_f
+    ;
     ; RV64-LABEL: name: load_common_ptr_float_no_opt
     ; RV64: liveins: $x16
     ; RV64-NEXT: {{  $}}
@@ -972,6 +1212,7 @@ body:             |
     ; RV32-NEXT: {{  $}}
     ; RV32-NEXT: renamable $f10_d = FLD killed renamable $x16, 0 :: (load (s64) from %ir.g)
     ; RV32-NEXT: PseudoRET implicit $f10_d
+    ;
     ; RV64-LABEL: name: load_common_ptr_double_no_opt
     ; RV64: liveins: $x16
     ; RV64-NEXT: {{  $}}
@@ -996,6 +1237,7 @@ body:             |
     ; RV32-NEXT: renamable $x11 = ADDI $x0, 3
     ; RV32-NEXT: SW killed renamable $x11, killed renamable $x10, 404 :: (volatile store (s32) into %ir.1)
     ; RV32-NEXT: PseudoRET
+    ;
     ; RV64-LABEL: name: store_large_offset_no_opt
     ; RV64: liveins: $x10
     ; RV64-NEXT: {{  $}}
@@ -1024,6 +1266,7 @@ body:             |
     ; RV32-NEXT: FSW killed renamable $f10_f, renamable $x10, 400 :: (volatile store (s32) into %ir.0)
     ; RV32-NEXT: FSW killed renamable $f11_f, killed renamable $x10, 404 :: (volatile store (s32) into %ir.1)
     ; RV32-NEXT: PseudoRET
+    ;
     ; RV64-LABEL: name: store_large_offset_float_no_opt
     ; RV64: liveins: $x10, $f10_f, $f11_f
     ; RV64-NEXT: {{  $}}
@@ -1048,6 +1291,7 @@ body:             |
     ; RV32-NEXT: FSD killed renamable $f10_d, renamable $x10, 800 :: (volatile store (s64) into %ir.0)
     ; RV32-NEXT: FSD killed renamable $f11_d, killed renamable $x10, 808 :: (volatile store (s64) into %ir.1)
     ; RV32-NEXT: PseudoRET
+    ;
     ; RV64-LABEL: name: store_large_offset_double_no_opt
     ; RV64: liveins: $x10, $f10_d, $f11_d
     ; RV64-NEXT: {{  $}}
@@ -1072,6 +1316,7 @@ body:             |
     ; RV32-NEXT: dead renamable $x11 = LW renamable $x10, 400 :: (volatile load (s32) from %ir.0)
     ; RV32-NEXT: dead renamable $x10 = LW killed renamable $x10, 404 :: (volatile load (s32) from %ir.1)
     ; RV32-NEXT: PseudoRET
+    ;
     ; RV64-LABEL: name: load_large_offset_no_opt
     ; RV64: liveins: $x10
     ; RV64-NEXT: {{  $}}
@@ -1096,6 +1341,7 @@ body:             |
     ; RV32-NEXT: renamable $f10_f = FLW renamable $x10, 400 :: (load (s32) from %ir.arrayidx)
     ; RV32-NEXT: renamable $f11_f = FLW killed renamable $x10, 404 :: (load (s32) from %ir.arrayidx1)
     ; RV32-NEXT: PseudoRET implicit $f10_f, implicit $f11_f
+    ;
     ; RV64-LABEL: name: load_large_offset_float_no_opt
     ; RV64: liveins: $x10
     ; RV64-NEXT: {{  $}}
@@ -1120,6 +1366,7 @@ body:             |
     ; RV32-NEXT: renamable $f10_d = FLD renamable $x10, 800 :: (load (s64) from %ir.arrayidx)
     ; RV32-NEXT: renamable $f11_d = FLD killed renamable $x10, 808 :: (load (s64) from %ir.arrayidx1)
     ; RV32-NEXT: PseudoRET implicit $f10_d, implicit $f11_d
+    ;
     ; RV64-LABEL: name: load_large_offset_double_no_opt
     ; RV64: liveins: $x10
     ; RV64-NEXT: {{  $}}
diff --git a/llvm/test/CodeGen/RISCV/pr51206.ll b/llvm/test/CodeGen/RISCV/pr51206.ll
index f54031a..8aa145f 100644
--- a/llvm/test/CodeGen/RISCV/pr51206.ll
+++ b/llvm/test/CodeGen/RISCV/pr51206.ll
@@ -27,16 +27,12 @@ define signext i32 @wobble() nounwind {
 ; CHECK-NEXT:    lui a2, %hi(global.3)
 ; CHECK-NEXT:    li a3, 5
 ; CHECK-NEXT:    sw a1, %lo(global.3)(a2)
-; CHECK-NEXT:    bltu a0, a3, .LBB0_2
-; CHECK-NEXT:  # %bb.1: # %bb10
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
-; CHECK-NEXT:    call quux
-; CHECK-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:  .LBB0_2: # %bb12
+; CHECK-NEXT:    bgeu a0, a3, .LBB0_2
+; CHECK-NEXT:  # %bb.1: # %bb12
 ; CHECK-NEXT:    li a0, 0
 ; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB0_2: # %bb10
+; CHECK-NEXT:    tail quux
 bb:
   %tmp = load i8, ptr @global, align 1
   %tmp1 = zext i8 %tmp to i32
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
index 1724b48..a09ab3e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
@@ -1,12 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+zvfh,+v -target-abi=ilp32d \
-; RUN:     -riscv-enable-sink-fold -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32V
+; RUN:     -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32V
 ; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+zvfh,+v -target-abi=lp64d \
-; RUN:     -riscv-enable-sink-fold -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64V
+; RUN:     -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64V
 ; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+zvfh,+zve32f,+zvl128b -target-abi=ilp32d \
-; RUN:     -riscv-enable-sink-fold -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32ZVE32F
+; RUN:     -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32ZVE32F
 ; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+zvfh,+zve32f,+zvl128b -target-abi=lp64d \
-; RUN:     -riscv-enable-sink-fold -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64ZVE32F
+; RUN:     -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64ZVE32F
 
 declare <1 x i8> @llvm.masked.gather.v1i8.v1p0(<1 x ptr>, i32, <1 x i1>, <1 x i8>)
 
@@ -15086,31 +15086,27 @@ define <32 x i64> @mgather_strided_split(ptr %base) {
   ret <32 x i64> %x
 }
 
-; FIXME: This is a miscompile triggered by the mgather ->
-; riscv.masked.strided.load combine. In order for it to trigger we need either a
-; strided gather that RISCVGatherScatterLowering doesn't pick up, or a new
-; strided gather generated by the widening sew combine.
 define <4 x i32> @masked_gather_widen_sew_negative_stride(ptr %base) {
 ; RV32V-LABEL: masked_gather_widen_sew_negative_stride:
 ; RV32V:       # %bb.0:
-; RV32V-NEXT:    addi a0, a0, -128
-; RV32V-NEXT:    li a1, -128
+; RV32V-NEXT:    addi a0, a0, 136
+; RV32V-NEXT:    li a1, -136
 ; RV32V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV32V-NEXT:    vlse64.v v8, (a0), a1
 ; RV32V-NEXT:    ret
 ;
 ; RV64V-LABEL: masked_gather_widen_sew_negative_stride:
 ; RV64V:       # %bb.0:
-; RV64V-NEXT:    addi a0, a0, -128
-; RV64V-NEXT:    li a1, -128
+; RV64V-NEXT:    addi a0, a0, 136
+; RV64V-NEXT:    li a1, -136
 ; RV64V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV64V-NEXT:    vlse64.v v8, (a0), a1
 ; RV64V-NEXT:    ret
 ;
 ; RV32ZVE32F-LABEL: masked_gather_widen_sew_negative_stride:
 ; RV32ZVE32F:       # %bb.0:
-; RV32ZVE32F-NEXT:    lui a1, 16392
-; RV32ZVE32F-NEXT:    addi a1, a1, 1152
+; RV32ZVE32F-NEXT:    lui a1, 16393
+; RV32ZVE32F-NEXT:    addi a1, a1, -888
 ; RV32ZVE32F-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.s.x v9, a1
 ; RV32ZVE32F-NEXT:    vluxei8.v v8, (a0), v9
@@ -15118,8 +15114,8 @@ define <4 x i32> @masked_gather_widen_sew_negative_stride(ptr %base) {
 ;
 ; RV64ZVE32F-LABEL: masked_gather_widen_sew_negative_stride:
 ; RV64ZVE32F:       # %bb.0:
-; RV64ZVE32F-NEXT:    addi a1, a0, 128
-; RV64ZVE32F-NEXT:    lw a2, 132(a0)
+; RV64ZVE32F-NEXT:    addi a1, a0, 136
+; RV64ZVE32F-NEXT:    lw a2, 140(a0)
 ; RV64ZVE32F-NEXT:    lw a3, 0(a0)
 ; RV64ZVE32F-NEXT:    lw a0, 4(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
@@ -15128,7 +15124,7 @@ define <4 x i32> @masked_gather_widen_sew_negative_stride(ptr %base) {
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a3
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a0
 ; RV64ZVE32F-NEXT:    ret
-  %ptrs = getelementptr i32, ptr %base, <4 x i64> <i64 32, i64 33, i64 0, i64 1>
+  %ptrs = getelementptr i32, ptr %base, <4 x i64> <i64 34, i64 35, i64 0, i64 1>
   %x = call <4 x i32> @llvm.masked.gather.v4i32.v32p0(<4 x ptr> %ptrs, i32 8, <4 x i1> shufflevector(<4 x i1> insertelement(<4 x i1> poison, i1 true, i32 0), <4 x i1> poison, <4 x i32> zeroinitializer), <4 x i32> poison)
   ret <4 x i32> %x
 }
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll
new file mode 100644
index 0000000..6c5dd04
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll
@@ -0,0 +1,1701 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s --check-prefixes=CHECK,RV64
+
+declare <8 x i7> @llvm.vp.sadd.sat.v8i7(<8 x i7>, <8 x i7>, <8 x i1>, i32)
+
+define <8 x i7> @vsadd_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_v8i7:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT:    vadd.vv v9, v9, v9
+; CHECK-NEXT:    vsra.vi v9, v9, 1
+; CHECK-NEXT:    vadd.vv v8, v8, v8
+; CHECK-NEXT:    vsra.vi v8, v8, 1
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    li a0, 63
+; CHECK-NEXT:    vmin.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    li a0, 192
+; CHECK-NEXT:    vmax.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %v = call <8 x i7> @llvm.vp.sadd.sat.v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 %evl)
+  ret <8 x i7> %v
+}
+
+declare <2 x i8> @llvm.vp.sadd.sat.v2i8(<2 x i8>, <2 x i8>, <2 x i1>, i32)
+
+define <2 x i8> @vsadd_vv_v2i8(<2 x i8> %va, <2 x i8> %b, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_v2i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <2 x i8> @llvm.vp.sadd.sat.v2i8(<2 x i8> %va, <2 x i8> %b, <2 x i1> %m, i32 %evl)
+  ret <2 x i8> %v
+}
+
+define <2 x i8> @vsadd_vv_v2i8_unmasked(<2 x i8> %va, <2 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_v2i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i8> @llvm.vp.sadd.sat.v2i8(<2 x i8> %va, <2 x i8> %b, <2 x i1> %m, i32 %evl)
+  ret <2 x i8> %v
+}
+
+define <2 x i8> @vsadd_vx_v2i8(<2 x i8> %va, i8 %b, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_v2i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT:    vsadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <2 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <2 x i8> %elt.head, <2 x i8> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i8> @llvm.vp.sadd.sat.v2i8(<2 x i8> %va, <2 x i8> %vb, <2 x i1> %m, i32 %evl)
+  ret <2 x i8> %v
+}
+
+define <2 x i8> @vsadd_vx_v2i8_unmasked(<2 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_v2i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT:    vsadd.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <2 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <2 x i8> %elt.head, <2 x i8> poison, <2 x i32> zeroinitializer
+  %head = insertelement <2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i8> @llvm.vp.sadd.sat.v2i8(<2 x i8> %va, <2 x i8> %vb, <2 x i1> %m, i32 %evl)
+  ret <2 x i8> %v
+}
+
+define <2 x i8> @vsadd_vi_v2i8(<2 x i8> %va, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_v2i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <2 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <2 x i8> %elt.head, <2 x i8> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i8> @llvm.vp.sadd.sat.v2i8(<2 x i8> %va, <2 x i8> %vb, <2 x i1> %m, i32 %evl)
+  ret <2 x i8> %v
+}
+
+define <2 x i8> @vsadd_vi_v2i8_unmasked(<2 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_v2i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <2 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <2 x i8> %elt.head, <2 x i8> poison, <2 x i32> zeroinitializer
+  %head = insertelement <2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i8> @llvm.vp.sadd.sat.v2i8(<2 x i8> %va, <2 x i8> %vb, <2 x i1> %m, i32 %evl)
+  ret <2 x i8> %v
+}
+
+declare <4 x i8> @llvm.vp.sadd.sat.v4i8(<4 x i8>, <4 x i8>, <4 x i1>, i32)
+
+define <4 x i8> @vsadd_vv_v4i8(<4 x i8> %va, <4 x i8> %b, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_v4i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <4 x i8> @llvm.vp.sadd.sat.v4i8(<4 x i8> %va, <4 x i8> %b, <4 x i1> %m, i32 %evl)
+  ret <4 x i8> %v
+}
+
+define <4 x i8> @vsadd_vv_v4i8_unmasked(<4 x i8> %va, <4 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_v4i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i8> @llvm.vp.sadd.sat.v4i8(<4 x i8> %va, <4 x i8> %b, <4 x i1> %m, i32 %evl)
+  ret <4 x i8> %v
+}
+
+define <4 x i8> @vsadd_vx_v4i8(<4 x i8> %va, i8 %b, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_v4i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT:    vsadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i8> @llvm.vp.sadd.sat.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl)
+  ret <4 x i8> %v
+}
+
+define <4 x i8> @vsadd_vx_v4i8_commute(<4 x i8> %va, i8 %b, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_v4i8_commute:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT:    vsadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i8> @llvm.vp.sadd.sat.v4i8(<4 x i8> %vb, <4 x i8> %va, <4 x i1> %m, i32 %evl)
+  ret <4 x i8> %v
+}
+
+define <4 x i8> @vsadd_vx_v4i8_unmasked(<4 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_v4i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT:    vsadd.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer
+  %head = insertelement <4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i8> @llvm.vp.sadd.sat.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl)
+  ret <4 x i8> %v
+}
+
+define <4 x i8> @vsadd_vi_v4i8(<4 x i8> %va, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_v4i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <4 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i8> @llvm.vp.sadd.sat.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl)
+  ret <4 x i8> %v
+}
+
+define <4 x i8> @vsadd_vi_v4i8_unmasked(<4 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_v4i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <4 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer
+  %head = insertelement <4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i8> @llvm.vp.sadd.sat.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl)
+  ret <4 x i8> %v
+}
+
+declare <5 x i8> @llvm.vp.sadd.sat.v5i8(<5 x i8>, <5 x i8>, <5 x i1>, i32)
+
+define <5 x i8> @vsadd_vv_v5i8(<5 x i8> %va, <5 x i8> %b, <5 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_v5i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <5 x i8> @llvm.vp.sadd.sat.v5i8(<5 x i8> %va, <5 x i8> %b, <5 x i1> %m, i32 %evl)
+  ret <5 x i8> %v
+}
+
+define <5 x i8> @vsadd_vv_v5i8_unmasked(<5 x i8> %va, <5 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_v5i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <5 x i1> poison, i1 true, i32 0
+  %m = shufflevector <5 x i1> %head, <5 x i1> poison, <5 x i32> zeroinitializer
+  %v = call <5 x i8> @llvm.vp.sadd.sat.v5i8(<5 x i8> %va, <5 x i8> %b, <5 x i1> %m, i32 %evl)
+  ret <5 x i8> %v
+}
+
+define <5 x i8> @vsadd_vx_v5i8(<5 x i8> %va, i8 %b, <5 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_v5i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vsadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <5 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <5 x i8> %elt.head, <5 x i8> poison, <5 x i32> zeroinitializer
+  %v = call <5 x i8> @llvm.vp.sadd.sat.v5i8(<5 x i8> %va, <5 x i8> %vb, <5 x i1> %m, i32 %evl)
+  ret <5 x i8> %v
+}
+
+define <5 x i8> @vsadd_vx_v5i8_unmasked(<5 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_v5i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vsadd.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <5 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <5 x i8> %elt.head, <5 x i8> poison, <5 x i32> zeroinitializer
+  %head = insertelement <5 x i1> poison, i1 true, i32 0
+  %m = shufflevector <5 x i1> %head, <5 x i1> poison, <5 x i32> zeroinitializer
+  %v = call <5 x i8> @llvm.vp.sadd.sat.v5i8(<5 x i8> %va, <5 x i8> %vb, <5 x i1> %m, i32 %evl)
+  ret <5 x i8> %v
+}
+
+define <5 x i8> @vsadd_vi_v5i8(<5 x i8> %va, <5 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_v5i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <5 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <5 x i8> %elt.head, <5 x i8> poison, <5 x i32> zeroinitializer
+  %v = call <5 x i8> @llvm.vp.sadd.sat.v5i8(<5 x i8> %va, <5 x i8> %vb, <5 x i1> %m, i32 %evl)
+  ret <5 x i8> %v
+}
+
+define <5 x i8> @vsadd_vi_v5i8_unmasked(<5 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_v5i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <5 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <5 x i8> %elt.head, <5 x i8> poison, <5 x i32> zeroinitializer
+  %head = insertelement <5 x i1> poison, i1 true, i32 0
+  %m = shufflevector <5 x i1> %head, <5 x i1> poison, <5 x i32> zeroinitializer
+  %v = call <5 x i8> @llvm.vp.sadd.sat.v5i8(<5 x i8> %va, <5 x i8> %vb, <5 x i1> %m, i32 %evl)
+  ret <5 x i8> %v
+}
+
+declare <8 x i8> @llvm.vp.sadd.sat.v8i8(<8 x i8>, <8 x i8>, <8 x i1>, i32)
+
+define <8 x i8> @vsadd_vv_v8i8(<8 x i8> %va, <8 x i8> %b, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_v8i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <8 x i8> @llvm.vp.sadd.sat.v8i8(<8 x i8> %va, <8 x i8> %b, <8 x i1> %m, i32 %evl)
+  ret <8 x i8> %v
+}
+
+define <8 x i8> @vsadd_vv_v8i8_unmasked(<8 x i8> %va, <8 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_v8i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i8> @llvm.vp.sadd.sat.v8i8(<8 x i8> %va, <8 x i8> %b, <8 x i1> %m, i32 %evl)
+  ret <8 x i8> %v
+}
+
+define <8 x i8> @vsadd_vx_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_v8i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vsadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i8> @llvm.vp.sadd.sat.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl)
+  ret <8 x i8> %v
+}
+
+define <8 x i8> @vsadd_vx_v8i8_unmasked(<8 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_v8i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vsadd.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer
+  %head = insertelement <8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i8> @llvm.vp.sadd.sat.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl)
+  ret <8 x i8> %v
+}
+
+define <8 x i8> @vsadd_vi_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_v8i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <8 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i8> @llvm.vp.sadd.sat.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl)
+  ret <8 x i8> %v
+}
+
+define <8 x i8> @vsadd_vi_v8i8_unmasked(<8 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_v8i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <8 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer
+  %head = insertelement <8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i8> @llvm.vp.sadd.sat.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl)
+  ret <8 x i8> %v
+}
+
+declare <16 x i8> @llvm.vp.sadd.sat.v16i8(<16 x i8>, <16 x i8>, <16 x i1>, i32)
+
+define <16 x i8> @vsadd_vv_v16i8(<16 x i8> %va, <16 x i8> %b, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_v16i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <16 x i8> @llvm.vp.sadd.sat.v16i8(<16 x i8> %va, <16 x i8> %b, <16 x i1> %m, i32 %evl)
+  ret <16 x i8> %v
+}
+
+define <16 x i8> @vsadd_vv_v16i8_unmasked(<16 x i8> %va, <16 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_v16i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i8> @llvm.vp.sadd.sat.v16i8(<16 x i8> %va, <16 x i8> %b, <16 x i1> %m, i32 %evl)
+  ret <16 x i8> %v
+}
+
+define <16 x i8> @vsadd_vx_v16i8(<16 x i8> %va, i8 %b, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_v16i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vsadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <16 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <16 x i8> %elt.head, <16 x i8> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i8> @llvm.vp.sadd.sat.v16i8(<16 x i8> %va, <16 x i8> %vb, <16 x i1> %m, i32 %evl)
+  ret <16 x i8> %v
+}
+
+define <16 x i8> @vsadd_vx_v16i8_unmasked(<16 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_v16i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vsadd.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <16 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <16 x i8> %elt.head, <16 x i8> poison, <16 x i32> zeroinitializer
+  %head = insertelement <16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i8> @llvm.vp.sadd.sat.v16i8(<16 x i8> %va, <16 x i8> %vb, <16 x i1> %m, i32 %evl)
+  ret <16 x i8> %v
+}
+
+define <16 x i8> @vsadd_vi_v16i8(<16 x i8> %va, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_v16i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <16 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <16 x i8> %elt.head, <16 x i8> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i8> @llvm.vp.sadd.sat.v16i8(<16 x i8> %va, <16 x i8> %vb, <16 x i1> %m, i32 %evl)
+  ret <16 x i8> %v
+}
+
+define <16 x i8> @vsadd_vi_v16i8_unmasked(<16 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_v16i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <16 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <16 x i8> %elt.head, <16 x i8> poison, <16 x i32> zeroinitializer
+  %head = insertelement <16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i8> @llvm.vp.sadd.sat.v16i8(<16 x i8> %va, <16 x i8> %vb, <16 x i1> %m, i32 %evl)
+  ret <16 x i8> %v
+}
+
+declare <256 x i8> @llvm.vp.sadd.sat.v258i8(<256 x i8>, <256 x i8>, <256 x i1>, i32)
+
+define <256 x i8> @vsadd_vi_v258i8(<256 x i8> %va, <256 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_v258i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v24, v0
+; CHECK-NEXT:    li a2, 128
+; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
+; CHECK-NEXT:    vlm.v v0, (a0)
+; CHECK-NEXT:    addi a0, a1, -128
+; CHECK-NEXT:    sltu a3, a1, a0
+; CHECK-NEXT:    addi a3, a3, -1
+; CHECK-NEXT:    and a0, a3, a0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT:    vsadd.vi v16, v16, -1, v0.t
+; CHECK-NEXT:    bltu a1, a2, .LBB32_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    li a1, 128
+; CHECK-NEXT:  .LBB32_2:
+; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <256 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <256 x i8> %elt.head, <256 x i8> poison, <256 x i32> zeroinitializer
+  %v = call <256 x i8> @llvm.vp.sadd.sat.v258i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> %m, i32 %evl)
+  ret <256 x i8> %v
+}
+
+define <256 x i8> @vsadd_vi_v258i8_unmasked(<256 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_v258i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a2, 128
+; CHECK-NEXT:    mv a1, a0
+; CHECK-NEXT:    bltu a0, a2, .LBB33_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    li a1, 128
+; CHECK-NEXT:  .LBB33_2:
+; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1
+; CHECK-NEXT:    addi a1, a0, -128
+; CHECK-NEXT:    sltu a0, a0, a1
+; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT:    vsadd.vi v16, v16, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <256 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <256 x i8> %elt.head, <256 x i8> poison, <256 x i32> zeroinitializer
+  %head = insertelement <256 x i1> poison, i1 true, i32 0
+  %m = shufflevector <256 x i1> %head, <256 x i1> poison, <256 x i32> zeroinitializer
+  %v = call <256 x i8> @llvm.vp.sadd.sat.v258i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> %m, i32 %evl)
+  ret <256 x i8> %v
+}
+
+; Test splitting when the %evl is a known constant.
+
+define <256 x i8> @vsadd_vi_v258i8_evl129(<256 x i8> %va, <256 x i1> %m) {
+; CHECK-LABEL: vsadd_vi_v258i8_evl129:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, 128
+; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT:    vlm.v v24, (a0)
+; CHECK-NEXT:    vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    vsetivli zero, 1, e8, m8, ta, ma
+; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vsadd.vi v16, v16, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <256 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <256 x i8> %elt.head, <256 x i8> poison, <256 x i32> zeroinitializer
+  %v = call <256 x i8> @llvm.vp.sadd.sat.v258i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> %m, i32 129)
+  ret <256 x i8> %v
+}
+
+; FIXME: The upper half is doing nothing.
+
+define <256 x i8> @vsadd_vi_v258i8_evl128(<256 x i8> %va, <256 x i1> %m) {
+; CHECK-LABEL: vsadd_vi_v258i8_evl128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, 128
+; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT:    vlm.v v24, (a0)
+; CHECK-NEXT:    vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    vsetivli zero, 0, e8, m8, ta, ma
+; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vsadd.vi v16, v16, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <256 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <256 x i8> %elt.head, <256 x i8> poison, <256 x i32> zeroinitializer
+  %v = call <256 x i8> @llvm.vp.sadd.sat.v258i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> %m, i32 128)
+  ret <256 x i8> %v
+}
+
+declare <2 x i16> @llvm.vp.sadd.sat.v2i16(<2 x i16>, <2 x i16>, <2 x i1>, i32)
+
+define <2 x i16> @vsadd_vv_v2i16(<2 x i16> %va, <2 x i16> %b, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_v2i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <2 x i16> @llvm.vp.sadd.sat.v2i16(<2 x i16> %va, <2 x i16> %b, <2 x i1> %m, i32 %evl)
+  ret <2 x i16> %v
+}
+
+define <2 x i16> @vsadd_vv_v2i16_unmasked(<2 x i16> %va, <2 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_v2i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i16> @llvm.vp.sadd.sat.v2i16(<2 x i16> %va, <2 x i16> %b, <2 x i1> %m, i32 %evl)
+  ret <2 x i16> %v
+}
+
+define <2 x i16> @vsadd_vx_v2i16(<2 x i16> %va, i16 %b, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_v2i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT:    vsadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i16> @llvm.vp.sadd.sat.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl)
+  ret <2 x i16> %v
+}
+
+define <2 x i16> @vsadd_vx_v2i16_unmasked(<2 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_v2i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT:    vsadd.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer
+  %head = insertelement <2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i16> @llvm.vp.sadd.sat.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl)
+  ret <2 x i16> %v
+}
+
+define <2 x i16> @vsadd_vi_v2i16(<2 x i16> %va, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_v2i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <2 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i16> @llvm.vp.sadd.sat.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl)
+  ret <2 x i16> %v
+}
+
+define <2 x i16> @vsadd_vi_v2i16_unmasked(<2 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_v2i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <2 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer
+  %head = insertelement <2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i16> @llvm.vp.sadd.sat.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl)
+  ret <2 x i16> %v
+}
+
+declare <4 x i16> @llvm.vp.sadd.sat.v4i16(<4 x i16>, <4 x i16>, <4 x i1>, i32)
+
+define <4 x i16> @vsadd_vv_v4i16(<4 x i16> %va, <4 x i16> %b, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_v4i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <4 x i16> @llvm.vp.sadd.sat.v4i16(<4 x i16> %va, <4 x i16> %b, <4 x i1> %m, i32 %evl)
+  ret <4 x i16> %v
+}
+
+define <4 x i16> @vsadd_vv_v4i16_unmasked(<4 x i16> %va, <4 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_v4i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i16> @llvm.vp.sadd.sat.v4i16(<4 x i16> %va, <4 x i16> %b, <4 x i1> %m, i32 %evl)
+  ret <4 x i16> %v
+}
+
+define <4 x i16> @vsadd_vx_v4i16(<4 x i16> %va, i16 %b, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_v4i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT:    vsadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i16> @llvm.vp.sadd.sat.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl)
+  ret <4 x i16> %v
+}
+
+define <4 x i16> @vsadd_vx_v4i16_unmasked(<4 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_v4i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT:    vsadd.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer
+  %head = insertelement <4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i16> @llvm.vp.sadd.sat.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl)
+  ret <4 x i16> %v
+}
+
+define <4 x i16> @vsadd_vi_v4i16(<4 x i16> %va, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_v4i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <4 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i16> @llvm.vp.sadd.sat.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl)
+  ret <4 x i16> %v
+}
+
+define <4 x i16> @vsadd_vi_v4i16_unmasked(<4 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_v4i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <4 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer
+  %head = insertelement <4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i16> @llvm.vp.sadd.sat.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl)
+  ret <4 x i16> %v
+}
+
+declare <8 x i16> @llvm.vp.sadd.sat.v8i16(<8 x i16>, <8 x i16>, <8 x i1>, i32)
+
+define <8 x i16> @vsadd_vv_v8i16(<8 x i16> %va, <8 x i16> %b, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_v8i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <8 x i16> @llvm.vp.sadd.sat.v8i16(<8 x i16> %va, <8 x i16> %b, <8 x i1> %m, i32 %evl)
+  ret <8 x i16> %v
+}
+
+define <8 x i16> @vsadd_vv_v8i16_unmasked(<8 x i16> %va, <8 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_v8i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i16> @llvm.vp.sadd.sat.v8i16(<8 x i16> %va, <8 x i16> %b, <8 x i1> %m, i32 %evl)
+  ret <8 x i16> %v
+}
+
+define <8 x i16> @vsadd_vx_v8i16(<8 x i16> %va, i16 %b, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_v8i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vsadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <8 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <8 x i16> %elt.head, <8 x i16> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i16> @llvm.vp.sadd.sat.v8i16(<8 x i16> %va, <8 x i16> %vb, <8 x i1> %m, i32 %evl)
+  ret <8 x i16> %v
+}
+
+define <8 x i16> @vsadd_vx_v8i16_unmasked(<8 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_v8i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vsadd.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <8 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <8 x i16> %elt.head, <8 x i16> poison, <8 x i32> zeroinitializer
+  %head = insertelement <8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i16> @llvm.vp.sadd.sat.v8i16(<8 x i16> %va, <8 x i16> %vb, <8 x i1> %m, i32 %evl)
+  ret <8 x i16> %v
+}
+
+define <8 x i16> @vsadd_vi_v8i16(<8 x i16> %va, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_v8i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <8 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <8 x i16> %elt.head, <8 x i16> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i16> @llvm.vp.sadd.sat.v8i16(<8 x i16> %va, <8 x i16> %vb, <8 x i1> %m, i32 %evl)
+  ret <8 x i16> %v
+}
+
+define <8 x i16> @vsadd_vi_v8i16_unmasked(<8 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_v8i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <8 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <8 x i16> %elt.head, <8 x i16> poison, <8 x i32> zeroinitializer
+  %head = insertelement <8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i16> @llvm.vp.sadd.sat.v8i16(<8 x i16> %va, <8 x i16> %vb, <8 x i1> %m, i32 %evl)
+  ret <8 x i16> %v
+}
+
+declare <16 x i16> @llvm.vp.sadd.sat.v16i16(<16 x i16>, <16 x i16>, <16 x i1>, i32)
+
+define <16 x i16> @vsadd_vv_v16i16(<16 x i16> %va, <16 x i16> %b, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_v16i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v10, v0.t
+; CHECK-NEXT:    ret
+  %v = call <16 x i16> @llvm.vp.sadd.sat.v16i16(<16 x i16> %va, <16 x i16> %b, <16 x i1> %m, i32 %evl)
+  ret <16 x i16> %v
+}
+
+define <16 x i16> @vsadd_vv_v16i16_unmasked(<16 x i16> %va, <16 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_v16i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v10
+; CHECK-NEXT:    ret
+  %head = insertelement <16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i16> @llvm.vp.sadd.sat.v16i16(<16 x i16> %va, <16 x i16> %b, <16 x i1> %m, i32 %evl)
+  ret <16 x i16> %v
+}
+
+define <16 x i16> @vsadd_vx_v16i16(<16 x i16> %va, i16 %b, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_v16i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT:    vsadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <16 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <16 x i16> %elt.head, <16 x i16> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i16> @llvm.vp.sadd.sat.v16i16(<16 x i16> %va, <16 x i16> %vb, <16 x i1> %m, i32 %evl)
+  ret <16 x i16> %v
+}
+
+define <16 x i16> @vsadd_vx_v16i16_unmasked(<16 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_v16i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT:    vsadd.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <16 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <16 x i16> %elt.head, <16 x i16> poison, <16 x i32> zeroinitializer
+  %head = insertelement <16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i16> @llvm.vp.sadd.sat.v16i16(<16 x i16> %va, <16 x i16> %vb, <16 x i1> %m, i32 %evl)
+  ret <16 x i16> %v
+}
+
+define <16 x i16> @vsadd_vi_v16i16(<16 x i16> %va, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_v16i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <16 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <16 x i16> %elt.head, <16 x i16> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i16> @llvm.vp.sadd.sat.v16i16(<16 x i16> %va, <16 x i16> %vb, <16 x i1> %m, i32 %evl)
+  ret <16 x i16> %v
+}
+
+define <16 x i16> @vsadd_vi_v16i16_unmasked(<16 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_v16i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <16 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <16 x i16> %elt.head, <16 x i16> poison, <16 x i32> zeroinitializer
+  %head = insertelement <16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i16> @llvm.vp.sadd.sat.v16i16(<16 x i16> %va, <16 x i16> %vb, <16 x i1> %m, i32 %evl)
+  ret <16 x i16> %v
+}
+
+declare <2 x i32> @llvm.vp.sadd.sat.v2i32(<2 x i32>, <2 x i32>, <2 x i1>, i32)
+
+define <2 x i32> @vsadd_vv_v2i32(<2 x i32> %va, <2 x i32> %b, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_v2i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <2 x i32> @llvm.vp.sadd.sat.v2i32(<2 x i32> %va, <2 x i32> %b, <2 x i1> %m, i32 %evl)
+  ret <2 x i32> %v
+}
+
+define <2 x i32> @vsadd_vv_v2i32_unmasked(<2 x i32> %va, <2 x i32> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_v2i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i32> @llvm.vp.sadd.sat.v2i32(<2 x i32> %va, <2 x i32> %b, <2 x i1> %m, i32 %evl)
+  ret <2 x i32> %v
+}
+
+define <2 x i32> @vsadd_vx_v2i32(<2 x i32> %va, i32 %b, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_v2i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT:    vsadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i32> @llvm.vp.sadd.sat.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl)
+  ret <2 x i32> %v
+}
+
+define <2 x i32> @vsadd_vx_v2i32_unmasked(<2 x i32> %va, i32 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_v2i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT:    vsadd.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer
+  %head = insertelement <2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i32> @llvm.vp.sadd.sat.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl)
+  ret <2 x i32> %v
+}
+
+define <2 x i32> @vsadd_vi_v2i32(<2 x i32> %va, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_v2i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <2 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i32> @llvm.vp.sadd.sat.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl)
+  ret <2 x i32> %v
+}
+
+define <2 x i32> @vsadd_vi_v2i32_unmasked(<2 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_v2i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <2 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer
+  %head = insertelement <2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i32> @llvm.vp.sadd.sat.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl)
+  ret <2 x i32> %v
+}
+
+declare <4 x i32> @llvm.vp.sadd.sat.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
+
+define <4 x i32> @vsadd_vv_v4i32(<4 x i32> %va, <4 x i32> %b, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_v4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <4 x i32> @llvm.vp.sadd.sat.v4i32(<4 x i32> %va, <4 x i32> %b, <4 x i1> %m, i32 %evl)
+  ret <4 x i32> %v
+}
+
+define <4 x i32> @vsadd_vv_v4i32_unmasked(<4 x i32> %va, <4 x i32> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_v4i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i32> @llvm.vp.sadd.sat.v4i32(<4 x i32> %va, <4 x i32> %b, <4 x i1> %m, i32 %evl)
+  ret <4 x i32> %v
+}
+
+define <4 x i32> @vsadd_vx_v4i32(<4 x i32> %va, i32 %b, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_v4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT:    vsadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <4 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <4 x i32> %elt.head, <4 x i32> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i32> @llvm.vp.sadd.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 %evl)
+  ret <4 x i32> %v
+}
+
+define <4 x i32> @vsadd_vx_v4i32_unmasked(<4 x i32> %va, i32 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_v4i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT:    vsadd.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <4 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <4 x i32> %elt.head, <4 x i32> poison, <4 x i32> zeroinitializer
+  %head = insertelement <4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i32> @llvm.vp.sadd.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 %evl)
+  ret <4 x i32> %v
+}
+
+define <4 x i32> @vsadd_vi_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_v4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <4 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <4 x i32> %elt.head, <4 x i32> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i32> @llvm.vp.sadd.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 %evl)
+  ret <4 x i32> %v
+}
+
+define <4 x i32> @vsadd_vi_v4i32_unmasked(<4 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_v4i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <4 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <4 x i32> %elt.head, <4 x i32> poison, <4 x i32> zeroinitializer
+  %head = insertelement <4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i32> @llvm.vp.sadd.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 %evl)
+  ret <4 x i32> %v
+}
+
+declare <8 x i32> @llvm.vp.sadd.sat.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32)
+
+define <8 x i32> @vsadd_vv_v8i32(<8 x i32> %va, <8 x i32> %b, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_v8i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v10, v0.t
+; CHECK-NEXT:    ret
+  %v = call <8 x i32> @llvm.vp.sadd.sat.v8i32(<8 x i32> %va, <8 x i32> %b, <8 x i1> %m, i32 %evl)
+  ret <8 x i32> %v
+}
+
+define <8 x i32> @vsadd_vv_v8i32_unmasked(<8 x i32> %va, <8 x i32> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_v8i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v10
+; CHECK-NEXT:    ret
+  %head = insertelement <8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i32> @llvm.vp.sadd.sat.v8i32(<8 x i32> %va, <8 x i32> %b, <8 x i1> %m, i32 %evl)
+  ret <8 x i32> %v
+}
+
+define <8 x i32> @vsadd_vx_v8i32(<8 x i32> %va, i32 %b, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_v8i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT:    vsadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <8 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <8 x i32> %elt.head, <8 x i32> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i32> @llvm.vp.sadd.sat.v8i32(<8 x i32> %va, <8 x i32> %vb, <8 x i1> %m, i32 %evl)
+  ret <8 x i32> %v
+}
+
+define <8 x i32> @vsadd_vx_v8i32_unmasked(<8 x i32> %va, i32 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_v8i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT:    vsadd.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <8 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <8 x i32> %elt.head, <8 x i32> poison, <8 x i32> zeroinitializer
+  %head = insertelement <8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i32> @llvm.vp.sadd.sat.v8i32(<8 x i32> %va, <8 x i32> %vb, <8 x i1> %m, i32 %evl)
+  ret <8 x i32> %v
+}
+
+define <8 x i32> @vsadd_vi_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_v8i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <8 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <8 x i32> %elt.head, <8 x i32> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i32> @llvm.vp.sadd.sat.v8i32(<8 x i32> %va, <8 x i32> %vb, <8 x i1> %m, i32 %evl)
+  ret <8 x i32> %v
+}
+
+define <8 x i32> @vsadd_vi_v8i32_unmasked(<8 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_v8i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <8 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <8 x i32> %elt.head, <8 x i32> poison, <8 x i32> zeroinitializer
+  %head = insertelement <8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i32> @llvm.vp.sadd.sat.v8i32(<8 x i32> %va, <8 x i32> %vb, <8 x i1> %m, i32 %evl)
+  ret <8 x i32> %v
+}
+
+declare <16 x i32> @llvm.vp.sadd.sat.v16i32(<16 x i32>, <16 x i32>, <16 x i1>, i32)
+
+define <16 x i32> @vsadd_vv_v16i32(<16 x i32> %va, <16 x i32> %b, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_v16i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v12, v0.t
+; CHECK-NEXT:    ret
+  %v = call <16 x i32> @llvm.vp.sadd.sat.v16i32(<16 x i32> %va, <16 x i32> %b, <16 x i1> %m, i32 %evl)
+  ret <16 x i32> %v
+}
+
+define <16 x i32> @vsadd_vv_v16i32_unmasked(<16 x i32> %va, <16 x i32> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_v16i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v12
+; CHECK-NEXT:    ret
+  %head = insertelement <16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i32> @llvm.vp.sadd.sat.v16i32(<16 x i32> %va, <16 x i32> %b, <16 x i1> %m, i32 %evl)
+  ret <16 x i32> %v
+}
+
+define <16 x i32> @vsadd_vx_v16i32(<16 x i32> %va, i32 %b, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_v16i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT:    vsadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <16 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <16 x i32> %elt.head, <16 x i32> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i32> @llvm.vp.sadd.sat.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i1> %m, i32 %evl)
+  ret <16 x i32> %v
+}
+
+define <16 x i32> @vsadd_vx_v16i32_unmasked(<16 x i32> %va, i32 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_v16i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT:    vsadd.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <16 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <16 x i32> %elt.head, <16 x i32> poison, <16 x i32> zeroinitializer
+  %head = insertelement <16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i32> @llvm.vp.sadd.sat.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i1> %m, i32 %evl)
+  ret <16 x i32> %v
+}
+
+define <16 x i32> @vsadd_vi_v16i32(<16 x i32> %va, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_v16i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <16 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <16 x i32> %elt.head, <16 x i32> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i32> @llvm.vp.sadd.sat.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i1> %m, i32 %evl)
+  ret <16 x i32> %v
+}
+
+define <16 x i32> @vsadd_vi_v16i32_unmasked(<16 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_v16i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <16 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <16 x i32> %elt.head, <16 x i32> poison, <16 x i32> zeroinitializer
+  %head = insertelement <16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i32> @llvm.vp.sadd.sat.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i1> %m, i32 %evl)
+  ret <16 x i32> %v
+}
+
+declare <2 x i64> @llvm.vp.sadd.sat.v2i64(<2 x i64>, <2 x i64>, <2 x i1>, i32)
+
+define <2 x i64> @vsadd_vv_v2i64(<2 x i64> %va, <2 x i64> %b, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_v2i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <2 x i64> @llvm.vp.sadd.sat.v2i64(<2 x i64> %va, <2 x i64> %b, <2 x i1> %m, i32 %evl)
+  ret <2 x i64> %v
+}
+
+define <2 x i64> @vsadd_vv_v2i64_unmasked(<2 x i64> %va, <2 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_v2i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i64> @llvm.vp.sadd.sat.v2i64(<2 x i64> %va, <2 x i64> %b, <2 x i1> %m, i32 %evl)
+  ret <2 x i64> %v
+}
+
+define <2 x i64> @vsadd_vx_v2i64(<2 x i64> %va, i64 %b, <2 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vsadd_vx_v2i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
+; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vsadd.vv v8, v8, v9, v0.t
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vsadd_vx_v2i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; RV64-NEXT:    vsadd.vx v8, v8, a0, v0.t
+; RV64-NEXT:    ret
+  %elt.head = insertelement <2 x i64> poison, i64 %b, i32 0
+  %vb = shufflevector <2 x i64> %elt.head, <2 x i64> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i64> @llvm.vp.sadd.sat.v2i64(<2 x i64> %va, <2 x i64> %vb, <2 x i1> %m, i32 %evl)
+  ret <2 x i64> %v
+}
+
+define <2 x i64> @vsadd_vx_v2i64_unmasked(<2 x i64> %va, i64 %b, i32 zeroext %evl) {
+; RV32-LABEL: vsadd_vx_v2i64_unmasked:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
+; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vsadd.vv v8, v8, v9
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vsadd_vx_v2i64_unmasked:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; RV64-NEXT:    vsadd.vx v8, v8, a0
+; RV64-NEXT:    ret
+  %elt.head = insertelement <2 x i64> poison, i64 %b, i32 0
+  %vb = shufflevector <2 x i64> %elt.head, <2 x i64> poison, <2 x i32> zeroinitializer
+  %head = insertelement <2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i64> @llvm.vp.sadd.sat.v2i64(<2 x i64> %va, <2 x i64> %vb, <2 x i1> %m, i32 %evl)
+  ret <2 x i64> %v
+}
+
+define <2 x i64> @vsadd_vi_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_v2i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <2 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <2 x i64> %elt.head, <2 x i64> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i64> @llvm.vp.sadd.sat.v2i64(<2 x i64> %va, <2 x i64> %vb, <2 x i1> %m, i32 %evl)
+  ret <2 x i64> %v
+}
+
+define <2 x i64> @vsadd_vi_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_v2i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <2 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <2 x i64> %elt.head, <2 x i64> poison, <2 x i32> zeroinitializer
+  %head = insertelement <2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i64> @llvm.vp.sadd.sat.v2i64(<2 x i64> %va, <2 x i64> %vb, <2 x i1> %m, i32 %evl)
+  ret <2 x i64> %v
+}
+
+declare <4 x i64> @llvm.vp.sadd.sat.v4i64(<4 x i64>, <4 x i64>, <4 x i1>, i32)
+
+define <4 x i64> @vsadd_vv_v4i64(<4 x i64> %va, <4 x i64> %b, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_v4i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v10, v0.t
+; CHECK-NEXT:    ret
+  %v = call <4 x i64> @llvm.vp.sadd.sat.v4i64(<4 x i64> %va, <4 x i64> %b, <4 x i1> %m, i32 %evl)
+  ret <4 x i64> %v
+}
+
+define <4 x i64> @vsadd_vv_v4i64_unmasked(<4 x i64> %va, <4 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_v4i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v10
+; CHECK-NEXT:    ret
+  %head = insertelement <4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i64> @llvm.vp.sadd.sat.v4i64(<4 x i64> %va, <4 x i64> %b, <4 x i1> %m, i32 %evl)
+  ret <4 x i64> %v
+}
+
+define <4 x i64> @vsadd_vx_v4i64(<4 x i64> %va, i64 %b, <4 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vsadd_vx_v4i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
+; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vsadd.vv v8, v8, v10, v0.t
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vsadd_vx_v4i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; RV64-NEXT:    vsadd.vx v8, v8, a0, v0.t
+; RV64-NEXT:    ret
+  %elt.head = insertelement <4 x i64> poison, i64 %b, i32 0
+  %vb = shufflevector <4 x i64> %elt.head, <4 x i64> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i64> @llvm.vp.sadd.sat.v4i64(<4 x i64> %va, <4 x i64> %vb, <4 x i1> %m, i32 %evl)
+  ret <4 x i64> %v
+}
+
+define <4 x i64> @vsadd_vx_v4i64_unmasked(<4 x i64> %va, i64 %b, i32 zeroext %evl) {
+; RV32-LABEL: vsadd_vx_v4i64_unmasked:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
+; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vsadd.vv v8, v8, v10
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vsadd_vx_v4i64_unmasked:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; RV64-NEXT:    vsadd.vx v8, v8, a0
+; RV64-NEXT:    ret
+  %elt.head = insertelement <4 x i64> poison, i64 %b, i32 0
+  %vb = shufflevector <4 x i64> %elt.head, <4 x i64> poison, <4 x i32> zeroinitializer
+  %head = insertelement <4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i64> @llvm.vp.sadd.sat.v4i64(<4 x i64> %va, <4 x i64> %vb, <4 x i1> %m, i32 %evl)
+  ret <4 x i64> %v
+}
+
+define <4 x i64> @vsadd_vi_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_v4i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <4 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <4 x i64> %elt.head, <4 x i64> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i64> @llvm.vp.sadd.sat.v4i64(<4 x i64> %va, <4 x i64> %vb, <4 x i1> %m, i32 %evl)
+  ret <4 x i64> %v
+}
+
+define <4 x i64> @vsadd_vi_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_v4i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <4 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <4 x i64> %elt.head, <4 x i64> poison, <4 x i32> zeroinitializer
+  %head = insertelement <4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i64> @llvm.vp.sadd.sat.v4i64(<4 x i64> %va, <4 x i64> %vb, <4 x i1> %m, i32 %evl)
+  ret <4 x i64> %v
+}
+
+declare <8 x i64> @llvm.vp.sadd.sat.v8i64(<8 x i64>, <8 x i64>, <8 x i1>, i32)
+
+define <8 x i64> @vsadd_vv_v8i64(<8 x i64> %va, <8 x i64> %b, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_v8i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v12, v0.t
+; CHECK-NEXT:    ret
+  %v = call <8 x i64> @llvm.vp.sadd.sat.v8i64(<8 x i64> %va, <8 x i64> %b, <8 x i1> %m, i32 %evl)
+  ret <8 x i64> %v
+}
+
+define <8 x i64> @vsadd_vv_v8i64_unmasked(<8 x i64> %va, <8 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_v8i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v12
+; CHECK-NEXT:    ret
+  %head = insertelement <8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i64> @llvm.vp.sadd.sat.v8i64(<8 x i64> %va, <8 x i64> %b, <8 x i1> %m, i32 %evl)
+  ret <8 x i64> %v
+}
+
+define <8 x i64> @vsadd_vx_v8i64(<8 x i64> %va, i64 %b, <8 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vsadd_vx_v8i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
+; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vsadd.vv v8, v8, v12, v0.t
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vsadd_vx_v8i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; RV64-NEXT:    vsadd.vx v8, v8, a0, v0.t
+; RV64-NEXT:    ret
+  %elt.head = insertelement <8 x i64> poison, i64 %b, i32 0
+  %vb = shufflevector <8 x i64> %elt.head, <8 x i64> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i64> @llvm.vp.sadd.sat.v8i64(<8 x i64> %va, <8 x i64> %vb, <8 x i1> %m, i32 %evl)
+  ret <8 x i64> %v
+}
+
+define <8 x i64> @vsadd_vx_v8i64_unmasked(<8 x i64> %va, i64 %b, i32 zeroext %evl) {
+; RV32-LABEL: vsadd_vx_v8i64_unmasked:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
+; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vsadd.vv v8, v8, v12
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vsadd_vx_v8i64_unmasked:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; RV64-NEXT:    vsadd.vx v8, v8, a0
+; RV64-NEXT:    ret
+  %elt.head = insertelement <8 x i64> poison, i64 %b, i32 0
+  %vb = shufflevector <8 x i64> %elt.head, <8 x i64> poison, <8 x i32> zeroinitializer
+  %head = insertelement <8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i64> @llvm.vp.sadd.sat.v8i64(<8 x i64> %va, <8 x i64> %vb, <8 x i1> %m, i32 %evl)
+  ret <8 x i64> %v
+}
+
+define <8 x i64> @vsadd_vi_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_v8i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <8 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <8 x i64> %elt.head, <8 x i64> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i64> @llvm.vp.sadd.sat.v8i64(<8 x i64> %va, <8 x i64> %vb, <8 x i1> %m, i32 %evl)
+  ret <8 x i64> %v
+}
+
+define <8 x i64> @vsadd_vi_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_v8i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <8 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <8 x i64> %elt.head, <8 x i64> poison, <8 x i32> zeroinitializer
+  %head = insertelement <8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i64> @llvm.vp.sadd.sat.v8i64(<8 x i64> %va, <8 x i64> %vb, <8 x i1> %m, i32 %evl)
+  ret <8 x i64> %v
+}
+
+declare <16 x i64> @llvm.vp.sadd.sat.v16i64(<16 x i64>, <16 x i64>, <16 x i1>, i32)
+
+define <16 x i64> @vsadd_vv_v16i64(<16 x i64> %va, <16 x i64> %b, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_v16i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    ret
+  %v = call <16 x i64> @llvm.vp.sadd.sat.v16i64(<16 x i64> %va, <16 x i64> %b, <16 x i1> %m, i32 %evl)
+  ret <16 x i64> %v
+}
+
+define <16 x i64> @vsadd_vv_v16i64_unmasked(<16 x i64> %va, <16 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_v16i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v16
+; CHECK-NEXT:    ret
+  %head = insertelement <16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i64> @llvm.vp.sadd.sat.v16i64(<16 x i64> %va, <16 x i64> %b, <16 x i1> %m, i32 %evl)
+  ret <16 x i64> %v
+}
+
+define <16 x i64> @vsadd_vx_v16i64(<16 x i64> %va, i64 %b, <16 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vsadd_vx_v16i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
+; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vsadd.vv v8, v8, v16, v0.t
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vsadd_vx_v16i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vsadd.vx v8, v8, a0, v0.t
+; RV64-NEXT:    ret
+  %elt.head = insertelement <16 x i64> poison, i64 %b, i32 0
+  %vb = shufflevector <16 x i64> %elt.head, <16 x i64> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i64> @llvm.vp.sadd.sat.v16i64(<16 x i64> %va, <16 x i64> %vb, <16 x i1> %m, i32 %evl)
+  ret <16 x i64> %v
+}
+
+define <16 x i64> @vsadd_vx_v16i64_unmasked(<16 x i64> %va, i64 %b, i32 zeroext %evl) {
+; RV32-LABEL: vsadd_vx_v16i64_unmasked:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
+; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vsadd.vv v8, v8, v16
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vsadd_vx_v16i64_unmasked:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vsadd.vx v8, v8, a0
+; RV64-NEXT:    ret
+  %elt.head = insertelement <16 x i64> poison, i64 %b, i32 0
+  %vb = shufflevector <16 x i64> %elt.head, <16 x i64> poison, <16 x i32> zeroinitializer
+  %head = insertelement <16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i64> @llvm.vp.sadd.sat.v16i64(<16 x i64> %va, <16 x i64> %vb, <16 x i1> %m, i32 %evl)
+  ret <16 x i64> %v
+}
+
+define <16 x i64> @vsadd_vi_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_v16i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <16 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <16 x i64> %elt.head, <16 x i64> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i64> @llvm.vp.sadd.sat.v16i64(<16 x i64> %va, <16 x i64> %vb, <16 x i1> %m, i32 %evl)
+  ret <16 x i64> %v
+}
+
+define <16 x i64> @vsadd_vi_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_v16i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <16 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <16 x i64> %elt.head, <16 x i64> poison, <16 x i32> zeroinitializer
+  %head = insertelement <16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i64> @llvm.vp.sadd.sat.v16i64(<16 x i64> %va, <16 x i64> %vb, <16 x i1> %m, i32 %evl)
+  ret <16 x i64> %v
+}
+
+; Test that split-legalization works as expected.
+
+declare <32 x i64> @llvm.vp.sadd.sat.v32i64(<32 x i64>, <32 x i64>, <32 x i1>, i32)
+
+define <32 x i64> @vsadd_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vsadd_vx_v32i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT:    vslidedown.vi v1, v0, 2
+; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT:    li a2, 16
+; RV32-NEXT:    vmv.v.i v24, -1
+; RV32-NEXT:    mv a1, a0
+; RV32-NEXT:    bltu a0, a2, .LBB108_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    li a1, 16
+; RV32-NEXT:  .LBB108_2:
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vsadd.vv v8, v8, v24, v0.t
+; RV32-NEXT:    addi a1, a0, -16
+; RV32-NEXT:    sltu a0, a0, a1
+; RV32-NEXT:    addi a0, a0, -1
+; RV32-NEXT:    and a0, a0, a1
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vmv1r.v v0, v1
+; RV32-NEXT:    vsadd.vv v16, v16, v24, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vsadd_vx_v32i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV64-NEXT:    li a2, 16
+; RV64-NEXT:    vslidedown.vi v24, v0, 2
+; RV64-NEXT:    mv a1, a0
+; RV64-NEXT:    bltu a0, a2, .LBB108_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    li a1, 16
+; RV64-NEXT:  .LBB108_2:
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vsadd.vi v8, v8, -1, v0.t
+; RV64-NEXT:    addi a1, a0, -16
+; RV64-NEXT:    sltu a0, a0, a1
+; RV64-NEXT:    addi a0, a0, -1
+; RV64-NEXT:    and a0, a0, a1
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT:    vmv1r.v v0, v24
+; RV64-NEXT:    vsadd.vi v16, v16, -1, v0.t
+; RV64-NEXT:    ret
+  %elt.head = insertelement <32 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <32 x i64> %elt.head, <32 x i64> poison, <32 x i32> zeroinitializer
+  %v = call <32 x i64> @llvm.vp.sadd.sat.v32i64(<32 x i64> %va, <32 x i64> %vb, <32 x i1> %m, i32 %evl)
+  ret <32 x i64> %v
+}
+
+define <32 x i64> @vsadd_vi_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
+; RV32-LABEL: vsadd_vi_v32i64_unmasked:
+; RV32:       # %bb.0:
+; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT:    li a2, 16
+; RV32-NEXT:    vmv.v.i v24, -1
+; RV32-NEXT:    mv a1, a0
+; RV32-NEXT:    bltu a0, a2, .LBB109_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    li a1, 16
+; RV32-NEXT:  .LBB109_2:
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vsadd.vv v8, v8, v24
+; RV32-NEXT:    addi a1, a0, -16
+; RV32-NEXT:    sltu a0, a0, a1
+; RV32-NEXT:    addi a0, a0, -1
+; RV32-NEXT:    and a0, a0, a1
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsadd.vv v16, v16, v24
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vsadd_vi_v32i64_unmasked:
+; RV64:       # %bb.0:
+; RV64-NEXT:    li a2, 16
+; RV64-NEXT:    mv a1, a0
+; RV64-NEXT:    bltu a0, a2, .LBB109_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    li a1, 16
+; RV64-NEXT:  .LBB109_2:
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vsadd.vi v8, v8, -1
+; RV64-NEXT:    addi a1, a0, -16
+; RV64-NEXT:    sltu a0, a0, a1
+; RV64-NEXT:    addi a0, a0, -1
+; RV64-NEXT:    and a0, a0, a1
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT:    vsadd.vi v16, v16, -1
+; RV64-NEXT:    ret
+  %elt.head = insertelement <32 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <32 x i64> %elt.head, <32 x i64> poison, <32 x i32> zeroinitializer
+  %head = insertelement <32 x i1> poison, i1 true, i32 0
+  %m = shufflevector <32 x i1> %head, <32 x i1> poison, <32 x i32> zeroinitializer
+  %v = call <32 x i64> @llvm.vp.sadd.sat.v32i64(<32 x i64> %va, <32 x i64> %vb, <32 x i1> %m, i32 %evl)
+  ret <32 x i64> %v
+}
+
+; FIXME: We don't match vsadd.vi on RV32.
+
+define <32 x i64> @vsadd_vx_v32i64_evl12(<32 x i64> %va, <32 x i1> %m) {
+; RV32-LABEL: vsadd_vx_v32i64_evl12:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT:    vslidedown.vi v1, v0, 2
+; RV32-NEXT:    li a0, 32
+; RV32-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.i v24, -1
+; RV32-NEXT:    vsetivli zero, 12, e64, m8, ta, ma
+; RV32-NEXT:    vsadd.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vsetivli zero, 0, e64, m8, ta, ma
+; RV32-NEXT:    vmv1r.v v0, v1
+; RV32-NEXT:    vsadd.vv v16, v16, v24, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vsadd_vx_v32i64_evl12:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV64-NEXT:    vslidedown.vi v24, v0, 2
+; RV64-NEXT:    vsetivli zero, 12, e64, m8, ta, ma
+; RV64-NEXT:    vsadd.vi v8, v8, -1, v0.t
+; RV64-NEXT:    vsetivli zero, 0, e64, m8, ta, ma
+; RV64-NEXT:    vmv1r.v v0, v24
+; RV64-NEXT:    vsadd.vi v16, v16, -1, v0.t
+; RV64-NEXT:    ret
+  %elt.head = insertelement <32 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <32 x i64> %elt.head, <32 x i64> poison, <32 x i32> zeroinitializer
+  %v = call <32 x i64> @llvm.vp.sadd.sat.v32i64(<32 x i64> %va, <32 x i64> %vb, <32 x i1> %m, i32 12)
+  ret <32 x i64> %v
+}
+
+define <32 x i64> @vsadd_vx_v32i64_evl27(<32 x i64> %va, <32 x i1> %m) {
+; RV32-LABEL: vsadd_vx_v32i64_evl27:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT:    vslidedown.vi v1, v0, 2
+; RV32-NEXT:    li a0, 32
+; RV32-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.i v24, -1
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vsadd.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vsetivli zero, 11, e64, m8, ta, ma
+; RV32-NEXT:    vmv1r.v v0, v1
+; RV32-NEXT:    vsadd.vv v16, v16, v24, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vsadd_vx_v32i64_evl27:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV64-NEXT:    vslidedown.vi v24, v0, 2
+; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV64-NEXT:    vsadd.vi v8, v8, -1, v0.t
+; RV64-NEXT:    vsetivli zero, 11, e64, m8, ta, ma
+; RV64-NEXT:    vmv1r.v v0, v24
+; RV64-NEXT:    vsadd.vi v16, v16, -1, v0.t
+; RV64-NEXT:    ret
+  %elt.head = insertelement <32 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <32 x i64> %elt.head, <32 x i64> poison, <32 x i32> zeroinitializer
+  %v = call <32 x i64> @llvm.vp.sadd.sat.v32i64(<32 x i64> %va, <32 x i64> %vb, <32 x i1> %m, i32 27)
+  ret <32 x i64> %v
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu-vp.ll
new file mode 100644
index 0000000..6227f8a
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu-vp.ll
@@ -0,0 +1,1697 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s --check-prefixes=CHECK,RV64
+
+declare <8 x i7> @llvm.vp.uadd.sat.v8i7(<8 x i7>, <8 x i7>, <8 x i1>, i32)
+
+define <8 x i7> @vsaddu_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_v8i7:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, 127
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT:    vand.vx v9, v9, a1
+; CHECK-NEXT:    vand.vx v8, v8, a1
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    vminu.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %v = call <8 x i7> @llvm.vp.uadd.sat.v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 %evl)
+  ret <8 x i7> %v
+}
+
+declare <2 x i8> @llvm.vp.uadd.sat.v2i8(<2 x i8>, <2 x i8>, <2 x i1>, i32)
+
+define <2 x i8> @vsaddu_vv_v2i8(<2 x i8> %va, <2 x i8> %b, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_v2i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <2 x i8> @llvm.vp.uadd.sat.v2i8(<2 x i8> %va, <2 x i8> %b, <2 x i1> %m, i32 %evl)
+  ret <2 x i8> %v
+}
+
+define <2 x i8> @vsaddu_vv_v2i8_unmasked(<2 x i8> %va, <2 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_v2i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i8> @llvm.vp.uadd.sat.v2i8(<2 x i8> %va, <2 x i8> %b, <2 x i1> %m, i32 %evl)
+  ret <2 x i8> %v
+}
+
+define <2 x i8> @vsaddu_vx_v2i8(<2 x i8> %va, i8 %b, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_v2i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT:    vsaddu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <2 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <2 x i8> %elt.head, <2 x i8> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i8> @llvm.vp.uadd.sat.v2i8(<2 x i8> %va, <2 x i8> %vb, <2 x i1> %m, i32 %evl)
+  ret <2 x i8> %v
+}
+
+define <2 x i8> @vsaddu_vx_v2i8_unmasked(<2 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_v2i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT:    vsaddu.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <2 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <2 x i8> %elt.head, <2 x i8> poison, <2 x i32> zeroinitializer
+  %head = insertelement <2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i8> @llvm.vp.uadd.sat.v2i8(<2 x i8> %va, <2 x i8> %vb, <2 x i1> %m, i32 %evl)
+  ret <2 x i8> %v
+}
+
+define <2 x i8> @vsaddu_vi_v2i8(<2 x i8> %va, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_v2i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <2 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <2 x i8> %elt.head, <2 x i8> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i8> @llvm.vp.uadd.sat.v2i8(<2 x i8> %va, <2 x i8> %vb, <2 x i1> %m, i32 %evl)
+  ret <2 x i8> %v
+}
+
+define <2 x i8> @vsaddu_vi_v2i8_unmasked(<2 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_v2i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <2 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <2 x i8> %elt.head, <2 x i8> poison, <2 x i32> zeroinitializer
+  %head = insertelement <2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i8> @llvm.vp.uadd.sat.v2i8(<2 x i8> %va, <2 x i8> %vb, <2 x i1> %m, i32 %evl)
+  ret <2 x i8> %v
+}
+
+declare <4 x i8> @llvm.vp.uadd.sat.v4i8(<4 x i8>, <4 x i8>, <4 x i1>, i32)
+
+define <4 x i8> @vsaddu_vv_v4i8(<4 x i8> %va, <4 x i8> %b, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_v4i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <4 x i8> @llvm.vp.uadd.sat.v4i8(<4 x i8> %va, <4 x i8> %b, <4 x i1> %m, i32 %evl)
+  ret <4 x i8> %v
+}
+
+define <4 x i8> @vsaddu_vv_v4i8_unmasked(<4 x i8> %va, <4 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_v4i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i8> @llvm.vp.uadd.sat.v4i8(<4 x i8> %va, <4 x i8> %b, <4 x i1> %m, i32 %evl)
+  ret <4 x i8> %v
+}
+
+define <4 x i8> @vsaddu_vx_v4i8(<4 x i8> %va, i8 %b, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_v4i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT:    vsaddu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i8> @llvm.vp.uadd.sat.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl)
+  ret <4 x i8> %v
+}
+
+define <4 x i8> @vsaddu_vx_v4i8_commute(<4 x i8> %va, i8 %b, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_v4i8_commute:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT:    vsaddu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i8> @llvm.vp.uadd.sat.v4i8(<4 x i8> %vb, <4 x i8> %va, <4 x i1> %m, i32 %evl)
+  ret <4 x i8> %v
+}
+
+define <4 x i8> @vsaddu_vx_v4i8_unmasked(<4 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_v4i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT:    vsaddu.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer
+  %head = insertelement <4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i8> @llvm.vp.uadd.sat.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl)
+  ret <4 x i8> %v
+}
+
+define <4 x i8> @vsaddu_vi_v4i8(<4 x i8> %va, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_v4i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <4 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i8> @llvm.vp.uadd.sat.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl)
+  ret <4 x i8> %v
+}
+
+define <4 x i8> @vsaddu_vi_v4i8_unmasked(<4 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_v4i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <4 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer
+  %head = insertelement <4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i8> @llvm.vp.uadd.sat.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl)
+  ret <4 x i8> %v
+}
+
+declare <5 x i8> @llvm.vp.uadd.sat.v5i8(<5 x i8>, <5 x i8>, <5 x i1>, i32)
+
+define <5 x i8> @vsaddu_vv_v5i8(<5 x i8> %va, <5 x i8> %b, <5 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_v5i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <5 x i8> @llvm.vp.uadd.sat.v5i8(<5 x i8> %va, <5 x i8> %b, <5 x i1> %m, i32 %evl)
+  ret <5 x i8> %v
+}
+
+define <5 x i8> @vsaddu_vv_v5i8_unmasked(<5 x i8> %va, <5 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_v5i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <5 x i1> poison, i1 true, i32 0
+  %m = shufflevector <5 x i1> %head, <5 x i1> poison, <5 x i32> zeroinitializer
+  %v = call <5 x i8> @llvm.vp.uadd.sat.v5i8(<5 x i8> %va, <5 x i8> %b, <5 x i1> %m, i32 %evl)
+  ret <5 x i8> %v
+}
+
+define <5 x i8> @vsaddu_vx_v5i8(<5 x i8> %va, i8 %b, <5 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_v5i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vsaddu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <5 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <5 x i8> %elt.head, <5 x i8> poison, <5 x i32> zeroinitializer
+  %v = call <5 x i8> @llvm.vp.uadd.sat.v5i8(<5 x i8> %va, <5 x i8> %vb, <5 x i1> %m, i32 %evl)
+  ret <5 x i8> %v
+}
+
+define <5 x i8> @vsaddu_vx_v5i8_unmasked(<5 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_v5i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vsaddu.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <5 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <5 x i8> %elt.head, <5 x i8> poison, <5 x i32> zeroinitializer
+  %head = insertelement <5 x i1> poison, i1 true, i32 0
+  %m = shufflevector <5 x i1> %head, <5 x i1> poison, <5 x i32> zeroinitializer
+  %v = call <5 x i8> @llvm.vp.uadd.sat.v5i8(<5 x i8> %va, <5 x i8> %vb, <5 x i1> %m, i32 %evl)
+  ret <5 x i8> %v
+}
+
+define <5 x i8> @vsaddu_vi_v5i8(<5 x i8> %va, <5 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_v5i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <5 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <5 x i8> %elt.head, <5 x i8> poison, <5 x i32> zeroinitializer
+  %v = call <5 x i8> @llvm.vp.uadd.sat.v5i8(<5 x i8> %va, <5 x i8> %vb, <5 x i1> %m, i32 %evl)
+  ret <5 x i8> %v
+}
+
+define <5 x i8> @vsaddu_vi_v5i8_unmasked(<5 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_v5i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <5 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <5 x i8> %elt.head, <5 x i8> poison, <5 x i32> zeroinitializer
+  %head = insertelement <5 x i1> poison, i1 true, i32 0
+  %m = shufflevector <5 x i1> %head, <5 x i1> poison, <5 x i32> zeroinitializer
+  %v = call <5 x i8> @llvm.vp.uadd.sat.v5i8(<5 x i8> %va, <5 x i8> %vb, <5 x i1> %m, i32 %evl)
+  ret <5 x i8> %v
+}
+
+declare <8 x i8> @llvm.vp.uadd.sat.v8i8(<8 x i8>, <8 x i8>, <8 x i1>, i32)
+
+define <8 x i8> @vsaddu_vv_v8i8(<8 x i8> %va, <8 x i8> %b, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_v8i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <8 x i8> @llvm.vp.uadd.sat.v8i8(<8 x i8> %va, <8 x i8> %b, <8 x i1> %m, i32 %evl)
+  ret <8 x i8> %v
+}
+
+define <8 x i8> @vsaddu_vv_v8i8_unmasked(<8 x i8> %va, <8 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_v8i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i8> @llvm.vp.uadd.sat.v8i8(<8 x i8> %va, <8 x i8> %b, <8 x i1> %m, i32 %evl)
+  ret <8 x i8> %v
+}
+
+define <8 x i8> @vsaddu_vx_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_v8i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vsaddu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i8> @llvm.vp.uadd.sat.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl)
+  ret <8 x i8> %v
+}
+
+define <8 x i8> @vsaddu_vx_v8i8_unmasked(<8 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_v8i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vsaddu.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer
+  %head = insertelement <8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i8> @llvm.vp.uadd.sat.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl)
+  ret <8 x i8> %v
+}
+
+define <8 x i8> @vsaddu_vi_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_v8i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <8 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i8> @llvm.vp.uadd.sat.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl)
+  ret <8 x i8> %v
+}
+
+define <8 x i8> @vsaddu_vi_v8i8_unmasked(<8 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_v8i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <8 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer
+  %head = insertelement <8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i8> @llvm.vp.uadd.sat.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl)
+  ret <8 x i8> %v
+}
+
+declare <16 x i8> @llvm.vp.uadd.sat.v16i8(<16 x i8>, <16 x i8>, <16 x i1>, i32)
+
+define <16 x i8> @vsaddu_vv_v16i8(<16 x i8> %va, <16 x i8> %b, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_v16i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <16 x i8> @llvm.vp.uadd.sat.v16i8(<16 x i8> %va, <16 x i8> %b, <16 x i1> %m, i32 %evl)
+  ret <16 x i8> %v
+}
+
+define <16 x i8> @vsaddu_vv_v16i8_unmasked(<16 x i8> %va, <16 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_v16i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i8> @llvm.vp.uadd.sat.v16i8(<16 x i8> %va, <16 x i8> %b, <16 x i1> %m, i32 %evl)
+  ret <16 x i8> %v
+}
+
+define <16 x i8> @vsaddu_vx_v16i8(<16 x i8> %va, i8 %b, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_v16i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vsaddu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <16 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <16 x i8> %elt.head, <16 x i8> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i8> @llvm.vp.uadd.sat.v16i8(<16 x i8> %va, <16 x i8> %vb, <16 x i1> %m, i32 %evl)
+  ret <16 x i8> %v
+}
+
+define <16 x i8> @vsaddu_vx_v16i8_unmasked(<16 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_v16i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vsaddu.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <16 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <16 x i8> %elt.head, <16 x i8> poison, <16 x i32> zeroinitializer
+  %head = insertelement <16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i8> @llvm.vp.uadd.sat.v16i8(<16 x i8> %va, <16 x i8> %vb, <16 x i1> %m, i32 %evl)
+  ret <16 x i8> %v
+}
+
+define <16 x i8> @vsaddu_vi_v16i8(<16 x i8> %va, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_v16i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <16 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <16 x i8> %elt.head, <16 x i8> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i8> @llvm.vp.uadd.sat.v16i8(<16 x i8> %va, <16 x i8> %vb, <16 x i1> %m, i32 %evl)
+  ret <16 x i8> %v
+}
+
+define <16 x i8> @vsaddu_vi_v16i8_unmasked(<16 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_v16i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <16 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <16 x i8> %elt.head, <16 x i8> poison, <16 x i32> zeroinitializer
+  %head = insertelement <16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i8> @llvm.vp.uadd.sat.v16i8(<16 x i8> %va, <16 x i8> %vb, <16 x i1> %m, i32 %evl)
+  ret <16 x i8> %v
+}
+
+declare <256 x i8> @llvm.vp.uadd.sat.v258i8(<256 x i8>, <256 x i8>, <256 x i1>, i32)
+
+define <256 x i8> @vsaddu_vi_v258i8(<256 x i8> %va, <256 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_v258i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v24, v0
+; CHECK-NEXT:    li a2, 128
+; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
+; CHECK-NEXT:    vlm.v v0, (a0)
+; CHECK-NEXT:    addi a0, a1, -128
+; CHECK-NEXT:    sltu a3, a1, a0
+; CHECK-NEXT:    addi a3, a3, -1
+; CHECK-NEXT:    and a0, a3, a0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT:    vsaddu.vi v16, v16, -1, v0.t
+; CHECK-NEXT:    bltu a1, a2, .LBB32_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    li a1, 128
+; CHECK-NEXT:  .LBB32_2:
+; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <256 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <256 x i8> %elt.head, <256 x i8> poison, <256 x i32> zeroinitializer
+  %v = call <256 x i8> @llvm.vp.uadd.sat.v258i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> %m, i32 %evl)
+  ret <256 x i8> %v
+}
+
+define <256 x i8> @vsaddu_vi_v258i8_unmasked(<256 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_v258i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a2, 128
+; CHECK-NEXT:    mv a1, a0
+; CHECK-NEXT:    bltu a0, a2, .LBB33_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    li a1, 128
+; CHECK-NEXT:  .LBB33_2:
+; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1
+; CHECK-NEXT:    addi a1, a0, -128
+; CHECK-NEXT:    sltu a0, a0, a1
+; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT:    vsaddu.vi v16, v16, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <256 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <256 x i8> %elt.head, <256 x i8> poison, <256 x i32> zeroinitializer
+  %head = insertelement <256 x i1> poison, i1 true, i32 0
+  %m = shufflevector <256 x i1> %head, <256 x i1> poison, <256 x i32> zeroinitializer
+  %v = call <256 x i8> @llvm.vp.uadd.sat.v258i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> %m, i32 %evl)
+  ret <256 x i8> %v
+}
+
+; Test splitting when the %evl is a known constant.
+
+define <256 x i8> @vsaddu_vi_v258i8_evl129(<256 x i8> %va, <256 x i1> %m) {
+; CHECK-LABEL: vsaddu_vi_v258i8_evl129:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, 128
+; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT:    vlm.v v24, (a0)
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    vsetivli zero, 1, e8, m8, ta, ma
+; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vsaddu.vi v16, v16, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <256 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <256 x i8> %elt.head, <256 x i8> poison, <256 x i32> zeroinitializer
+  %v = call <256 x i8> @llvm.vp.uadd.sat.v258i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> %m, i32 129)
+  ret <256 x i8> %v
+}
+
+; FIXME: The upper half is doing nothing.
+
+define <256 x i8> @vsaddu_vi_v258i8_evl128(<256 x i8> %va, <256 x i1> %m) {
+; CHECK-LABEL: vsaddu_vi_v258i8_evl128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, 128
+; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT:    vlm.v v24, (a0)
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    vsetivli zero, 0, e8, m8, ta, ma
+; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vsaddu.vi v16, v16, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <256 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <256 x i8> %elt.head, <256 x i8> poison, <256 x i32> zeroinitializer
+  %v = call <256 x i8> @llvm.vp.uadd.sat.v258i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> %m, i32 128)
+  ret <256 x i8> %v
+}
+
+declare <2 x i16> @llvm.vp.uadd.sat.v2i16(<2 x i16>, <2 x i16>, <2 x i1>, i32)
+
+define <2 x i16> @vsaddu_vv_v2i16(<2 x i16> %va, <2 x i16> %b, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_v2i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <2 x i16> @llvm.vp.uadd.sat.v2i16(<2 x i16> %va, <2 x i16> %b, <2 x i1> %m, i32 %evl)
+  ret <2 x i16> %v
+}
+
+define <2 x i16> @vsaddu_vv_v2i16_unmasked(<2 x i16> %va, <2 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_v2i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i16> @llvm.vp.uadd.sat.v2i16(<2 x i16> %va, <2 x i16> %b, <2 x i1> %m, i32 %evl)
+  ret <2 x i16> %v
+}
+
+define <2 x i16> @vsaddu_vx_v2i16(<2 x i16> %va, i16 %b, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_v2i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT:    vsaddu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i16> @llvm.vp.uadd.sat.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl)
+  ret <2 x i16> %v
+}
+
+define <2 x i16> @vsaddu_vx_v2i16_unmasked(<2 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_v2i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT:    vsaddu.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer
+  %head = insertelement <2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i16> @llvm.vp.uadd.sat.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl)
+  ret <2 x i16> %v
+}
+
+define <2 x i16> @vsaddu_vi_v2i16(<2 x i16> %va, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_v2i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <2 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i16> @llvm.vp.uadd.sat.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl)
+  ret <2 x i16> %v
+}
+
+define <2 x i16> @vsaddu_vi_v2i16_unmasked(<2 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_v2i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <2 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer
+  %head = insertelement <2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i16> @llvm.vp.uadd.sat.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl)
+  ret <2 x i16> %v
+}
+
+declare <4 x i16> @llvm.vp.uadd.sat.v4i16(<4 x i16>, <4 x i16>, <4 x i1>, i32)
+
+define <4 x i16> @vsaddu_vv_v4i16(<4 x i16> %va, <4 x i16> %b, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_v4i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <4 x i16> @llvm.vp.uadd.sat.v4i16(<4 x i16> %va, <4 x i16> %b, <4 x i1> %m, i32 %evl)
+  ret <4 x i16> %v
+}
+
+define <4 x i16> @vsaddu_vv_v4i16_unmasked(<4 x i16> %va, <4 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_v4i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i16> @llvm.vp.uadd.sat.v4i16(<4 x i16> %va, <4 x i16> %b, <4 x i1> %m, i32 %evl)
+  ret <4 x i16> %v
+}
+
+define <4 x i16> @vsaddu_vx_v4i16(<4 x i16> %va, i16 %b, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_v4i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT:    vsaddu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i16> @llvm.vp.uadd.sat.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl)
+  ret <4 x i16> %v
+}
+
+define <4 x i16> @vsaddu_vx_v4i16_unmasked(<4 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_v4i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT:    vsaddu.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer
+  %head = insertelement <4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i16> @llvm.vp.uadd.sat.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl)
+  ret <4 x i16> %v
+}
+
+define <4 x i16> @vsaddu_vi_v4i16(<4 x i16> %va, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_v4i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <4 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i16> @llvm.vp.uadd.sat.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl)
+  ret <4 x i16> %v
+}
+
+define <4 x i16> @vsaddu_vi_v4i16_unmasked(<4 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_v4i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <4 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer
+  %head = insertelement <4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i16> @llvm.vp.uadd.sat.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl)
+  ret <4 x i16> %v
+}
+
+declare <8 x i16> @llvm.vp.uadd.sat.v8i16(<8 x i16>, <8 x i16>, <8 x i1>, i32)
+
+define <8 x i16> @vsaddu_vv_v8i16(<8 x i16> %va, <8 x i16> %b, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_v8i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <8 x i16> @llvm.vp.uadd.sat.v8i16(<8 x i16> %va, <8 x i16> %b, <8 x i1> %m, i32 %evl)
+  ret <8 x i16> %v
+}
+
+define <8 x i16> @vsaddu_vv_v8i16_unmasked(<8 x i16> %va, <8 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_v8i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i16> @llvm.vp.uadd.sat.v8i16(<8 x i16> %va, <8 x i16> %b, <8 x i1> %m, i32 %evl)
+  ret <8 x i16> %v
+}
+
+define <8 x i16> @vsaddu_vx_v8i16(<8 x i16> %va, i16 %b, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_v8i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vsaddu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <8 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <8 x i16> %elt.head, <8 x i16> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i16> @llvm.vp.uadd.sat.v8i16(<8 x i16> %va, <8 x i16> %vb, <8 x i1> %m, i32 %evl)
+  ret <8 x i16> %v
+}
+
+define <8 x i16> @vsaddu_vx_v8i16_unmasked(<8 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_v8i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vsaddu.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <8 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <8 x i16> %elt.head, <8 x i16> poison, <8 x i32> zeroinitializer
+  %head = insertelement <8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i16> @llvm.vp.uadd.sat.v8i16(<8 x i16> %va, <8 x i16> %vb, <8 x i1> %m, i32 %evl)
+  ret <8 x i16> %v
+}
+
+define <8 x i16> @vsaddu_vi_v8i16(<8 x i16> %va, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_v8i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <8 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <8 x i16> %elt.head, <8 x i16> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i16> @llvm.vp.uadd.sat.v8i16(<8 x i16> %va, <8 x i16> %vb, <8 x i1> %m, i32 %evl)
+  ret <8 x i16> %v
+}
+
+define <8 x i16> @vsaddu_vi_v8i16_unmasked(<8 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_v8i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <8 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <8 x i16> %elt.head, <8 x i16> poison, <8 x i32> zeroinitializer
+  %head = insertelement <8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i16> @llvm.vp.uadd.sat.v8i16(<8 x i16> %va, <8 x i16> %vb, <8 x i1> %m, i32 %evl)
+  ret <8 x i16> %v
+}
+
+declare <16 x i16> @llvm.vp.uadd.sat.v16i16(<16 x i16>, <16 x i16>, <16 x i1>, i32)
+
+define <16 x i16> @vsaddu_vv_v16i16(<16 x i16> %va, <16 x i16> %b, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_v16i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v10, v0.t
+; CHECK-NEXT:    ret
+  %v = call <16 x i16> @llvm.vp.uadd.sat.v16i16(<16 x i16> %va, <16 x i16> %b, <16 x i1> %m, i32 %evl)
+  ret <16 x i16> %v
+}
+
+define <16 x i16> @vsaddu_vv_v16i16_unmasked(<16 x i16> %va, <16 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_v16i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v10
+; CHECK-NEXT:    ret
+  %head = insertelement <16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i16> @llvm.vp.uadd.sat.v16i16(<16 x i16> %va, <16 x i16> %b, <16 x i1> %m, i32 %evl)
+  ret <16 x i16> %v
+}
+
+define <16 x i16> @vsaddu_vx_v16i16(<16 x i16> %va, i16 %b, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_v16i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT:    vsaddu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <16 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <16 x i16> %elt.head, <16 x i16> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i16> @llvm.vp.uadd.sat.v16i16(<16 x i16> %va, <16 x i16> %vb, <16 x i1> %m, i32 %evl)
+  ret <16 x i16> %v
+}
+
+define <16 x i16> @vsaddu_vx_v16i16_unmasked(<16 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_v16i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT:    vsaddu.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <16 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <16 x i16> %elt.head, <16 x i16> poison, <16 x i32> zeroinitializer
+  %head = insertelement <16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i16> @llvm.vp.uadd.sat.v16i16(<16 x i16> %va, <16 x i16> %vb, <16 x i1> %m, i32 %evl)
+  ret <16 x i16> %v
+}
+
+define <16 x i16> @vsaddu_vi_v16i16(<16 x i16> %va, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_v16i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <16 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <16 x i16> %elt.head, <16 x i16> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i16> @llvm.vp.uadd.sat.v16i16(<16 x i16> %va, <16 x i16> %vb, <16 x i1> %m, i32 %evl)
+  ret <16 x i16> %v
+}
+
+define <16 x i16> @vsaddu_vi_v16i16_unmasked(<16 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_v16i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <16 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <16 x i16> %elt.head, <16 x i16> poison, <16 x i32> zeroinitializer
+  %head = insertelement <16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i16> @llvm.vp.uadd.sat.v16i16(<16 x i16> %va, <16 x i16> %vb, <16 x i1> %m, i32 %evl)
+  ret <16 x i16> %v
+}
+
+declare <2 x i32> @llvm.vp.uadd.sat.v2i32(<2 x i32>, <2 x i32>, <2 x i1>, i32)
+
+define <2 x i32> @vsaddu_vv_v2i32(<2 x i32> %va, <2 x i32> %b, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_v2i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <2 x i32> @llvm.vp.uadd.sat.v2i32(<2 x i32> %va, <2 x i32> %b, <2 x i1> %m, i32 %evl)
+  ret <2 x i32> %v
+}
+
+define <2 x i32> @vsaddu_vv_v2i32_unmasked(<2 x i32> %va, <2 x i32> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_v2i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i32> @llvm.vp.uadd.sat.v2i32(<2 x i32> %va, <2 x i32> %b, <2 x i1> %m, i32 %evl)
+  ret <2 x i32> %v
+}
+
+define <2 x i32> @vsaddu_vx_v2i32(<2 x i32> %va, i32 %b, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_v2i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT:    vsaddu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i32> @llvm.vp.uadd.sat.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl)
+  ret <2 x i32> %v
+}
+
+define <2 x i32> @vsaddu_vx_v2i32_unmasked(<2 x i32> %va, i32 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_v2i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT:    vsaddu.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer
+  %head = insertelement <2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i32> @llvm.vp.uadd.sat.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl)
+  ret <2 x i32> %v
+}
+
+define <2 x i32> @vsaddu_vi_v2i32(<2 x i32> %va, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_v2i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <2 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i32> @llvm.vp.uadd.sat.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl)
+  ret <2 x i32> %v
+}
+
+define <2 x i32> @vsaddu_vi_v2i32_unmasked(<2 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_v2i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <2 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer
+  %head = insertelement <2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i32> @llvm.vp.uadd.sat.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl)
+  ret <2 x i32> %v
+}
+
+declare <4 x i32> @llvm.vp.uadd.sat.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
+
+define <4 x i32> @vsaddu_vv_v4i32(<4 x i32> %va, <4 x i32> %b, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_v4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <4 x i32> @llvm.vp.uadd.sat.v4i32(<4 x i32> %va, <4 x i32> %b, <4 x i1> %m, i32 %evl)
+  ret <4 x i32> %v
+}
+
+define <4 x i32> @vsaddu_vv_v4i32_unmasked(<4 x i32> %va, <4 x i32> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_v4i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i32> @llvm.vp.uadd.sat.v4i32(<4 x i32> %va, <4 x i32> %b, <4 x i1> %m, i32 %evl)
+  ret <4 x i32> %v
+}
+
+define <4 x i32> @vsaddu_vx_v4i32(<4 x i32> %va, i32 %b, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_v4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT:    vsaddu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <4 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <4 x i32> %elt.head, <4 x i32> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i32> @llvm.vp.uadd.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 %evl)
+  ret <4 x i32> %v
+}
+
+define <4 x i32> @vsaddu_vx_v4i32_unmasked(<4 x i32> %va, i32 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_v4i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT:    vsaddu.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <4 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <4 x i32> %elt.head, <4 x i32> poison, <4 x i32> zeroinitializer
+  %head = insertelement <4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i32> @llvm.vp.uadd.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 %evl)
+  ret <4 x i32> %v
+}
+
+define <4 x i32> @vsaddu_vi_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_v4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <4 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <4 x i32> %elt.head, <4 x i32> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i32> @llvm.vp.uadd.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 %evl)
+  ret <4 x i32> %v
+}
+
+define <4 x i32> @vsaddu_vi_v4i32_unmasked(<4 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_v4i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <4 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <4 x i32> %elt.head, <4 x i32> poison, <4 x i32> zeroinitializer
+  %head = insertelement <4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i32> @llvm.vp.uadd.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 %evl)
+  ret <4 x i32> %v
+}
+
+declare <8 x i32> @llvm.vp.uadd.sat.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32)
+
+define <8 x i32> @vsaddu_vv_v8i32(<8 x i32> %va, <8 x i32> %b, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_v8i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v10, v0.t
+; CHECK-NEXT:    ret
+  %v = call <8 x i32> @llvm.vp.uadd.sat.v8i32(<8 x i32> %va, <8 x i32> %b, <8 x i1> %m, i32 %evl)
+  ret <8 x i32> %v
+}
+
+define <8 x i32> @vsaddu_vv_v8i32_unmasked(<8 x i32> %va, <8 x i32> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_v8i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v10
+; CHECK-NEXT:    ret
+  %head = insertelement <8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i32> @llvm.vp.uadd.sat.v8i32(<8 x i32> %va, <8 x i32> %b, <8 x i1> %m, i32 %evl)
+  ret <8 x i32> %v
+}
+
+define <8 x i32> @vsaddu_vx_v8i32(<8 x i32> %va, i32 %b, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_v8i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT:    vsaddu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <8 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <8 x i32> %elt.head, <8 x i32> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i32> @llvm.vp.uadd.sat.v8i32(<8 x i32> %va, <8 x i32> %vb, <8 x i1> %m, i32 %evl)
+  ret <8 x i32> %v
+}
+
+define <8 x i32> @vsaddu_vx_v8i32_unmasked(<8 x i32> %va, i32 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_v8i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT:    vsaddu.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <8 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <8 x i32> %elt.head, <8 x i32> poison, <8 x i32> zeroinitializer
+  %head = insertelement <8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i32> @llvm.vp.uadd.sat.v8i32(<8 x i32> %va, <8 x i32> %vb, <8 x i1> %m, i32 %evl)
+  ret <8 x i32> %v
+}
+
+define <8 x i32> @vsaddu_vi_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_v8i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <8 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <8 x i32> %elt.head, <8 x i32> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i32> @llvm.vp.uadd.sat.v8i32(<8 x i32> %va, <8 x i32> %vb, <8 x i1> %m, i32 %evl)
+  ret <8 x i32> %v
+}
+
+define <8 x i32> @vsaddu_vi_v8i32_unmasked(<8 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_v8i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <8 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <8 x i32> %elt.head, <8 x i32> poison, <8 x i32> zeroinitializer
+  %head = insertelement <8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i32> @llvm.vp.uadd.sat.v8i32(<8 x i32> %va, <8 x i32> %vb, <8 x i1> %m, i32 %evl)
+  ret <8 x i32> %v
+}
+
+declare <16 x i32> @llvm.vp.uadd.sat.v16i32(<16 x i32>, <16 x i32>, <16 x i1>, i32)
+
+define <16 x i32> @vsaddu_vv_v16i32(<16 x i32> %va, <16 x i32> %b, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_v16i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v12, v0.t
+; CHECK-NEXT:    ret
+  %v = call <16 x i32> @llvm.vp.uadd.sat.v16i32(<16 x i32> %va, <16 x i32> %b, <16 x i1> %m, i32 %evl)
+  ret <16 x i32> %v
+}
+
+define <16 x i32> @vsaddu_vv_v16i32_unmasked(<16 x i32> %va, <16 x i32> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_v16i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v12
+; CHECK-NEXT:    ret
+  %head = insertelement <16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i32> @llvm.vp.uadd.sat.v16i32(<16 x i32> %va, <16 x i32> %b, <16 x i1> %m, i32 %evl)
+  ret <16 x i32> %v
+}
+
+define <16 x i32> @vsaddu_vx_v16i32(<16 x i32> %va, i32 %b, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_v16i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT:    vsaddu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <16 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <16 x i32> %elt.head, <16 x i32> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i32> @llvm.vp.uadd.sat.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i1> %m, i32 %evl)
+  ret <16 x i32> %v
+}
+
+define <16 x i32> @vsaddu_vx_v16i32_unmasked(<16 x i32> %va, i32 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_v16i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT:    vsaddu.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <16 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <16 x i32> %elt.head, <16 x i32> poison, <16 x i32> zeroinitializer
+  %head = insertelement <16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i32> @llvm.vp.uadd.sat.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i1> %m, i32 %evl)
+  ret <16 x i32> %v
+}
+
+define <16 x i32> @vsaddu_vi_v16i32(<16 x i32> %va, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_v16i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <16 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <16 x i32> %elt.head, <16 x i32> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i32> @llvm.vp.uadd.sat.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i1> %m, i32 %evl)
+  ret <16 x i32> %v
+}
+
+define <16 x i32> @vsaddu_vi_v16i32_unmasked(<16 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_v16i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <16 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <16 x i32> %elt.head, <16 x i32> poison, <16 x i32> zeroinitializer
+  %head = insertelement <16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i32> @llvm.vp.uadd.sat.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i1> %m, i32 %evl)
+  ret <16 x i32> %v
+}
+
+declare <2 x i64> @llvm.vp.uadd.sat.v2i64(<2 x i64>, <2 x i64>, <2 x i1>, i32)
+
+define <2 x i64> @vsaddu_vv_v2i64(<2 x i64> %va, <2 x i64> %b, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_v2i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <2 x i64> @llvm.vp.uadd.sat.v2i64(<2 x i64> %va, <2 x i64> %b, <2 x i1> %m, i32 %evl)
+  ret <2 x i64> %v
+}
+
+define <2 x i64> @vsaddu_vv_v2i64_unmasked(<2 x i64> %va, <2 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_v2i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i64> @llvm.vp.uadd.sat.v2i64(<2 x i64> %va, <2 x i64> %b, <2 x i1> %m, i32 %evl)
+  ret <2 x i64> %v
+}
+
+define <2 x i64> @vsaddu_vx_v2i64(<2 x i64> %va, i64 %b, <2 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vsaddu_vx_v2i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
+; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vsaddu.vv v8, v8, v9, v0.t
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vsaddu_vx_v2i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; RV64-NEXT:    vsaddu.vx v8, v8, a0, v0.t
+; RV64-NEXT:    ret
+  %elt.head = insertelement <2 x i64> poison, i64 %b, i32 0
+  %vb = shufflevector <2 x i64> %elt.head, <2 x i64> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i64> @llvm.vp.uadd.sat.v2i64(<2 x i64> %va, <2 x i64> %vb, <2 x i1> %m, i32 %evl)
+  ret <2 x i64> %v
+}
+
+define <2 x i64> @vsaddu_vx_v2i64_unmasked(<2 x i64> %va, i64 %b, i32 zeroext %evl) {
+; RV32-LABEL: vsaddu_vx_v2i64_unmasked:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
+; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vsaddu.vv v8, v8, v9
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vsaddu_vx_v2i64_unmasked:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; RV64-NEXT:    vsaddu.vx v8, v8, a0
+; RV64-NEXT:    ret
+  %elt.head = insertelement <2 x i64> poison, i64 %b, i32 0
+  %vb = shufflevector <2 x i64> %elt.head, <2 x i64> poison, <2 x i32> zeroinitializer
+  %head = insertelement <2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i64> @llvm.vp.uadd.sat.v2i64(<2 x i64> %va, <2 x i64> %vb, <2 x i1> %m, i32 %evl)
+  ret <2 x i64> %v
+}
+
+define <2 x i64> @vsaddu_vi_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_v2i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <2 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <2 x i64> %elt.head, <2 x i64> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i64> @llvm.vp.uadd.sat.v2i64(<2 x i64> %va, <2 x i64> %vb, <2 x i1> %m, i32 %evl)
+  ret <2 x i64> %v
+}
+
+define <2 x i64> @vsaddu_vi_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_v2i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <2 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <2 x i64> %elt.head, <2 x i64> poison, <2 x i32> zeroinitializer
+  %head = insertelement <2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i64> @llvm.vp.uadd.sat.v2i64(<2 x i64> %va, <2 x i64> %vb, <2 x i1> %m, i32 %evl)
+  ret <2 x i64> %v
+}
+
+declare <4 x i64> @llvm.vp.uadd.sat.v4i64(<4 x i64>, <4 x i64>, <4 x i1>, i32)
+
+define <4 x i64> @vsaddu_vv_v4i64(<4 x i64> %va, <4 x i64> %b, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_v4i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v10, v0.t
+; CHECK-NEXT:    ret
+  %v = call <4 x i64> @llvm.vp.uadd.sat.v4i64(<4 x i64> %va, <4 x i64> %b, <4 x i1> %m, i32 %evl)
+  ret <4 x i64> %v
+}
+
+define <4 x i64> @vsaddu_vv_v4i64_unmasked(<4 x i64> %va, <4 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_v4i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v10
+; CHECK-NEXT:    ret
+  %head = insertelement <4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i64> @llvm.vp.uadd.sat.v4i64(<4 x i64> %va, <4 x i64> %b, <4 x i1> %m, i32 %evl)
+  ret <4 x i64> %v
+}
+
+define <4 x i64> @vsaddu_vx_v4i64(<4 x i64> %va, i64 %b, <4 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vsaddu_vx_v4i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
+; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vsaddu.vv v8, v8, v10, v0.t
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vsaddu_vx_v4i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; RV64-NEXT:    vsaddu.vx v8, v8, a0, v0.t
+; RV64-NEXT:    ret
+  %elt.head = insertelement <4 x i64> poison, i64 %b, i32 0
+  %vb = shufflevector <4 x i64> %elt.head, <4 x i64> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i64> @llvm.vp.uadd.sat.v4i64(<4 x i64> %va, <4 x i64> %vb, <4 x i1> %m, i32 %evl)
+  ret <4 x i64> %v
+}
+
+define <4 x i64> @vsaddu_vx_v4i64_unmasked(<4 x i64> %va, i64 %b, i32 zeroext %evl) {
+; RV32-LABEL: vsaddu_vx_v4i64_unmasked:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
+; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vsaddu.vv v8, v8, v10
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vsaddu_vx_v4i64_unmasked:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; RV64-NEXT:    vsaddu.vx v8, v8, a0
+; RV64-NEXT:    ret
+  %elt.head = insertelement <4 x i64> poison, i64 %b, i32 0
+  %vb = shufflevector <4 x i64> %elt.head, <4 x i64> poison, <4 x i32> zeroinitializer
+  %head = insertelement <4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i64> @llvm.vp.uadd.sat.v4i64(<4 x i64> %va, <4 x i64> %vb, <4 x i1> %m, i32 %evl)
+  ret <4 x i64> %v
+}
+
+define <4 x i64> @vsaddu_vi_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_v4i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <4 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <4 x i64> %elt.head, <4 x i64> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i64> @llvm.vp.uadd.sat.v4i64(<4 x i64> %va, <4 x i64> %vb, <4 x i1> %m, i32 %evl)
+  ret <4 x i64> %v
+}
+
+define <4 x i64> @vsaddu_vi_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_v4i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <4 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <4 x i64> %elt.head, <4 x i64> poison, <4 x i32> zeroinitializer
+  %head = insertelement <4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i64> @llvm.vp.uadd.sat.v4i64(<4 x i64> %va, <4 x i64> %vb, <4 x i1> %m, i32 %evl)
+  ret <4 x i64> %v
+}
+
+declare <8 x i64> @llvm.vp.uadd.sat.v8i64(<8 x i64>, <8 x i64>, <8 x i1>, i32)
+
+define <8 x i64> @vsaddu_vv_v8i64(<8 x i64> %va, <8 x i64> %b, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_v8i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v12, v0.t
+; CHECK-NEXT:    ret
+  %v = call <8 x i64> @llvm.vp.uadd.sat.v8i64(<8 x i64> %va, <8 x i64> %b, <8 x i1> %m, i32 %evl)
+  ret <8 x i64> %v
+}
+
+define <8 x i64> @vsaddu_vv_v8i64_unmasked(<8 x i64> %va, <8 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_v8i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v12
+; CHECK-NEXT:    ret
+  %head = insertelement <8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i64> @llvm.vp.uadd.sat.v8i64(<8 x i64> %va, <8 x i64> %b, <8 x i1> %m, i32 %evl)
+  ret <8 x i64> %v
+}
+
+define <8 x i64> @vsaddu_vx_v8i64(<8 x i64> %va, i64 %b, <8 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vsaddu_vx_v8i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
+; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vsaddu.vv v8, v8, v12, v0.t
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vsaddu_vx_v8i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; RV64-NEXT:    vsaddu.vx v8, v8, a0, v0.t
+; RV64-NEXT:    ret
+  %elt.head = insertelement <8 x i64> poison, i64 %b, i32 0
+  %vb = shufflevector <8 x i64> %elt.head, <8 x i64> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i64> @llvm.vp.uadd.sat.v8i64(<8 x i64> %va, <8 x i64> %vb, <8 x i1> %m, i32 %evl)
+  ret <8 x i64> %v
+}
+
+define <8 x i64> @vsaddu_vx_v8i64_unmasked(<8 x i64> %va, i64 %b, i32 zeroext %evl) {
+; RV32-LABEL: vsaddu_vx_v8i64_unmasked:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
+; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vsaddu.vv v8, v8, v12
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vsaddu_vx_v8i64_unmasked:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; RV64-NEXT:    vsaddu.vx v8, v8, a0
+; RV64-NEXT:    ret
+  %elt.head = insertelement <8 x i64> poison, i64 %b, i32 0
+  %vb = shufflevector <8 x i64> %elt.head, <8 x i64> poison, <8 x i32> zeroinitializer
+  %head = insertelement <8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i64> @llvm.vp.uadd.sat.v8i64(<8 x i64> %va, <8 x i64> %vb, <8 x i1> %m, i32 %evl)
+  ret <8 x i64> %v
+}
+
+define <8 x i64> @vsaddu_vi_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_v8i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <8 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <8 x i64> %elt.head, <8 x i64> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i64> @llvm.vp.uadd.sat.v8i64(<8 x i64> %va, <8 x i64> %vb, <8 x i1> %m, i32 %evl)
+  ret <8 x i64> %v
+}
+
+define <8 x i64> @vsaddu_vi_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_v8i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <8 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <8 x i64> %elt.head, <8 x i64> poison, <8 x i32> zeroinitializer
+  %head = insertelement <8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i64> @llvm.vp.uadd.sat.v8i64(<8 x i64> %va, <8 x i64> %vb, <8 x i1> %m, i32 %evl)
+  ret <8 x i64> %v
+}
+
+declare <16 x i64> @llvm.vp.uadd.sat.v16i64(<16 x i64>, <16 x i64>, <16 x i1>, i32)
+
+define <16 x i64> @vsaddu_vv_v16i64(<16 x i64> %va, <16 x i64> %b, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_v16i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    ret
+  %v = call <16 x i64> @llvm.vp.uadd.sat.v16i64(<16 x i64> %va, <16 x i64> %b, <16 x i1> %m, i32 %evl)
+  ret <16 x i64> %v
+}
+
+define <16 x i64> @vsaddu_vv_v16i64_unmasked(<16 x i64> %va, <16 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_v16i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v16
+; CHECK-NEXT:    ret
+  %head = insertelement <16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i64> @llvm.vp.uadd.sat.v16i64(<16 x i64> %va, <16 x i64> %b, <16 x i1> %m, i32 %evl)
+  ret <16 x i64> %v
+}
+
+define <16 x i64> @vsaddu_vx_v16i64(<16 x i64> %va, i64 %b, <16 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vsaddu_vx_v16i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
+; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vsaddu.vv v8, v8, v16, v0.t
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vsaddu_vx_v16i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vsaddu.vx v8, v8, a0, v0.t
+; RV64-NEXT:    ret
+  %elt.head = insertelement <16 x i64> poison, i64 %b, i32 0
+  %vb = shufflevector <16 x i64> %elt.head, <16 x i64> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i64> @llvm.vp.uadd.sat.v16i64(<16 x i64> %va, <16 x i64> %vb, <16 x i1> %m, i32 %evl)
+  ret <16 x i64> %v
+}
+
+define <16 x i64> @vsaddu_vx_v16i64_unmasked(<16 x i64> %va, i64 %b, i32 zeroext %evl) {
+; RV32-LABEL: vsaddu_vx_v16i64_unmasked:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
+; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vsaddu.vv v8, v8, v16
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vsaddu_vx_v16i64_unmasked:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vsaddu.vx v8, v8, a0
+; RV64-NEXT:    ret
+  %elt.head = insertelement <16 x i64> poison, i64 %b, i32 0
+  %vb = shufflevector <16 x i64> %elt.head, <16 x i64> poison, <16 x i32> zeroinitializer
+  %head = insertelement <16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i64> @llvm.vp.uadd.sat.v16i64(<16 x i64> %va, <16 x i64> %vb, <16 x i1> %m, i32 %evl)
+  ret <16 x i64> %v
+}
+
+define <16 x i64> @vsaddu_vi_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_v16i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <16 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <16 x i64> %elt.head, <16 x i64> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i64> @llvm.vp.uadd.sat.v16i64(<16 x i64> %va, <16 x i64> %vb, <16 x i1> %m, i32 %evl)
+  ret <16 x i64> %v
+}
+
+define <16 x i64> @vsaddu_vi_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_v16i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <16 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <16 x i64> %elt.head, <16 x i64> poison, <16 x i32> zeroinitializer
+  %head = insertelement <16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i64> @llvm.vp.uadd.sat.v16i64(<16 x i64> %va, <16 x i64> %vb, <16 x i1> %m, i32 %evl)
+  ret <16 x i64> %v
+}
+
+; Test that split-legalization works as expected.
+
+declare <32 x i64> @llvm.vp.uadd.sat.v32i64(<32 x i64>, <32 x i64>, <32 x i1>, i32)
+
+define <32 x i64> @vsaddu_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vsaddu_vx_v32i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT:    vslidedown.vi v1, v0, 2
+; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT:    li a2, 16
+; RV32-NEXT:    vmv.v.i v24, -1
+; RV32-NEXT:    mv a1, a0
+; RV32-NEXT:    bltu a0, a2, .LBB108_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    li a1, 16
+; RV32-NEXT:  .LBB108_2:
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vsaddu.vv v8, v8, v24, v0.t
+; RV32-NEXT:    addi a1, a0, -16
+; RV32-NEXT:    sltu a0, a0, a1
+; RV32-NEXT:    addi a0, a0, -1
+; RV32-NEXT:    and a0, a0, a1
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vmv1r.v v0, v1
+; RV32-NEXT:    vsaddu.vv v16, v16, v24, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vsaddu_vx_v32i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV64-NEXT:    li a2, 16
+; RV64-NEXT:    vslidedown.vi v24, v0, 2
+; RV64-NEXT:    mv a1, a0
+; RV64-NEXT:    bltu a0, a2, .LBB108_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    li a1, 16
+; RV64-NEXT:  .LBB108_2:
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vsaddu.vi v8, v8, -1, v0.t
+; RV64-NEXT:    addi a1, a0, -16
+; RV64-NEXT:    sltu a0, a0, a1
+; RV64-NEXT:    addi a0, a0, -1
+; RV64-NEXT:    and a0, a0, a1
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT:    vmv1r.v v0, v24
+; RV64-NEXT:    vsaddu.vi v16, v16, -1, v0.t
+; RV64-NEXT:    ret
+  %elt.head = insertelement <32 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <32 x i64> %elt.head, <32 x i64> poison, <32 x i32> zeroinitializer
+  %v = call <32 x i64> @llvm.vp.uadd.sat.v32i64(<32 x i64> %va, <32 x i64> %vb, <32 x i1> %m, i32 %evl)
+  ret <32 x i64> %v
+}
+
+define <32 x i64> @vsaddu_vi_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
+; RV32-LABEL: vsaddu_vi_v32i64_unmasked:
+; RV32:       # %bb.0:
+; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT:    li a2, 16
+; RV32-NEXT:    vmv.v.i v24, -1
+; RV32-NEXT:    mv a1, a0
+; RV32-NEXT:    bltu a0, a2, .LBB109_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    li a1, 16
+; RV32-NEXT:  .LBB109_2:
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vsaddu.vv v8, v8, v24
+; RV32-NEXT:    addi a1, a0, -16
+; RV32-NEXT:    sltu a0, a0, a1
+; RV32-NEXT:    addi a0, a0, -1
+; RV32-NEXT:    and a0, a0, a1
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsaddu.vv v16, v16, v24
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vsaddu_vi_v32i64_unmasked:
+; RV64:       # %bb.0:
+; RV64-NEXT:    li a2, 16
+; RV64-NEXT:    mv a1, a0
+; RV64-NEXT:    bltu a0, a2, .LBB109_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    li a1, 16
+; RV64-NEXT:  .LBB109_2:
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vsaddu.vi v8, v8, -1
+; RV64-NEXT:    addi a1, a0, -16
+; RV64-NEXT:    sltu a0, a0, a1
+; RV64-NEXT:    addi a0, a0, -1
+; RV64-NEXT:    and a0, a0, a1
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT:    vsaddu.vi v16, v16, -1
+; RV64-NEXT:    ret
+  %elt.head = insertelement <32 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <32 x i64> %elt.head, <32 x i64> poison, <32 x i32> zeroinitializer
+  %head = insertelement <32 x i1> poison, i1 true, i32 0
+  %m = shufflevector <32 x i1> %head, <32 x i1> poison, <32 x i32> zeroinitializer
+  %v = call <32 x i64> @llvm.vp.uadd.sat.v32i64(<32 x i64> %va, <32 x i64> %vb, <32 x i1> %m, i32 %evl)
+  ret <32 x i64> %v
+}
+
+; FIXME: We don't match vsaddu.vi on RV32.
+
+define <32 x i64> @vsaddu_vx_v32i64_evl12(<32 x i64> %va, <32 x i1> %m) {
+; RV32-LABEL: vsaddu_vx_v32i64_evl12:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT:    vslidedown.vi v1, v0, 2
+; RV32-NEXT:    li a0, 32
+; RV32-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.i v24, -1
+; RV32-NEXT:    vsetivli zero, 12, e64, m8, ta, ma
+; RV32-NEXT:    vsaddu.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vsetivli zero, 0, e64, m8, ta, ma
+; RV32-NEXT:    vmv1r.v v0, v1
+; RV32-NEXT:    vsaddu.vv v16, v16, v24, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vsaddu_vx_v32i64_evl12:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV64-NEXT:    vslidedown.vi v24, v0, 2
+; RV64-NEXT:    vsetivli zero, 12, e64, m8, ta, ma
+; RV64-NEXT:    vsaddu.vi v8, v8, -1, v0.t
+; RV64-NEXT:    vsetivli zero, 0, e64, m8, ta, ma
+; RV64-NEXT:    vmv1r.v v0, v24
+; RV64-NEXT:    vsaddu.vi v16, v16, -1, v0.t
+; RV64-NEXT:    ret
+  %elt.head = insertelement <32 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <32 x i64> %elt.head, <32 x i64> poison, <32 x i32> zeroinitializer
+  %v = call <32 x i64> @llvm.vp.uadd.sat.v32i64(<32 x i64> %va, <32 x i64> %vb, <32 x i1> %m, i32 12)
+  ret <32 x i64> %v
+}
+
+define <32 x i64> @vsaddu_vx_v32i64_evl27(<32 x i64> %va, <32 x i1> %m) {
+; RV32-LABEL: vsaddu_vx_v32i64_evl27:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT:    vslidedown.vi v1, v0, 2
+; RV32-NEXT:    li a0, 32
+; RV32-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.i v24, -1
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vsaddu.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vsetivli zero, 11, e64, m8, ta, ma
+; RV32-NEXT:    vmv1r.v v0, v1
+; RV32-NEXT:    vsaddu.vv v16, v16, v24, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vsaddu_vx_v32i64_evl27:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV64-NEXT:    vslidedown.vi v24, v0, 2
+; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV64-NEXT:    vsaddu.vi v8, v8, -1, v0.t
+; RV64-NEXT:    vsetivli zero, 11, e64, m8, ta, ma
+; RV64-NEXT:    vmv1r.v v0, v24
+; RV64-NEXT:    vsaddu.vi v16, v16, -1, v0.t
+; RV64-NEXT:    ret
+  %elt.head = insertelement <32 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <32 x i64> %elt.head, <32 x i64> poison, <32 x i32> zeroinitializer
+  %v = call <32 x i64> @llvm.vp.uadd.sat.v32i64(<32 x i64> %va, <32 x i64> %vb, <32 x i1> %m, i32 27)
+  ret <32 x i64> %v
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll
new file mode 100644
index 0000000..6360cf4
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll
@@ -0,0 +1,1745 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s --check-prefixes=CHECK,RV64
+
+declare <8 x i7> @llvm.vp.ssub.sat.v8i7(<8 x i7>, <8 x i7>, <8 x i1>, i32)
+
+define <8 x i7> @vssub_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_v8i7:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT:    vadd.vv v9, v9, v9
+; CHECK-NEXT:    vsra.vi v9, v9, 1
+; CHECK-NEXT:    vadd.vv v8, v8, v8
+; CHECK-NEXT:    vsra.vi v8, v8, 1
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    li a0, 63
+; CHECK-NEXT:    vmin.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    li a0, 192
+; CHECK-NEXT:    vmax.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %v = call <8 x i7> @llvm.vp.ssub.sat.v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 %evl)
+  ret <8 x i7> %v
+}
+
+declare <2 x i8> @llvm.vp.ssub.sat.v2i8(<2 x i8>, <2 x i8>, <2 x i1>, i32)
+
+define <2 x i8> @vssub_vv_v2i8(<2 x i8> %va, <2 x i8> %b, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_v2i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <2 x i8> @llvm.vp.ssub.sat.v2i8(<2 x i8> %va, <2 x i8> %b, <2 x i1> %m, i32 %evl)
+  ret <2 x i8> %v
+}
+
+define <2 x i8> @vssub_vv_v2i8_unmasked(<2 x i8> %va, <2 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_v2i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i8> @llvm.vp.ssub.sat.v2i8(<2 x i8> %va, <2 x i8> %b, <2 x i1> %m, i32 %evl)
+  ret <2 x i8> %v
+}
+
+define <2 x i8> @vssub_vx_v2i8(<2 x i8> %va, i8 %b, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_v2i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <2 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <2 x i8> %elt.head, <2 x i8> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i8> @llvm.vp.ssub.sat.v2i8(<2 x i8> %va, <2 x i8> %vb, <2 x i1> %m, i32 %evl)
+  ret <2 x i8> %v
+}
+
+define <2 x i8> @vssub_vx_v2i8_unmasked(<2 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_v2i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <2 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <2 x i8> %elt.head, <2 x i8> poison, <2 x i32> zeroinitializer
+  %head = insertelement <2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i8> @llvm.vp.ssub.sat.v2i8(<2 x i8> %va, <2 x i8> %vb, <2 x i1> %m, i32 %evl)
+  ret <2 x i8> %v
+}
+
+define <2 x i8> @vssub_vi_v2i8(<2 x i8> %va, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_v2i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <2 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <2 x i8> %elt.head, <2 x i8> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i8> @llvm.vp.ssub.sat.v2i8(<2 x i8> %va, <2 x i8> %vb, <2 x i1> %m, i32 %evl)
+  ret <2 x i8> %v
+}
+
+define <2 x i8> @vssub_vi_v2i8_unmasked(<2 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_v2i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <2 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <2 x i8> %elt.head, <2 x i8> poison, <2 x i32> zeroinitializer
+  %head = insertelement <2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i8> @llvm.vp.ssub.sat.v2i8(<2 x i8> %va, <2 x i8> %vb, <2 x i1> %m, i32 %evl)
+  ret <2 x i8> %v
+}
+
+declare <4 x i8> @llvm.vp.ssub.sat.v4i8(<4 x i8>, <4 x i8>, <4 x i1>, i32)
+
+define <4 x i8> @vssub_vv_v4i8(<4 x i8> %va, <4 x i8> %b, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_v4i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <4 x i8> @llvm.vp.ssub.sat.v4i8(<4 x i8> %va, <4 x i8> %b, <4 x i1> %m, i32 %evl)
+  ret <4 x i8> %v
+}
+
+define <4 x i8> @vssub_vv_v4i8_unmasked(<4 x i8> %va, <4 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_v4i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i8> @llvm.vp.ssub.sat.v4i8(<4 x i8> %va, <4 x i8> %b, <4 x i1> %m, i32 %evl)
+  ret <4 x i8> %v
+}
+
+define <4 x i8> @vssub_vx_v4i8(<4 x i8> %va, i8 %b, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_v4i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i8> @llvm.vp.ssub.sat.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl)
+  ret <4 x i8> %v
+}
+
+define <4 x i8> @vssub_vx_v4i8_commute(<4 x i8> %va, i8 %b, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_v4i8_commute:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v9, v8, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i8> @llvm.vp.ssub.sat.v4i8(<4 x i8> %vb, <4 x i8> %va, <4 x i1> %m, i32 %evl)
+  ret <4 x i8> %v
+}
+
+define <4 x i8> @vssub_vx_v4i8_unmasked(<4 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_v4i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer
+  %head = insertelement <4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i8> @llvm.vp.ssub.sat.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl)
+  ret <4 x i8> %v
+}
+
+define <4 x i8> @vssub_vi_v4i8(<4 x i8> %va, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_v4i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <4 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i8> @llvm.vp.ssub.sat.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl)
+  ret <4 x i8> %v
+}
+
+define <4 x i8> @vssub_vi_v4i8_unmasked(<4 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_v4i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <4 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer
+  %head = insertelement <4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i8> @llvm.vp.ssub.sat.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl)
+  ret <4 x i8> %v
+}
+
+declare <5 x i8> @llvm.vp.ssub.sat.v5i8(<5 x i8>, <5 x i8>, <5 x i1>, i32)
+
+define <5 x i8> @vssub_vv_v5i8(<5 x i8> %va, <5 x i8> %b, <5 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_v5i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <5 x i8> @llvm.vp.ssub.sat.v5i8(<5 x i8> %va, <5 x i8> %b, <5 x i1> %m, i32 %evl)
+  ret <5 x i8> %v
+}
+
+define <5 x i8> @vssub_vv_v5i8_unmasked(<5 x i8> %va, <5 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_v5i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <5 x i1> poison, i1 true, i32 0
+  %m = shufflevector <5 x i1> %head, <5 x i1> poison, <5 x i32> zeroinitializer
+  %v = call <5 x i8> @llvm.vp.ssub.sat.v5i8(<5 x i8> %va, <5 x i8> %b, <5 x i1> %m, i32 %evl)
+  ret <5 x i8> %v
+}
+
+define <5 x i8> @vssub_vx_v5i8(<5 x i8> %va, i8 %b, <5 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_v5i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <5 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <5 x i8> %elt.head, <5 x i8> poison, <5 x i32> zeroinitializer
+  %v = call <5 x i8> @llvm.vp.ssub.sat.v5i8(<5 x i8> %va, <5 x i8> %vb, <5 x i1> %m, i32 %evl)
+  ret <5 x i8> %v
+}
+
+define <5 x i8> @vssub_vx_v5i8_unmasked(<5 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_v5i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <5 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <5 x i8> %elt.head, <5 x i8> poison, <5 x i32> zeroinitializer
+  %head = insertelement <5 x i1> poison, i1 true, i32 0
+  %m = shufflevector <5 x i1> %head, <5 x i1> poison, <5 x i32> zeroinitializer
+  %v = call <5 x i8> @llvm.vp.ssub.sat.v5i8(<5 x i8> %va, <5 x i8> %vb, <5 x i1> %m, i32 %evl)
+  ret <5 x i8> %v
+}
+
+define <5 x i8> @vssub_vi_v5i8(<5 x i8> %va, <5 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_v5i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <5 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <5 x i8> %elt.head, <5 x i8> poison, <5 x i32> zeroinitializer
+  %v = call <5 x i8> @llvm.vp.ssub.sat.v5i8(<5 x i8> %va, <5 x i8> %vb, <5 x i1> %m, i32 %evl)
+  ret <5 x i8> %v
+}
+
+define <5 x i8> @vssub_vi_v5i8_unmasked(<5 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_v5i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <5 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <5 x i8> %elt.head, <5 x i8> poison, <5 x i32> zeroinitializer
+  %head = insertelement <5 x i1> poison, i1 true, i32 0
+  %m = shufflevector <5 x i1> %head, <5 x i1> poison, <5 x i32> zeroinitializer
+  %v = call <5 x i8> @llvm.vp.ssub.sat.v5i8(<5 x i8> %va, <5 x i8> %vb, <5 x i1> %m, i32 %evl)
+  ret <5 x i8> %v
+}
+
+declare <8 x i8> @llvm.vp.ssub.sat.v8i8(<8 x i8>, <8 x i8>, <8 x i1>, i32)
+
+define <8 x i8> @vssub_vv_v8i8(<8 x i8> %va, <8 x i8> %b, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_v8i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <8 x i8> @llvm.vp.ssub.sat.v8i8(<8 x i8> %va, <8 x i8> %b, <8 x i1> %m, i32 %evl)
+  ret <8 x i8> %v
+}
+
+define <8 x i8> @vssub_vv_v8i8_unmasked(<8 x i8> %va, <8 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_v8i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i8> @llvm.vp.ssub.sat.v8i8(<8 x i8> %va, <8 x i8> %b, <8 x i1> %m, i32 %evl)
+  ret <8 x i8> %v
+}
+
+define <8 x i8> @vssub_vx_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_v8i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i8> @llvm.vp.ssub.sat.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl)
+  ret <8 x i8> %v
+}
+
+define <8 x i8> @vssub_vx_v8i8_unmasked(<8 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_v8i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer
+  %head = insertelement <8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i8> @llvm.vp.ssub.sat.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl)
+  ret <8 x i8> %v
+}
+
+define <8 x i8> @vssub_vi_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_v8i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <8 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i8> @llvm.vp.ssub.sat.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl)
+  ret <8 x i8> %v
+}
+
+define <8 x i8> @vssub_vi_v8i8_unmasked(<8 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_v8i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <8 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer
+  %head = insertelement <8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i8> @llvm.vp.ssub.sat.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl)
+  ret <8 x i8> %v
+}
+
+declare <16 x i8> @llvm.vp.ssub.sat.v16i8(<16 x i8>, <16 x i8>, <16 x i1>, i32)
+
+define <16 x i8> @vssub_vv_v16i8(<16 x i8> %va, <16 x i8> %b, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_v16i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <16 x i8> @llvm.vp.ssub.sat.v16i8(<16 x i8> %va, <16 x i8> %b, <16 x i1> %m, i32 %evl)
+  ret <16 x i8> %v
+}
+
+define <16 x i8> @vssub_vv_v16i8_unmasked(<16 x i8> %va, <16 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_v16i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i8> @llvm.vp.ssub.sat.v16i8(<16 x i8> %va, <16 x i8> %b, <16 x i1> %m, i32 %evl)
+  ret <16 x i8> %v
+}
+
+define <16 x i8> @vssub_vx_v16i8(<16 x i8> %va, i8 %b, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_v16i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <16 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <16 x i8> %elt.head, <16 x i8> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i8> @llvm.vp.ssub.sat.v16i8(<16 x i8> %va, <16 x i8> %vb, <16 x i1> %m, i32 %evl)
+  ret <16 x i8> %v
+}
+
+define <16 x i8> @vssub_vx_v16i8_unmasked(<16 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_v16i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <16 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <16 x i8> %elt.head, <16 x i8> poison, <16 x i32> zeroinitializer
+  %head = insertelement <16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i8> @llvm.vp.ssub.sat.v16i8(<16 x i8> %va, <16 x i8> %vb, <16 x i1> %m, i32 %evl)
+  ret <16 x i8> %v
+}
+
+define <16 x i8> @vssub_vi_v16i8(<16 x i8> %va, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_v16i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <16 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <16 x i8> %elt.head, <16 x i8> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i8> @llvm.vp.ssub.sat.v16i8(<16 x i8> %va, <16 x i8> %vb, <16 x i1> %m, i32 %evl)
+  ret <16 x i8> %v
+}
+
+define <16 x i8> @vssub_vi_v16i8_unmasked(<16 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_v16i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <16 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <16 x i8> %elt.head, <16 x i8> poison, <16 x i32> zeroinitializer
+  %head = insertelement <16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i8> @llvm.vp.ssub.sat.v16i8(<16 x i8> %va, <16 x i8> %vb, <16 x i1> %m, i32 %evl)
+  ret <16 x i8> %v
+}
+
+declare <256 x i8> @llvm.vp.ssub.sat.v258i8(<256 x i8>, <256 x i8>, <256 x i1>, i32)
+
+define <256 x i8> @vssub_vi_v258i8(<256 x i8> %va, <256 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_v258i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v24, v0
+; CHECK-NEXT:    li a2, 128
+; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
+; CHECK-NEXT:    vlm.v v0, (a0)
+; CHECK-NEXT:    addi a0, a1, -128
+; CHECK-NEXT:    sltu a3, a1, a0
+; CHECK-NEXT:    addi a3, a3, -1
+; CHECK-NEXT:    and a3, a3, a0
+; CHECK-NEXT:    li a0, -1
+; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
+; CHECK-NEXT:    vssub.vx v16, v16, a0, v0.t
+; CHECK-NEXT:    bltu a1, a2, .LBB32_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    li a1, 128
+; CHECK-NEXT:  .LBB32_2:
+; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vssub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <256 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <256 x i8> %elt.head, <256 x i8> poison, <256 x i32> zeroinitializer
+  %v = call <256 x i8> @llvm.vp.ssub.sat.v258i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> %m, i32 %evl)
+  ret <256 x i8> %v
+}
+
+define <256 x i8> @vssub_vi_v258i8_unmasked(<256 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_v258i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a2, 128
+; CHECK-NEXT:    mv a1, a0
+; CHECK-NEXT:    bltu a0, a2, .LBB33_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    li a1, 128
+; CHECK-NEXT:  .LBB33_2:
+; CHECK-NEXT:    li a2, -1
+; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a2
+; CHECK-NEXT:    addi a1, a0, -128
+; CHECK-NEXT:    sltu a0, a0, a1
+; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT:    vssub.vx v16, v16, a2
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <256 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <256 x i8> %elt.head, <256 x i8> poison, <256 x i32> zeroinitializer
+  %head = insertelement <256 x i1> poison, i1 true, i32 0
+  %m = shufflevector <256 x i1> %head, <256 x i1> poison, <256 x i32> zeroinitializer
+  %v = call <256 x i8> @llvm.vp.ssub.sat.v258i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> %m, i32 %evl)
+  ret <256 x i8> %v
+}
+
+; Test splitting when the %evl is a known constant.
+
+define <256 x i8> @vssub_vi_v258i8_evl129(<256 x i8> %va, <256 x i1> %m) {
+; CHECK-LABEL: vssub_vi_v258i8_evl129:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, 128
+; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT:    vlm.v v24, (a0)
+; CHECK-NEXT:    li a0, -1
+; CHECK-NEXT:    vssub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vsetivli zero, 1, e8, m8, ta, ma
+; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vssub.vx v16, v16, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <256 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <256 x i8> %elt.head, <256 x i8> poison, <256 x i32> zeroinitializer
+  %v = call <256 x i8> @llvm.vp.ssub.sat.v258i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> %m, i32 129)
+  ret <256 x i8> %v
+}
+
+; FIXME: The upper half is doing nothing.
+
+define <256 x i8> @vssub_vi_v258i8_evl128(<256 x i8> %va, <256 x i1> %m) {
+; CHECK-LABEL: vssub_vi_v258i8_evl128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, 128
+; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT:    vlm.v v24, (a0)
+; CHECK-NEXT:    li a0, -1
+; CHECK-NEXT:    vssub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vsetivli zero, 0, e8, m8, ta, ma
+; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vssub.vx v16, v16, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <256 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <256 x i8> %elt.head, <256 x i8> poison, <256 x i32> zeroinitializer
+  %v = call <256 x i8> @llvm.vp.ssub.sat.v258i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> %m, i32 128)
+  ret <256 x i8> %v
+}
+
+declare <2 x i16> @llvm.vp.ssub.sat.v2i16(<2 x i16>, <2 x i16>, <2 x i1>, i32)
+
+define <2 x i16> @vssub_vv_v2i16(<2 x i16> %va, <2 x i16> %b, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_v2i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <2 x i16> @llvm.vp.ssub.sat.v2i16(<2 x i16> %va, <2 x i16> %b, <2 x i1> %m, i32 %evl)
+  ret <2 x i16> %v
+}
+
+define <2 x i16> @vssub_vv_v2i16_unmasked(<2 x i16> %va, <2 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_v2i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i16> @llvm.vp.ssub.sat.v2i16(<2 x i16> %va, <2 x i16> %b, <2 x i1> %m, i32 %evl)
+  ret <2 x i16> %v
+}
+
+define <2 x i16> @vssub_vx_v2i16(<2 x i16> %va, i16 %b, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_v2i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i16> @llvm.vp.ssub.sat.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl)
+  ret <2 x i16> %v
+}
+
+define <2 x i16> @vssub_vx_v2i16_unmasked(<2 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_v2i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer
+  %head = insertelement <2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i16> @llvm.vp.ssub.sat.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl)
+  ret <2 x i16> %v
+}
+
+define <2 x i16> @vssub_vi_v2i16(<2 x i16> %va, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_v2i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <2 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i16> @llvm.vp.ssub.sat.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl)
+  ret <2 x i16> %v
+}
+
+define <2 x i16> @vssub_vi_v2i16_unmasked(<2 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_v2i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <2 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer
+  %head = insertelement <2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i16> @llvm.vp.ssub.sat.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl)
+  ret <2 x i16> %v
+}
+
+declare <4 x i16> @llvm.vp.ssub.sat.v4i16(<4 x i16>, <4 x i16>, <4 x i1>, i32)
+
+define <4 x i16> @vssub_vv_v4i16(<4 x i16> %va, <4 x i16> %b, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_v4i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <4 x i16> @llvm.vp.ssub.sat.v4i16(<4 x i16> %va, <4 x i16> %b, <4 x i1> %m, i32 %evl)
+  ret <4 x i16> %v
+}
+
+define <4 x i16> @vssub_vv_v4i16_unmasked(<4 x i16> %va, <4 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_v4i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i16> @llvm.vp.ssub.sat.v4i16(<4 x i16> %va, <4 x i16> %b, <4 x i1> %m, i32 %evl)
+  ret <4 x i16> %v
+}
+
+define <4 x i16> @vssub_vx_v4i16(<4 x i16> %va, i16 %b, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_v4i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i16> @llvm.vp.ssub.sat.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl)
+  ret <4 x i16> %v
+}
+
+define <4 x i16> @vssub_vx_v4i16_unmasked(<4 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_v4i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer
+  %head = insertelement <4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i16> @llvm.vp.ssub.sat.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl)
+  ret <4 x i16> %v
+}
+
+define <4 x i16> @vssub_vi_v4i16(<4 x i16> %va, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_v4i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <4 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i16> @llvm.vp.ssub.sat.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl)
+  ret <4 x i16> %v
+}
+
+define <4 x i16> @vssub_vi_v4i16_unmasked(<4 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_v4i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <4 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer
+  %head = insertelement <4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i16> @llvm.vp.ssub.sat.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl)
+  ret <4 x i16> %v
+}
+
+declare <8 x i16> @llvm.vp.ssub.sat.v8i16(<8 x i16>, <8 x i16>, <8 x i1>, i32)
+
+define <8 x i16> @vssub_vv_v8i16(<8 x i16> %va, <8 x i16> %b, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_v8i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <8 x i16> @llvm.vp.ssub.sat.v8i16(<8 x i16> %va, <8 x i16> %b, <8 x i1> %m, i32 %evl)
+  ret <8 x i16> %v
+}
+
+define <8 x i16> @vssub_vv_v8i16_unmasked(<8 x i16> %va, <8 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_v8i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i16> @llvm.vp.ssub.sat.v8i16(<8 x i16> %va, <8 x i16> %b, <8 x i1> %m, i32 %evl)
+  ret <8 x i16> %v
+}
+
+define <8 x i16> @vssub_vx_v8i16(<8 x i16> %va, i16 %b, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_v8i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <8 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <8 x i16> %elt.head, <8 x i16> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i16> @llvm.vp.ssub.sat.v8i16(<8 x i16> %va, <8 x i16> %vb, <8 x i1> %m, i32 %evl)
+  ret <8 x i16> %v
+}
+
+define <8 x i16> @vssub_vx_v8i16_unmasked(<8 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_v8i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <8 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <8 x i16> %elt.head, <8 x i16> poison, <8 x i32> zeroinitializer
+  %head = insertelement <8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i16> @llvm.vp.ssub.sat.v8i16(<8 x i16> %va, <8 x i16> %vb, <8 x i1> %m, i32 %evl)
+  ret <8 x i16> %v
+}
+
+define <8 x i16> @vssub_vi_v8i16(<8 x i16> %va, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_v8i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <8 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <8 x i16> %elt.head, <8 x i16> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i16> @llvm.vp.ssub.sat.v8i16(<8 x i16> %va, <8 x i16> %vb, <8 x i1> %m, i32 %evl)
+  ret <8 x i16> %v
+}
+
+define <8 x i16> @vssub_vi_v8i16_unmasked(<8 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_v8i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <8 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <8 x i16> %elt.head, <8 x i16> poison, <8 x i32> zeroinitializer
+  %head = insertelement <8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i16> @llvm.vp.ssub.sat.v8i16(<8 x i16> %va, <8 x i16> %vb, <8 x i1> %m, i32 %evl)
+  ret <8 x i16> %v
+}
+
+declare <16 x i16> @llvm.vp.ssub.sat.v16i16(<16 x i16>, <16 x i16>, <16 x i1>, i32)
+
+define <16 x i16> @vssub_vv_v16i16(<16 x i16> %va, <16 x i16> %b, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_v16i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v10, v0.t
+; CHECK-NEXT:    ret
+  %v = call <16 x i16> @llvm.vp.ssub.sat.v16i16(<16 x i16> %va, <16 x i16> %b, <16 x i1> %m, i32 %evl)
+  ret <16 x i16> %v
+}
+
+define <16 x i16> @vssub_vv_v16i16_unmasked(<16 x i16> %va, <16 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_v16i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v10
+; CHECK-NEXT:    ret
+  %head = insertelement <16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i16> @llvm.vp.ssub.sat.v16i16(<16 x i16> %va, <16 x i16> %b, <16 x i1> %m, i32 %evl)
+  ret <16 x i16> %v
+}
+
+define <16 x i16> @vssub_vx_v16i16(<16 x i16> %va, i16 %b, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_v16i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <16 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <16 x i16> %elt.head, <16 x i16> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i16> @llvm.vp.ssub.sat.v16i16(<16 x i16> %va, <16 x i16> %vb, <16 x i1> %m, i32 %evl)
+  ret <16 x i16> %v
+}
+
+define <16 x i16> @vssub_vx_v16i16_unmasked(<16 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_v16i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <16 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <16 x i16> %elt.head, <16 x i16> poison, <16 x i32> zeroinitializer
+  %head = insertelement <16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i16> @llvm.vp.ssub.sat.v16i16(<16 x i16> %va, <16 x i16> %vb, <16 x i1> %m, i32 %evl)
+  ret <16 x i16> %v
+}
+
+define <16 x i16> @vssub_vi_v16i16(<16 x i16> %va, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_v16i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <16 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <16 x i16> %elt.head, <16 x i16> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i16> @llvm.vp.ssub.sat.v16i16(<16 x i16> %va, <16 x i16> %vb, <16 x i1> %m, i32 %evl)
+  ret <16 x i16> %v
+}
+
+define <16 x i16> @vssub_vi_v16i16_unmasked(<16 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_v16i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <16 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <16 x i16> %elt.head, <16 x i16> poison, <16 x i32> zeroinitializer
+  %head = insertelement <16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i16> @llvm.vp.ssub.sat.v16i16(<16 x i16> %va, <16 x i16> %vb, <16 x i1> %m, i32 %evl)
+  ret <16 x i16> %v
+}
+
+declare <2 x i32> @llvm.vp.ssub.sat.v2i32(<2 x i32>, <2 x i32>, <2 x i1>, i32)
+
+define <2 x i32> @vssub_vv_v2i32(<2 x i32> %va, <2 x i32> %b, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_v2i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <2 x i32> @llvm.vp.ssub.sat.v2i32(<2 x i32> %va, <2 x i32> %b, <2 x i1> %m, i32 %evl)
+  ret <2 x i32> %v
+}
+
+define <2 x i32> @vssub_vv_v2i32_unmasked(<2 x i32> %va, <2 x i32> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_v2i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i32> @llvm.vp.ssub.sat.v2i32(<2 x i32> %va, <2 x i32> %b, <2 x i1> %m, i32 %evl)
+  ret <2 x i32> %v
+}
+
+define <2 x i32> @vssub_vx_v2i32(<2 x i32> %va, i32 %b, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_v2i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i32> @llvm.vp.ssub.sat.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl)
+  ret <2 x i32> %v
+}
+
+define <2 x i32> @vssub_vx_v2i32_unmasked(<2 x i32> %va, i32 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_v2i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer
+  %head = insertelement <2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i32> @llvm.vp.ssub.sat.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl)
+  ret <2 x i32> %v
+}
+
+define <2 x i32> @vssub_vi_v2i32(<2 x i32> %va, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_v2i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <2 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i32> @llvm.vp.ssub.sat.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl)
+  ret <2 x i32> %v
+}
+
+define <2 x i32> @vssub_vi_v2i32_unmasked(<2 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_v2i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <2 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer
+  %head = insertelement <2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i32> @llvm.vp.ssub.sat.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl)
+  ret <2 x i32> %v
+}
+
+declare <4 x i32> @llvm.vp.ssub.sat.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
+
+define <4 x i32> @vssub_vv_v4i32(<4 x i32> %va, <4 x i32> %b, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_v4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <4 x i32> @llvm.vp.ssub.sat.v4i32(<4 x i32> %va, <4 x i32> %b, <4 x i1> %m, i32 %evl)
+  ret <4 x i32> %v
+}
+
+define <4 x i32> @vssub_vv_v4i32_unmasked(<4 x i32> %va, <4 x i32> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_v4i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i32> @llvm.vp.ssub.sat.v4i32(<4 x i32> %va, <4 x i32> %b, <4 x i1> %m, i32 %evl)
+  ret <4 x i32> %v
+}
+
+define <4 x i32> @vssub_vx_v4i32(<4 x i32> %va, i32 %b, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_v4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <4 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <4 x i32> %elt.head, <4 x i32> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i32> @llvm.vp.ssub.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 %evl)
+  ret <4 x i32> %v
+}
+
+define <4 x i32> @vssub_vx_v4i32_unmasked(<4 x i32> %va, i32 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_v4i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <4 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <4 x i32> %elt.head, <4 x i32> poison, <4 x i32> zeroinitializer
+  %head = insertelement <4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i32> @llvm.vp.ssub.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 %evl)
+  ret <4 x i32> %v
+}
+
+define <4 x i32> @vssub_vi_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_v4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <4 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <4 x i32> %elt.head, <4 x i32> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i32> @llvm.vp.ssub.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 %evl)
+  ret <4 x i32> %v
+}
+
+define <4 x i32> @vssub_vi_v4i32_unmasked(<4 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_v4i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <4 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <4 x i32> %elt.head, <4 x i32> poison, <4 x i32> zeroinitializer
+  %head = insertelement <4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i32> @llvm.vp.ssub.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 %evl)
+  ret <4 x i32> %v
+}
+
+declare <8 x i32> @llvm.vp.ssub.sat.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32)
+
+define <8 x i32> @vssub_vv_v8i32(<8 x i32> %va, <8 x i32> %b, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_v8i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v10, v0.t
+; CHECK-NEXT:    ret
+  %v = call <8 x i32> @llvm.vp.ssub.sat.v8i32(<8 x i32> %va, <8 x i32> %b, <8 x i1> %m, i32 %evl)
+  ret <8 x i32> %v
+}
+
+define <8 x i32> @vssub_vv_v8i32_unmasked(<8 x i32> %va, <8 x i32> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_v8i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v10
+; CHECK-NEXT:    ret
+  %head = insertelement <8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i32> @llvm.vp.ssub.sat.v8i32(<8 x i32> %va, <8 x i32> %b, <8 x i1> %m, i32 %evl)
+  ret <8 x i32> %v
+}
+
+define <8 x i32> @vssub_vx_v8i32(<8 x i32> %va, i32 %b, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_v8i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <8 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <8 x i32> %elt.head, <8 x i32> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i32> @llvm.vp.ssub.sat.v8i32(<8 x i32> %va, <8 x i32> %vb, <8 x i1> %m, i32 %evl)
+  ret <8 x i32> %v
+}
+
+define <8 x i32> @vssub_vx_v8i32_unmasked(<8 x i32> %va, i32 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_v8i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <8 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <8 x i32> %elt.head, <8 x i32> poison, <8 x i32> zeroinitializer
+  %head = insertelement <8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i32> @llvm.vp.ssub.sat.v8i32(<8 x i32> %va, <8 x i32> %vb, <8 x i1> %m, i32 %evl)
+  ret <8 x i32> %v
+}
+
+define <8 x i32> @vssub_vi_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_v8i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <8 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <8 x i32> %elt.head, <8 x i32> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i32> @llvm.vp.ssub.sat.v8i32(<8 x i32> %va, <8 x i32> %vb, <8 x i1> %m, i32 %evl)
+  ret <8 x i32> %v
+}
+
+define <8 x i32> @vssub_vi_v8i32_unmasked(<8 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_v8i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <8 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <8 x i32> %elt.head, <8 x i32> poison, <8 x i32> zeroinitializer
+  %head = insertelement <8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i32> @llvm.vp.ssub.sat.v8i32(<8 x i32> %va, <8 x i32> %vb, <8 x i1> %m, i32 %evl)
+  ret <8 x i32> %v
+}
+
+declare <16 x i32> @llvm.vp.ssub.sat.v16i32(<16 x i32>, <16 x i32>, <16 x i1>, i32)
+
+define <16 x i32> @vssub_vv_v16i32(<16 x i32> %va, <16 x i32> %b, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_v16i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v12, v0.t
+; CHECK-NEXT:    ret
+  %v = call <16 x i32> @llvm.vp.ssub.sat.v16i32(<16 x i32> %va, <16 x i32> %b, <16 x i1> %m, i32 %evl)
+  ret <16 x i32> %v
+}
+
+define <16 x i32> @vssub_vv_v16i32_unmasked(<16 x i32> %va, <16 x i32> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_v16i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v12
+; CHECK-NEXT:    ret
+  %head = insertelement <16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i32> @llvm.vp.ssub.sat.v16i32(<16 x i32> %va, <16 x i32> %b, <16 x i1> %m, i32 %evl)
+  ret <16 x i32> %v
+}
+
+define <16 x i32> @vssub_vx_v16i32(<16 x i32> %va, i32 %b, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_v16i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <16 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <16 x i32> %elt.head, <16 x i32> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i32> @llvm.vp.ssub.sat.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i1> %m, i32 %evl)
+  ret <16 x i32> %v
+}
+
+define <16 x i32> @vssub_vx_v16i32_unmasked(<16 x i32> %va, i32 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_v16i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <16 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <16 x i32> %elt.head, <16 x i32> poison, <16 x i32> zeroinitializer
+  %head = insertelement <16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i32> @llvm.vp.ssub.sat.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i1> %m, i32 %evl)
+  ret <16 x i32> %v
+}
+
+define <16 x i32> @vssub_vi_v16i32(<16 x i32> %va, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_v16i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <16 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <16 x i32> %elt.head, <16 x i32> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i32> @llvm.vp.ssub.sat.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i1> %m, i32 %evl)
+  ret <16 x i32> %v
+}
+
+define <16 x i32> @vssub_vi_v16i32_unmasked(<16 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_v16i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <16 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <16 x i32> %elt.head, <16 x i32> poison, <16 x i32> zeroinitializer
+  %head = insertelement <16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i32> @llvm.vp.ssub.sat.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i1> %m, i32 %evl)
+  ret <16 x i32> %v
+}
+
+declare <2 x i64> @llvm.vp.ssub.sat.v2i64(<2 x i64>, <2 x i64>, <2 x i1>, i32)
+
+define <2 x i64> @vssub_vv_v2i64(<2 x i64> %va, <2 x i64> %b, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_v2i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <2 x i64> @llvm.vp.ssub.sat.v2i64(<2 x i64> %va, <2 x i64> %b, <2 x i1> %m, i32 %evl)
+  ret <2 x i64> %v
+}
+
+define <2 x i64> @vssub_vv_v2i64_unmasked(<2 x i64> %va, <2 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_v2i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i64> @llvm.vp.ssub.sat.v2i64(<2 x i64> %va, <2 x i64> %b, <2 x i1> %m, i32 %evl)
+  ret <2 x i64> %v
+}
+
+define <2 x i64> @vssub_vx_v2i64(<2 x i64> %va, i64 %b, <2 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vssub_vx_v2i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
+; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vssub.vv v8, v8, v9, v0.t
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vssub_vx_v2i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; RV64-NEXT:    vssub.vx v8, v8, a0, v0.t
+; RV64-NEXT:    ret
+  %elt.head = insertelement <2 x i64> poison, i64 %b, i32 0
+  %vb = shufflevector <2 x i64> %elt.head, <2 x i64> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i64> @llvm.vp.ssub.sat.v2i64(<2 x i64> %va, <2 x i64> %vb, <2 x i1> %m, i32 %evl)
+  ret <2 x i64> %v
+}
+
+define <2 x i64> @vssub_vx_v2i64_unmasked(<2 x i64> %va, i64 %b, i32 zeroext %evl) {
+; RV32-LABEL: vssub_vx_v2i64_unmasked:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
+; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vssub.vv v8, v8, v9
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vssub_vx_v2i64_unmasked:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; RV64-NEXT:    vssub.vx v8, v8, a0
+; RV64-NEXT:    ret
+  %elt.head = insertelement <2 x i64> poison, i64 %b, i32 0
+  %vb = shufflevector <2 x i64> %elt.head, <2 x i64> poison, <2 x i32> zeroinitializer
+  %head = insertelement <2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i64> @llvm.vp.ssub.sat.v2i64(<2 x i64> %va, <2 x i64> %vb, <2 x i1> %m, i32 %evl)
+  ret <2 x i64> %v
+}
+
+define <2 x i64> @vssub_vi_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_v2i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <2 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <2 x i64> %elt.head, <2 x i64> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i64> @llvm.vp.ssub.sat.v2i64(<2 x i64> %va, <2 x i64> %vb, <2 x i1> %m, i32 %evl)
+  ret <2 x i64> %v
+}
+
+define <2 x i64> @vssub_vi_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_v2i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <2 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <2 x i64> %elt.head, <2 x i64> poison, <2 x i32> zeroinitializer
+  %head = insertelement <2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i64> @llvm.vp.ssub.sat.v2i64(<2 x i64> %va, <2 x i64> %vb, <2 x i1> %m, i32 %evl)
+  ret <2 x i64> %v
+}
+
+declare <4 x i64> @llvm.vp.ssub.sat.v4i64(<4 x i64>, <4 x i64>, <4 x i1>, i32)
+
+define <4 x i64> @vssub_vv_v4i64(<4 x i64> %va, <4 x i64> %b, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_v4i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v10, v0.t
+; CHECK-NEXT:    ret
+  %v = call <4 x i64> @llvm.vp.ssub.sat.v4i64(<4 x i64> %va, <4 x i64> %b, <4 x i1> %m, i32 %evl)
+  ret <4 x i64> %v
+}
+
+define <4 x i64> @vssub_vv_v4i64_unmasked(<4 x i64> %va, <4 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_v4i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v10
+; CHECK-NEXT:    ret
+  %head = insertelement <4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i64> @llvm.vp.ssub.sat.v4i64(<4 x i64> %va, <4 x i64> %b, <4 x i1> %m, i32 %evl)
+  ret <4 x i64> %v
+}
+
+define <4 x i64> @vssub_vx_v4i64(<4 x i64> %va, i64 %b, <4 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vssub_vx_v4i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
+; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vssub.vv v8, v8, v10, v0.t
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vssub_vx_v4i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; RV64-NEXT:    vssub.vx v8, v8, a0, v0.t
+; RV64-NEXT:    ret
+  %elt.head = insertelement <4 x i64> poison, i64 %b, i32 0
+  %vb = shufflevector <4 x i64> %elt.head, <4 x i64> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i64> @llvm.vp.ssub.sat.v4i64(<4 x i64> %va, <4 x i64> %vb, <4 x i1> %m, i32 %evl)
+  ret <4 x i64> %v
+}
+
+define <4 x i64> @vssub_vx_v4i64_unmasked(<4 x i64> %va, i64 %b, i32 zeroext %evl) {
+; RV32-LABEL: vssub_vx_v4i64_unmasked:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
+; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vssub.vv v8, v8, v10
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vssub_vx_v4i64_unmasked:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; RV64-NEXT:    vssub.vx v8, v8, a0
+; RV64-NEXT:    ret
+  %elt.head = insertelement <4 x i64> poison, i64 %b, i32 0
+  %vb = shufflevector <4 x i64> %elt.head, <4 x i64> poison, <4 x i32> zeroinitializer
+  %head = insertelement <4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i64> @llvm.vp.ssub.sat.v4i64(<4 x i64> %va, <4 x i64> %vb, <4 x i1> %m, i32 %evl)
+  ret <4 x i64> %v
+}
+
+define <4 x i64> @vssub_vi_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_v4i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <4 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <4 x i64> %elt.head, <4 x i64> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i64> @llvm.vp.ssub.sat.v4i64(<4 x i64> %va, <4 x i64> %vb, <4 x i1> %m, i32 %evl)
+  ret <4 x i64> %v
+}
+
+define <4 x i64> @vssub_vi_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_v4i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <4 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <4 x i64> %elt.head, <4 x i64> poison, <4 x i32> zeroinitializer
+  %head = insertelement <4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i64> @llvm.vp.ssub.sat.v4i64(<4 x i64> %va, <4 x i64> %vb, <4 x i1> %m, i32 %evl)
+  ret <4 x i64> %v
+}
+
+declare <8 x i64> @llvm.vp.ssub.sat.v8i64(<8 x i64>, <8 x i64>, <8 x i1>, i32)
+
+define <8 x i64> @vssub_vv_v8i64(<8 x i64> %va, <8 x i64> %b, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_v8i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v12, v0.t
+; CHECK-NEXT:    ret
+  %v = call <8 x i64> @llvm.vp.ssub.sat.v8i64(<8 x i64> %va, <8 x i64> %b, <8 x i1> %m, i32 %evl)
+  ret <8 x i64> %v
+}
+
+define <8 x i64> @vssub_vv_v8i64_unmasked(<8 x i64> %va, <8 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_v8i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v12
+; CHECK-NEXT:    ret
+  %head = insertelement <8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i64> @llvm.vp.ssub.sat.v8i64(<8 x i64> %va, <8 x i64> %b, <8 x i1> %m, i32 %evl)
+  ret <8 x i64> %v
+}
+
+define <8 x i64> @vssub_vx_v8i64(<8 x i64> %va, i64 %b, <8 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vssub_vx_v8i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
+; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vssub.vv v8, v8, v12, v0.t
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vssub_vx_v8i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; RV64-NEXT:    vssub.vx v8, v8, a0, v0.t
+; RV64-NEXT:    ret
+  %elt.head = insertelement <8 x i64> poison, i64 %b, i32 0
+  %vb = shufflevector <8 x i64> %elt.head, <8 x i64> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i64> @llvm.vp.ssub.sat.v8i64(<8 x i64> %va, <8 x i64> %vb, <8 x i1> %m, i32 %evl)
+  ret <8 x i64> %v
+}
+
+define <8 x i64> @vssub_vx_v8i64_unmasked(<8 x i64> %va, i64 %b, i32 zeroext %evl) {
+; RV32-LABEL: vssub_vx_v8i64_unmasked:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
+; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vssub.vv v8, v8, v12
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vssub_vx_v8i64_unmasked:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; RV64-NEXT:    vssub.vx v8, v8, a0
+; RV64-NEXT:    ret
+  %elt.head = insertelement <8 x i64> poison, i64 %b, i32 0
+  %vb = shufflevector <8 x i64> %elt.head, <8 x i64> poison, <8 x i32> zeroinitializer
+  %head = insertelement <8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i64> @llvm.vp.ssub.sat.v8i64(<8 x i64> %va, <8 x i64> %vb, <8 x i1> %m, i32 %evl)
+  ret <8 x i64> %v
+}
+
+define <8 x i64> @vssub_vi_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_v8i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <8 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <8 x i64> %elt.head, <8 x i64> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i64> @llvm.vp.ssub.sat.v8i64(<8 x i64> %va, <8 x i64> %vb, <8 x i1> %m, i32 %evl)
+  ret <8 x i64> %v
+}
+
+define <8 x i64> @vssub_vi_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_v8i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <8 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <8 x i64> %elt.head, <8 x i64> poison, <8 x i32> zeroinitializer
+  %head = insertelement <8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i64> @llvm.vp.ssub.sat.v8i64(<8 x i64> %va, <8 x i64> %vb, <8 x i1> %m, i32 %evl)
+  ret <8 x i64> %v
+}
+
+declare <16 x i64> @llvm.vp.ssub.sat.v16i64(<16 x i64>, <16 x i64>, <16 x i1>, i32)
+
+define <16 x i64> @vssub_vv_v16i64(<16 x i64> %va, <16 x i64> %b, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_v16i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    ret
+  %v = call <16 x i64> @llvm.vp.ssub.sat.v16i64(<16 x i64> %va, <16 x i64> %b, <16 x i1> %m, i32 %evl)
+  ret <16 x i64> %v
+}
+
+define <16 x i64> @vssub_vv_v16i64_unmasked(<16 x i64> %va, <16 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_v16i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v16
+; CHECK-NEXT:    ret
+  %head = insertelement <16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i64> @llvm.vp.ssub.sat.v16i64(<16 x i64> %va, <16 x i64> %b, <16 x i1> %m, i32 %evl)
+  ret <16 x i64> %v
+}
+
+define <16 x i64> @vssub_vx_v16i64(<16 x i64> %va, i64 %b, <16 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vssub_vx_v16i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
+; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vssub.vv v8, v8, v16, v0.t
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vssub_vx_v16i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vssub.vx v8, v8, a0, v0.t
+; RV64-NEXT:    ret
+  %elt.head = insertelement <16 x i64> poison, i64 %b, i32 0
+  %vb = shufflevector <16 x i64> %elt.head, <16 x i64> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i64> @llvm.vp.ssub.sat.v16i64(<16 x i64> %va, <16 x i64> %vb, <16 x i1> %m, i32 %evl)
+  ret <16 x i64> %v
+}
+
+define <16 x i64> @vssub_vx_v16i64_unmasked(<16 x i64> %va, i64 %b, i32 zeroext %evl) {
+; RV32-LABEL: vssub_vx_v16i64_unmasked:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
+; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vssub.vv v8, v8, v16
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vssub_vx_v16i64_unmasked:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vssub.vx v8, v8, a0
+; RV64-NEXT:    ret
+  %elt.head = insertelement <16 x i64> poison, i64 %b, i32 0
+  %vb = shufflevector <16 x i64> %elt.head, <16 x i64> poison, <16 x i32> zeroinitializer
+  %head = insertelement <16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i64> @llvm.vp.ssub.sat.v16i64(<16 x i64> %va, <16 x i64> %vb, <16 x i1> %m, i32 %evl)
+  ret <16 x i64> %v
+}
+
+define <16 x i64> @vssub_vi_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_v16i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <16 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <16 x i64> %elt.head, <16 x i64> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i64> @llvm.vp.ssub.sat.v16i64(<16 x i64> %va, <16 x i64> %vb, <16 x i1> %m, i32 %evl)
+  ret <16 x i64> %v
+}
+
+define <16 x i64> @vssub_vi_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_v16i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <16 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <16 x i64> %elt.head, <16 x i64> poison, <16 x i32> zeroinitializer
+  %head = insertelement <16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i64> @llvm.vp.ssub.sat.v16i64(<16 x i64> %va, <16 x i64> %vb, <16 x i1> %m, i32 %evl)
+  ret <16 x i64> %v
+}
+
+; Test that split-legalization works as expected.
+
+declare <32 x i64> @llvm.vp.ssub.sat.v32i64(<32 x i64>, <32 x i64>, <32 x i1>, i32)
+
+define <32 x i64> @vssub_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vssub_vx_v32i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT:    vslidedown.vi v1, v0, 2
+; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT:    li a2, 16
+; RV32-NEXT:    vmv.v.i v24, -1
+; RV32-NEXT:    mv a1, a0
+; RV32-NEXT:    bltu a0, a2, .LBB108_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    li a1, 16
+; RV32-NEXT:  .LBB108_2:
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vssub.vv v8, v8, v24, v0.t
+; RV32-NEXT:    addi a1, a0, -16
+; RV32-NEXT:    sltu a0, a0, a1
+; RV32-NEXT:    addi a0, a0, -1
+; RV32-NEXT:    and a0, a0, a1
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vmv1r.v v0, v1
+; RV32-NEXT:    vssub.vv v16, v16, v24, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vssub_vx_v32i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV64-NEXT:    li a2, 16
+; RV64-NEXT:    vslidedown.vi v24, v0, 2
+; RV64-NEXT:    mv a1, a0
+; RV64-NEXT:    bltu a0, a2, .LBB108_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    li a1, 16
+; RV64-NEXT:  .LBB108_2:
+; RV64-NEXT:    li a2, -1
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vssub.vx v8, v8, a2, v0.t
+; RV64-NEXT:    addi a1, a0, -16
+; RV64-NEXT:    sltu a0, a0, a1
+; RV64-NEXT:    addi a0, a0, -1
+; RV64-NEXT:    and a0, a0, a1
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT:    vmv1r.v v0, v24
+; RV64-NEXT:    vssub.vx v16, v16, a2, v0.t
+; RV64-NEXT:    ret
+  %elt.head = insertelement <32 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <32 x i64> %elt.head, <32 x i64> poison, <32 x i32> zeroinitializer
+  %v = call <32 x i64> @llvm.vp.ssub.sat.v32i64(<32 x i64> %va, <32 x i64> %vb, <32 x i1> %m, i32 %evl)
+  ret <32 x i64> %v
+}
+
+define <32 x i64> @vssub_vi_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
+; RV32-LABEL: vssub_vi_v32i64_unmasked:
+; RV32:       # %bb.0:
+; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT:    li a2, 16
+; RV32-NEXT:    vmv.v.i v24, -1
+; RV32-NEXT:    mv a1, a0
+; RV32-NEXT:    bltu a0, a2, .LBB109_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    li a1, 16
+; RV32-NEXT:  .LBB109_2:
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vssub.vv v8, v8, v24
+; RV32-NEXT:    addi a1, a0, -16
+; RV32-NEXT:    sltu a0, a0, a1
+; RV32-NEXT:    addi a0, a0, -1
+; RV32-NEXT:    and a0, a0, a1
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vssub.vv v16, v16, v24
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vssub_vi_v32i64_unmasked:
+; RV64:       # %bb.0:
+; RV64-NEXT:    li a2, 16
+; RV64-NEXT:    mv a1, a0
+; RV64-NEXT:    bltu a0, a2, .LBB109_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    li a1, 16
+; RV64-NEXT:  .LBB109_2:
+; RV64-NEXT:    li a2, -1
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vssub.vx v8, v8, a2
+; RV64-NEXT:    addi a1, a0, -16
+; RV64-NEXT:    sltu a0, a0, a1
+; RV64-NEXT:    addi a0, a0, -1
+; RV64-NEXT:    and a0, a0, a1
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT:    vssub.vx v16, v16, a2
+; RV64-NEXT:    ret
+  %elt.head = insertelement <32 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <32 x i64> %elt.head, <32 x i64> poison, <32 x i32> zeroinitializer
+  %head = insertelement <32 x i1> poison, i1 true, i32 0
+  %m = shufflevector <32 x i1> %head, <32 x i1> poison, <32 x i32> zeroinitializer
+  %v = call <32 x i64> @llvm.vp.ssub.sat.v32i64(<32 x i64> %va, <32 x i64> %vb, <32 x i1> %m, i32 %evl)
+  ret <32 x i64> %v
+}
+
+; FIXME: We don't match vssub.vi on RV32.
+
+define <32 x i64> @vssub_vx_v32i64_evl12(<32 x i64> %va, <32 x i1> %m) {
+; RV32-LABEL: vssub_vx_v32i64_evl12:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT:    vslidedown.vi v1, v0, 2
+; RV32-NEXT:    li a0, 32
+; RV32-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.i v24, -1
+; RV32-NEXT:    vsetivli zero, 12, e64, m8, ta, ma
+; RV32-NEXT:    vssub.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vsetivli zero, 0, e64, m8, ta, ma
+; RV32-NEXT:    vmv1r.v v0, v1
+; RV32-NEXT:    vssub.vv v16, v16, v24, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vssub_vx_v32i64_evl12:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV64-NEXT:    vslidedown.vi v24, v0, 2
+; RV64-NEXT:    li a0, -1
+; RV64-NEXT:    vsetivli zero, 12, e64, m8, ta, ma
+; RV64-NEXT:    vssub.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vsetivli zero, 0, e64, m8, ta, ma
+; RV64-NEXT:    vmv1r.v v0, v24
+; RV64-NEXT:    vssub.vx v16, v16, a0, v0.t
+; RV64-NEXT:    ret
+  %elt.head = insertelement <32 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <32 x i64> %elt.head, <32 x i64> poison, <32 x i32> zeroinitializer
+  %v = call <32 x i64> @llvm.vp.ssub.sat.v32i64(<32 x i64> %va, <32 x i64> %vb, <32 x i1> %m, i32 12)
+  ret <32 x i64> %v
+}
+
+define <32 x i64> @vssub_vx_v32i64_evl27(<32 x i64> %va, <32 x i1> %m) {
+; RV32-LABEL: vssub_vx_v32i64_evl27:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT:    vslidedown.vi v1, v0, 2
+; RV32-NEXT:    li a0, 32
+; RV32-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.i v24, -1
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vssub.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vsetivli zero, 11, e64, m8, ta, ma
+; RV32-NEXT:    vmv1r.v v0, v1
+; RV32-NEXT:    vssub.vv v16, v16, v24, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vssub_vx_v32i64_evl27:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV64-NEXT:    vslidedown.vi v24, v0, 2
+; RV64-NEXT:    li a0, -1
+; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV64-NEXT:    vssub.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vsetivli zero, 11, e64, m8, ta, ma
+; RV64-NEXT:    vmv1r.v v0, v24
+; RV64-NEXT:    vssub.vx v16, v16, a0, v0.t
+; RV64-NEXT:    ret
+  %elt.head = insertelement <32 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <32 x i64> %elt.head, <32 x i64> poison, <32 x i32> zeroinitializer
+  %v = call <32 x i64> @llvm.vp.ssub.sat.v32i64(<32 x i64> %va, <32 x i64> %vb, <32 x i1> %m, i32 27)
+  ret <32 x i64> %v
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll
new file mode 100644
index 0000000..6ea9758
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll
@@ -0,0 +1,1740 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s --check-prefixes=CHECK,RV64
+
+declare <8 x i7> @llvm.vp.usub.sat.v8i7(<8 x i7>, <8 x i7>, <8 x i1>, i32)
+
+define <8 x i7> @vssubu_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_v8i7:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, 127
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT:    vand.vx v9, v9, a1
+; CHECK-NEXT:    vand.vx v8, v8, a1
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <8 x i7> @llvm.vp.usub.sat.v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 %evl)
+  ret <8 x i7> %v
+}
+
+declare <2 x i8> @llvm.vp.usub.sat.v2i8(<2 x i8>, <2 x i8>, <2 x i1>, i32)
+
+define <2 x i8> @vssubu_vv_v2i8(<2 x i8> %va, <2 x i8> %b, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_v2i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <2 x i8> @llvm.vp.usub.sat.v2i8(<2 x i8> %va, <2 x i8> %b, <2 x i1> %m, i32 %evl)
+  ret <2 x i8> %v
+}
+
+define <2 x i8> @vssubu_vv_v2i8_unmasked(<2 x i8> %va, <2 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_v2i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i8> @llvm.vp.usub.sat.v2i8(<2 x i8> %va, <2 x i8> %b, <2 x i1> %m, i32 %evl)
+  ret <2 x i8> %v
+}
+
+define <2 x i8> @vssubu_vx_v2i8(<2 x i8> %va, i8 %b, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_v2i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <2 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <2 x i8> %elt.head, <2 x i8> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i8> @llvm.vp.usub.sat.v2i8(<2 x i8> %va, <2 x i8> %vb, <2 x i1> %m, i32 %evl)
+  ret <2 x i8> %v
+}
+
+define <2 x i8> @vssubu_vx_v2i8_unmasked(<2 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_v2i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <2 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <2 x i8> %elt.head, <2 x i8> poison, <2 x i32> zeroinitializer
+  %head = insertelement <2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i8> @llvm.vp.usub.sat.v2i8(<2 x i8> %va, <2 x i8> %vb, <2 x i1> %m, i32 %evl)
+  ret <2 x i8> %v
+}
+
+define <2 x i8> @vssubu_vi_v2i8(<2 x i8> %va, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_v2i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <2 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <2 x i8> %elt.head, <2 x i8> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i8> @llvm.vp.usub.sat.v2i8(<2 x i8> %va, <2 x i8> %vb, <2 x i1> %m, i32 %evl)
+  ret <2 x i8> %v
+}
+
+define <2 x i8> @vssubu_vi_v2i8_unmasked(<2 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_v2i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <2 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <2 x i8> %elt.head, <2 x i8> poison, <2 x i32> zeroinitializer
+  %head = insertelement <2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i8> @llvm.vp.usub.sat.v2i8(<2 x i8> %va, <2 x i8> %vb, <2 x i1> %m, i32 %evl)
+  ret <2 x i8> %v
+}
+
+declare <4 x i8> @llvm.vp.usub.sat.v4i8(<4 x i8>, <4 x i8>, <4 x i1>, i32)
+
+define <4 x i8> @vssubu_vv_v4i8(<4 x i8> %va, <4 x i8> %b, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_v4i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <4 x i8> @llvm.vp.usub.sat.v4i8(<4 x i8> %va, <4 x i8> %b, <4 x i1> %m, i32 %evl)
+  ret <4 x i8> %v
+}
+
+define <4 x i8> @vssubu_vv_v4i8_unmasked(<4 x i8> %va, <4 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_v4i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i8> @llvm.vp.usub.sat.v4i8(<4 x i8> %va, <4 x i8> %b, <4 x i1> %m, i32 %evl)
+  ret <4 x i8> %v
+}
+
+define <4 x i8> @vssubu_vx_v4i8(<4 x i8> %va, i8 %b, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_v4i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i8> @llvm.vp.usub.sat.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl)
+  ret <4 x i8> %v
+}
+
+define <4 x i8> @vssubu_vx_v4i8_commute(<4 x i8> %va, i8 %b, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_v4i8_commute:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v9, v8, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i8> @llvm.vp.usub.sat.v4i8(<4 x i8> %vb, <4 x i8> %va, <4 x i1> %m, i32 %evl)
+  ret <4 x i8> %v
+}
+
+define <4 x i8> @vssubu_vx_v4i8_unmasked(<4 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_v4i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer
+  %head = insertelement <4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i8> @llvm.vp.usub.sat.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl)
+  ret <4 x i8> %v
+}
+
+define <4 x i8> @vssubu_vi_v4i8(<4 x i8> %va, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_v4i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <4 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i8> @llvm.vp.usub.sat.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl)
+  ret <4 x i8> %v
+}
+
+define <4 x i8> @vssubu_vi_v4i8_unmasked(<4 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_v4i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <4 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer
+  %head = insertelement <4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i8> @llvm.vp.usub.sat.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl)
+  ret <4 x i8> %v
+}
+
+declare <5 x i8> @llvm.vp.usub.sat.v5i8(<5 x i8>, <5 x i8>, <5 x i1>, i32)
+
+define <5 x i8> @vssubu_vv_v5i8(<5 x i8> %va, <5 x i8> %b, <5 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_v5i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <5 x i8> @llvm.vp.usub.sat.v5i8(<5 x i8> %va, <5 x i8> %b, <5 x i1> %m, i32 %evl)
+  ret <5 x i8> %v
+}
+
+define <5 x i8> @vssubu_vv_v5i8_unmasked(<5 x i8> %va, <5 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_v5i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <5 x i1> poison, i1 true, i32 0
+  %m = shufflevector <5 x i1> %head, <5 x i1> poison, <5 x i32> zeroinitializer
+  %v = call <5 x i8> @llvm.vp.usub.sat.v5i8(<5 x i8> %va, <5 x i8> %b, <5 x i1> %m, i32 %evl)
+  ret <5 x i8> %v
+}
+
+define <5 x i8> @vssubu_vx_v5i8(<5 x i8> %va, i8 %b, <5 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_v5i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <5 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <5 x i8> %elt.head, <5 x i8> poison, <5 x i32> zeroinitializer
+  %v = call <5 x i8> @llvm.vp.usub.sat.v5i8(<5 x i8> %va, <5 x i8> %vb, <5 x i1> %m, i32 %evl)
+  ret <5 x i8> %v
+}
+
+define <5 x i8> @vssubu_vx_v5i8_unmasked(<5 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_v5i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <5 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <5 x i8> %elt.head, <5 x i8> poison, <5 x i32> zeroinitializer
+  %head = insertelement <5 x i1> poison, i1 true, i32 0
+  %m = shufflevector <5 x i1> %head, <5 x i1> poison, <5 x i32> zeroinitializer
+  %v = call <5 x i8> @llvm.vp.usub.sat.v5i8(<5 x i8> %va, <5 x i8> %vb, <5 x i1> %m, i32 %evl)
+  ret <5 x i8> %v
+}
+
+define <5 x i8> @vssubu_vi_v5i8(<5 x i8> %va, <5 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_v5i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <5 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <5 x i8> %elt.head, <5 x i8> poison, <5 x i32> zeroinitializer
+  %v = call <5 x i8> @llvm.vp.usub.sat.v5i8(<5 x i8> %va, <5 x i8> %vb, <5 x i1> %m, i32 %evl)
+  ret <5 x i8> %v
+}
+
+define <5 x i8> @vssubu_vi_v5i8_unmasked(<5 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_v5i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <5 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <5 x i8> %elt.head, <5 x i8> poison, <5 x i32> zeroinitializer
+  %head = insertelement <5 x i1> poison, i1 true, i32 0
+  %m = shufflevector <5 x i1> %head, <5 x i1> poison, <5 x i32> zeroinitializer
+  %v = call <5 x i8> @llvm.vp.usub.sat.v5i8(<5 x i8> %va, <5 x i8> %vb, <5 x i1> %m, i32 %evl)
+  ret <5 x i8> %v
+}
+
+declare <8 x i8> @llvm.vp.usub.sat.v8i8(<8 x i8>, <8 x i8>, <8 x i1>, i32)
+
+define <8 x i8> @vssubu_vv_v8i8(<8 x i8> %va, <8 x i8> %b, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_v8i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <8 x i8> @llvm.vp.usub.sat.v8i8(<8 x i8> %va, <8 x i8> %b, <8 x i1> %m, i32 %evl)
+  ret <8 x i8> %v
+}
+
+define <8 x i8> @vssubu_vv_v8i8_unmasked(<8 x i8> %va, <8 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_v8i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i8> @llvm.vp.usub.sat.v8i8(<8 x i8> %va, <8 x i8> %b, <8 x i1> %m, i32 %evl)
+  ret <8 x i8> %v
+}
+
+define <8 x i8> @vssubu_vx_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_v8i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i8> @llvm.vp.usub.sat.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl)
+  ret <8 x i8> %v
+}
+
+define <8 x i8> @vssubu_vx_v8i8_unmasked(<8 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_v8i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer
+  %head = insertelement <8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i8> @llvm.vp.usub.sat.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl)
+  ret <8 x i8> %v
+}
+
+define <8 x i8> @vssubu_vi_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_v8i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <8 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i8> @llvm.vp.usub.sat.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl)
+  ret <8 x i8> %v
+}
+
+define <8 x i8> @vssubu_vi_v8i8_unmasked(<8 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_v8i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <8 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer
+  %head = insertelement <8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i8> @llvm.vp.usub.sat.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl)
+  ret <8 x i8> %v
+}
+
+declare <16 x i8> @llvm.vp.usub.sat.v16i8(<16 x i8>, <16 x i8>, <16 x i1>, i32)
+
+define <16 x i8> @vssubu_vv_v16i8(<16 x i8> %va, <16 x i8> %b, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_v16i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <16 x i8> @llvm.vp.usub.sat.v16i8(<16 x i8> %va, <16 x i8> %b, <16 x i1> %m, i32 %evl)
+  ret <16 x i8> %v
+}
+
+define <16 x i8> @vssubu_vv_v16i8_unmasked(<16 x i8> %va, <16 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_v16i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i8> @llvm.vp.usub.sat.v16i8(<16 x i8> %va, <16 x i8> %b, <16 x i1> %m, i32 %evl)
+  ret <16 x i8> %v
+}
+
+define <16 x i8> @vssubu_vx_v16i8(<16 x i8> %va, i8 %b, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_v16i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <16 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <16 x i8> %elt.head, <16 x i8> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i8> @llvm.vp.usub.sat.v16i8(<16 x i8> %va, <16 x i8> %vb, <16 x i1> %m, i32 %evl)
+  ret <16 x i8> %v
+}
+
+define <16 x i8> @vssubu_vx_v16i8_unmasked(<16 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_v16i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <16 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <16 x i8> %elt.head, <16 x i8> poison, <16 x i32> zeroinitializer
+  %head = insertelement <16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i8> @llvm.vp.usub.sat.v16i8(<16 x i8> %va, <16 x i8> %vb, <16 x i1> %m, i32 %evl)
+  ret <16 x i8> %v
+}
+
+define <16 x i8> @vssubu_vi_v16i8(<16 x i8> %va, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_v16i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <16 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <16 x i8> %elt.head, <16 x i8> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i8> @llvm.vp.usub.sat.v16i8(<16 x i8> %va, <16 x i8> %vb, <16 x i1> %m, i32 %evl)
+  ret <16 x i8> %v
+}
+
+define <16 x i8> @vssubu_vi_v16i8_unmasked(<16 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_v16i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <16 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <16 x i8> %elt.head, <16 x i8> poison, <16 x i32> zeroinitializer
+  %head = insertelement <16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i8> @llvm.vp.usub.sat.v16i8(<16 x i8> %va, <16 x i8> %vb, <16 x i1> %m, i32 %evl)
+  ret <16 x i8> %v
+}
+
+declare <256 x i8> @llvm.vp.usub.sat.v258i8(<256 x i8>, <256 x i8>, <256 x i1>, i32)
+
+define <256 x i8> @vssubu_vi_v258i8(<256 x i8> %va, <256 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_v258i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v24, v0
+; CHECK-NEXT:    li a2, 128
+; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
+; CHECK-NEXT:    vlm.v v0, (a0)
+; CHECK-NEXT:    addi a0, a1, -128
+; CHECK-NEXT:    sltu a3, a1, a0
+; CHECK-NEXT:    addi a3, a3, -1
+; CHECK-NEXT:    and a3, a3, a0
+; CHECK-NEXT:    li a0, -1
+; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
+; CHECK-NEXT:    vssubu.vx v16, v16, a0, v0.t
+; CHECK-NEXT:    bltu a1, a2, .LBB32_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    li a1, 128
+; CHECK-NEXT:  .LBB32_2:
+; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vssubu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <256 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <256 x i8> %elt.head, <256 x i8> poison, <256 x i32> zeroinitializer
+  %v = call <256 x i8> @llvm.vp.usub.sat.v258i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> %m, i32 %evl)
+  ret <256 x i8> %v
+}
+
+define <256 x i8> @vssubu_vi_v258i8_unmasked(<256 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_v258i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a2, 128
+; CHECK-NEXT:    mv a1, a0
+; CHECK-NEXT:    bltu a0, a2, .LBB33_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    li a1, 128
+; CHECK-NEXT:  .LBB33_2:
+; CHECK-NEXT:    li a2, -1
+; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a2
+; CHECK-NEXT:    addi a1, a0, -128
+; CHECK-NEXT:    sltu a0, a0, a1
+; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT:    vssubu.vx v16, v16, a2
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <256 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <256 x i8> %elt.head, <256 x i8> poison, <256 x i32> zeroinitializer
+  %head = insertelement <256 x i1> poison, i1 true, i32 0
+  %m = shufflevector <256 x i1> %head, <256 x i1> poison, <256 x i32> zeroinitializer
+  %v = call <256 x i8> @llvm.vp.usub.sat.v258i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> %m, i32 %evl)
+  ret <256 x i8> %v
+}
+
+; Test splitting when the %evl is a known constant.
+
+define <256 x i8> @vssubu_vi_v258i8_evl129(<256 x i8> %va, <256 x i1> %m) {
+; CHECK-LABEL: vssubu_vi_v258i8_evl129:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, 128
+; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT:    vlm.v v24, (a0)
+; CHECK-NEXT:    li a0, -1
+; CHECK-NEXT:    vssubu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vsetivli zero, 1, e8, m8, ta, ma
+; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vssubu.vx v16, v16, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <256 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <256 x i8> %elt.head, <256 x i8> poison, <256 x i32> zeroinitializer
+  %v = call <256 x i8> @llvm.vp.usub.sat.v258i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> %m, i32 129)
+  ret <256 x i8> %v
+}
+
+; FIXME: The upper half is doing nothing.
+
+define <256 x i8> @vssubu_vi_v258i8_evl128(<256 x i8> %va, <256 x i1> %m) {
+; CHECK-LABEL: vssubu_vi_v258i8_evl128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, 128
+; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT:    vlm.v v24, (a0)
+; CHECK-NEXT:    li a0, -1
+; CHECK-NEXT:    vssubu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vsetivli zero, 0, e8, m8, ta, ma
+; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vssubu.vx v16, v16, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <256 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <256 x i8> %elt.head, <256 x i8> poison, <256 x i32> zeroinitializer
+  %v = call <256 x i8> @llvm.vp.usub.sat.v258i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> %m, i32 128)
+  ret <256 x i8> %v
+}
+
+declare <2 x i16> @llvm.vp.usub.sat.v2i16(<2 x i16>, <2 x i16>, <2 x i1>, i32)
+
+define <2 x i16> @vssubu_vv_v2i16(<2 x i16> %va, <2 x i16> %b, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_v2i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <2 x i16> @llvm.vp.usub.sat.v2i16(<2 x i16> %va, <2 x i16> %b, <2 x i1> %m, i32 %evl)
+  ret <2 x i16> %v
+}
+
+define <2 x i16> @vssubu_vv_v2i16_unmasked(<2 x i16> %va, <2 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_v2i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i16> @llvm.vp.usub.sat.v2i16(<2 x i16> %va, <2 x i16> %b, <2 x i1> %m, i32 %evl)
+  ret <2 x i16> %v
+}
+
+define <2 x i16> @vssubu_vx_v2i16(<2 x i16> %va, i16 %b, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_v2i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i16> @llvm.vp.usub.sat.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl)
+  ret <2 x i16> %v
+}
+
+define <2 x i16> @vssubu_vx_v2i16_unmasked(<2 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_v2i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer
+  %head = insertelement <2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i16> @llvm.vp.usub.sat.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl)
+  ret <2 x i16> %v
+}
+
+define <2 x i16> @vssubu_vi_v2i16(<2 x i16> %va, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_v2i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <2 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i16> @llvm.vp.usub.sat.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl)
+  ret <2 x i16> %v
+}
+
+define <2 x i16> @vssubu_vi_v2i16_unmasked(<2 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_v2i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <2 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer
+  %head = insertelement <2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i16> @llvm.vp.usub.sat.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl)
+  ret <2 x i16> %v
+}
+
+declare <4 x i16> @llvm.vp.usub.sat.v4i16(<4 x i16>, <4 x i16>, <4 x i1>, i32)
+
+define <4 x i16> @vssubu_vv_v4i16(<4 x i16> %va, <4 x i16> %b, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_v4i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <4 x i16> @llvm.vp.usub.sat.v4i16(<4 x i16> %va, <4 x i16> %b, <4 x i1> %m, i32 %evl)
+  ret <4 x i16> %v
+}
+
+define <4 x i16> @vssubu_vv_v4i16_unmasked(<4 x i16> %va, <4 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_v4i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i16> @llvm.vp.usub.sat.v4i16(<4 x i16> %va, <4 x i16> %b, <4 x i1> %m, i32 %evl)
+  ret <4 x i16> %v
+}
+
+define <4 x i16> @vssubu_vx_v4i16(<4 x i16> %va, i16 %b, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_v4i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i16> @llvm.vp.usub.sat.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl)
+  ret <4 x i16> %v
+}
+
+define <4 x i16> @vssubu_vx_v4i16_unmasked(<4 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_v4i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer
+  %head = insertelement <4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i16> @llvm.vp.usub.sat.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl)
+  ret <4 x i16> %v
+}
+
+define <4 x i16> @vssubu_vi_v4i16(<4 x i16> %va, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_v4i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <4 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i16> @llvm.vp.usub.sat.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl)
+  ret <4 x i16> %v
+}
+
+define <4 x i16> @vssubu_vi_v4i16_unmasked(<4 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_v4i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <4 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer
+  %head = insertelement <4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i16> @llvm.vp.usub.sat.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl)
+  ret <4 x i16> %v
+}
+
+declare <8 x i16> @llvm.vp.usub.sat.v8i16(<8 x i16>, <8 x i16>, <8 x i1>, i32)
+
+define <8 x i16> @vssubu_vv_v8i16(<8 x i16> %va, <8 x i16> %b, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_v8i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <8 x i16> @llvm.vp.usub.sat.v8i16(<8 x i16> %va, <8 x i16> %b, <8 x i1> %m, i32 %evl)
+  ret <8 x i16> %v
+}
+
+define <8 x i16> @vssubu_vv_v8i16_unmasked(<8 x i16> %va, <8 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_v8i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i16> @llvm.vp.usub.sat.v8i16(<8 x i16> %va, <8 x i16> %b, <8 x i1> %m, i32 %evl)
+  ret <8 x i16> %v
+}
+
+define <8 x i16> @vssubu_vx_v8i16(<8 x i16> %va, i16 %b, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_v8i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <8 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <8 x i16> %elt.head, <8 x i16> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i16> @llvm.vp.usub.sat.v8i16(<8 x i16> %va, <8 x i16> %vb, <8 x i1> %m, i32 %evl)
+  ret <8 x i16> %v
+}
+
+define <8 x i16> @vssubu_vx_v8i16_unmasked(<8 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_v8i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <8 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <8 x i16> %elt.head, <8 x i16> poison, <8 x i32> zeroinitializer
+  %head = insertelement <8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i16> @llvm.vp.usub.sat.v8i16(<8 x i16> %va, <8 x i16> %vb, <8 x i1> %m, i32 %evl)
+  ret <8 x i16> %v
+}
+
+define <8 x i16> @vssubu_vi_v8i16(<8 x i16> %va, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_v8i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <8 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <8 x i16> %elt.head, <8 x i16> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i16> @llvm.vp.usub.sat.v8i16(<8 x i16> %va, <8 x i16> %vb, <8 x i1> %m, i32 %evl)
+  ret <8 x i16> %v
+}
+
+define <8 x i16> @vssubu_vi_v8i16_unmasked(<8 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_v8i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <8 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <8 x i16> %elt.head, <8 x i16> poison, <8 x i32> zeroinitializer
+  %head = insertelement <8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i16> @llvm.vp.usub.sat.v8i16(<8 x i16> %va, <8 x i16> %vb, <8 x i1> %m, i32 %evl)
+  ret <8 x i16> %v
+}
+
+declare <16 x i16> @llvm.vp.usub.sat.v16i16(<16 x i16>, <16 x i16>, <16 x i1>, i32)
+
+define <16 x i16> @vssubu_vv_v16i16(<16 x i16> %va, <16 x i16> %b, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_v16i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v10, v0.t
+; CHECK-NEXT:    ret
+  %v = call <16 x i16> @llvm.vp.usub.sat.v16i16(<16 x i16> %va, <16 x i16> %b, <16 x i1> %m, i32 %evl)
+  ret <16 x i16> %v
+}
+
+define <16 x i16> @vssubu_vv_v16i16_unmasked(<16 x i16> %va, <16 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_v16i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v10
+; CHECK-NEXT:    ret
+  %head = insertelement <16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i16> @llvm.vp.usub.sat.v16i16(<16 x i16> %va, <16 x i16> %b, <16 x i1> %m, i32 %evl)
+  ret <16 x i16> %v
+}
+
+define <16 x i16> @vssubu_vx_v16i16(<16 x i16> %va, i16 %b, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_v16i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <16 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <16 x i16> %elt.head, <16 x i16> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i16> @llvm.vp.usub.sat.v16i16(<16 x i16> %va, <16 x i16> %vb, <16 x i1> %m, i32 %evl)
+  ret <16 x i16> %v
+}
+
+define <16 x i16> @vssubu_vx_v16i16_unmasked(<16 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_v16i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <16 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <16 x i16> %elt.head, <16 x i16> poison, <16 x i32> zeroinitializer
+  %head = insertelement <16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i16> @llvm.vp.usub.sat.v16i16(<16 x i16> %va, <16 x i16> %vb, <16 x i1> %m, i32 %evl)
+  ret <16 x i16> %v
+}
+
+define <16 x i16> @vssubu_vi_v16i16(<16 x i16> %va, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_v16i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <16 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <16 x i16> %elt.head, <16 x i16> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i16> @llvm.vp.usub.sat.v16i16(<16 x i16> %va, <16 x i16> %vb, <16 x i1> %m, i32 %evl)
+  ret <16 x i16> %v
+}
+
+define <16 x i16> @vssubu_vi_v16i16_unmasked(<16 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_v16i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <16 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <16 x i16> %elt.head, <16 x i16> poison, <16 x i32> zeroinitializer
+  %head = insertelement <16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i16> @llvm.vp.usub.sat.v16i16(<16 x i16> %va, <16 x i16> %vb, <16 x i1> %m, i32 %evl)
+  ret <16 x i16> %v
+}
+
+declare <2 x i32> @llvm.vp.usub.sat.v2i32(<2 x i32>, <2 x i32>, <2 x i1>, i32)
+
+define <2 x i32> @vssubu_vv_v2i32(<2 x i32> %va, <2 x i32> %b, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_v2i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <2 x i32> @llvm.vp.usub.sat.v2i32(<2 x i32> %va, <2 x i32> %b, <2 x i1> %m, i32 %evl)
+  ret <2 x i32> %v
+}
+
+define <2 x i32> @vssubu_vv_v2i32_unmasked(<2 x i32> %va, <2 x i32> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_v2i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i32> @llvm.vp.usub.sat.v2i32(<2 x i32> %va, <2 x i32> %b, <2 x i1> %m, i32 %evl)
+  ret <2 x i32> %v
+}
+
+define <2 x i32> @vssubu_vx_v2i32(<2 x i32> %va, i32 %b, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_v2i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i32> @llvm.vp.usub.sat.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl)
+  ret <2 x i32> %v
+}
+
+define <2 x i32> @vssubu_vx_v2i32_unmasked(<2 x i32> %va, i32 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_v2i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer
+  %head = insertelement <2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i32> @llvm.vp.usub.sat.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl)
+  ret <2 x i32> %v
+}
+
+define <2 x i32> @vssubu_vi_v2i32(<2 x i32> %va, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_v2i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <2 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i32> @llvm.vp.usub.sat.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl)
+  ret <2 x i32> %v
+}
+
+define <2 x i32> @vssubu_vi_v2i32_unmasked(<2 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_v2i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <2 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer
+  %head = insertelement <2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i32> @llvm.vp.usub.sat.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl)
+  ret <2 x i32> %v
+}
+
+declare <4 x i32> @llvm.vp.usub.sat.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
+
+define <4 x i32> @vssubu_vv_v4i32(<4 x i32> %va, <4 x i32> %b, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_v4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <4 x i32> @llvm.vp.usub.sat.v4i32(<4 x i32> %va, <4 x i32> %b, <4 x i1> %m, i32 %evl)
+  ret <4 x i32> %v
+}
+
+define <4 x i32> @vssubu_vv_v4i32_unmasked(<4 x i32> %va, <4 x i32> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_v4i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i32> @llvm.vp.usub.sat.v4i32(<4 x i32> %va, <4 x i32> %b, <4 x i1> %m, i32 %evl)
+  ret <4 x i32> %v
+}
+
+define <4 x i32> @vssubu_vx_v4i32(<4 x i32> %va, i32 %b, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_v4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <4 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <4 x i32> %elt.head, <4 x i32> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i32> @llvm.vp.usub.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 %evl)
+  ret <4 x i32> %v
+}
+
+define <4 x i32> @vssubu_vx_v4i32_unmasked(<4 x i32> %va, i32 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_v4i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <4 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <4 x i32> %elt.head, <4 x i32> poison, <4 x i32> zeroinitializer
+  %head = insertelement <4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i32> @llvm.vp.usub.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 %evl)
+  ret <4 x i32> %v
+}
+
+define <4 x i32> @vssubu_vi_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_v4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <4 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <4 x i32> %elt.head, <4 x i32> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i32> @llvm.vp.usub.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 %evl)
+  ret <4 x i32> %v
+}
+
+define <4 x i32> @vssubu_vi_v4i32_unmasked(<4 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_v4i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <4 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <4 x i32> %elt.head, <4 x i32> poison, <4 x i32> zeroinitializer
+  %head = insertelement <4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i32> @llvm.vp.usub.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 %evl)
+  ret <4 x i32> %v
+}
+
+declare <8 x i32> @llvm.vp.usub.sat.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32)
+
+define <8 x i32> @vssubu_vv_v8i32(<8 x i32> %va, <8 x i32> %b, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_v8i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v10, v0.t
+; CHECK-NEXT:    ret
+  %v = call <8 x i32> @llvm.vp.usub.sat.v8i32(<8 x i32> %va, <8 x i32> %b, <8 x i1> %m, i32 %evl)
+  ret <8 x i32> %v
+}
+
+define <8 x i32> @vssubu_vv_v8i32_unmasked(<8 x i32> %va, <8 x i32> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_v8i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v10
+; CHECK-NEXT:    ret
+  %head = insertelement <8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i32> @llvm.vp.usub.sat.v8i32(<8 x i32> %va, <8 x i32> %b, <8 x i1> %m, i32 %evl)
+  ret <8 x i32> %v
+}
+
+define <8 x i32> @vssubu_vx_v8i32(<8 x i32> %va, i32 %b, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_v8i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <8 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <8 x i32> %elt.head, <8 x i32> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i32> @llvm.vp.usub.sat.v8i32(<8 x i32> %va, <8 x i32> %vb, <8 x i1> %m, i32 %evl)
+  ret <8 x i32> %v
+}
+
+define <8 x i32> @vssubu_vx_v8i32_unmasked(<8 x i32> %va, i32 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_v8i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <8 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <8 x i32> %elt.head, <8 x i32> poison, <8 x i32> zeroinitializer
+  %head = insertelement <8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i32> @llvm.vp.usub.sat.v8i32(<8 x i32> %va, <8 x i32> %vb, <8 x i1> %m, i32 %evl)
+  ret <8 x i32> %v
+}
+
+define <8 x i32> @vssubu_vi_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_v8i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <8 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <8 x i32> %elt.head, <8 x i32> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i32> @llvm.vp.usub.sat.v8i32(<8 x i32> %va, <8 x i32> %vb, <8 x i1> %m, i32 %evl)
+  ret <8 x i32> %v
+}
+
+define <8 x i32> @vssubu_vi_v8i32_unmasked(<8 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_v8i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <8 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <8 x i32> %elt.head, <8 x i32> poison, <8 x i32> zeroinitializer
+  %head = insertelement <8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i32> @llvm.vp.usub.sat.v8i32(<8 x i32> %va, <8 x i32> %vb, <8 x i1> %m, i32 %evl)
+  ret <8 x i32> %v
+}
+
+declare <16 x i32> @llvm.vp.usub.sat.v16i32(<16 x i32>, <16 x i32>, <16 x i1>, i32)
+
+define <16 x i32> @vssubu_vv_v16i32(<16 x i32> %va, <16 x i32> %b, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_v16i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v12, v0.t
+; CHECK-NEXT:    ret
+  %v = call <16 x i32> @llvm.vp.usub.sat.v16i32(<16 x i32> %va, <16 x i32> %b, <16 x i1> %m, i32 %evl)
+  ret <16 x i32> %v
+}
+
+define <16 x i32> @vssubu_vv_v16i32_unmasked(<16 x i32> %va, <16 x i32> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_v16i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v12
+; CHECK-NEXT:    ret
+  %head = insertelement <16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i32> @llvm.vp.usub.sat.v16i32(<16 x i32> %va, <16 x i32> %b, <16 x i1> %m, i32 %evl)
+  ret <16 x i32> %v
+}
+
+define <16 x i32> @vssubu_vx_v16i32(<16 x i32> %va, i32 %b, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_v16i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <16 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <16 x i32> %elt.head, <16 x i32> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i32> @llvm.vp.usub.sat.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i1> %m, i32 %evl)
+  ret <16 x i32> %v
+}
+
+define <16 x i32> @vssubu_vx_v16i32_unmasked(<16 x i32> %va, i32 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_v16i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <16 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <16 x i32> %elt.head, <16 x i32> poison, <16 x i32> zeroinitializer
+  %head = insertelement <16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i32> @llvm.vp.usub.sat.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i1> %m, i32 %evl)
+  ret <16 x i32> %v
+}
+
+define <16 x i32> @vssubu_vi_v16i32(<16 x i32> %va, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_v16i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <16 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <16 x i32> %elt.head, <16 x i32> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i32> @llvm.vp.usub.sat.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i1> %m, i32 %evl)
+  ret <16 x i32> %v
+}
+
+define <16 x i32> @vssubu_vi_v16i32_unmasked(<16 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_v16i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <16 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <16 x i32> %elt.head, <16 x i32> poison, <16 x i32> zeroinitializer
+  %head = insertelement <16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i32> @llvm.vp.usub.sat.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i1> %m, i32 %evl)
+  ret <16 x i32> %v
+}
+
+declare <2 x i64> @llvm.vp.usub.sat.v2i64(<2 x i64>, <2 x i64>, <2 x i1>, i32)
+
+define <2 x i64> @vssubu_vv_v2i64(<2 x i64> %va, <2 x i64> %b, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_v2i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <2 x i64> @llvm.vp.usub.sat.v2i64(<2 x i64> %va, <2 x i64> %b, <2 x i1> %m, i32 %evl)
+  ret <2 x i64> %v
+}
+
+define <2 x i64> @vssubu_vv_v2i64_unmasked(<2 x i64> %va, <2 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_v2i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i64> @llvm.vp.usub.sat.v2i64(<2 x i64> %va, <2 x i64> %b, <2 x i1> %m, i32 %evl)
+  ret <2 x i64> %v
+}
+
+define <2 x i64> @vssubu_vx_v2i64(<2 x i64> %va, i64 %b, <2 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vssubu_vx_v2i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
+; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vssubu.vv v8, v8, v9, v0.t
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vssubu_vx_v2i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; RV64-NEXT:    vssubu.vx v8, v8, a0, v0.t
+; RV64-NEXT:    ret
+  %elt.head = insertelement <2 x i64> poison, i64 %b, i32 0
+  %vb = shufflevector <2 x i64> %elt.head, <2 x i64> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i64> @llvm.vp.usub.sat.v2i64(<2 x i64> %va, <2 x i64> %vb, <2 x i1> %m, i32 %evl)
+  ret <2 x i64> %v
+}
+
+define <2 x i64> @vssubu_vx_v2i64_unmasked(<2 x i64> %va, i64 %b, i32 zeroext %evl) {
+; RV32-LABEL: vssubu_vx_v2i64_unmasked:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
+; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vssubu.vv v8, v8, v9
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vssubu_vx_v2i64_unmasked:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; RV64-NEXT:    vssubu.vx v8, v8, a0
+; RV64-NEXT:    ret
+  %elt.head = insertelement <2 x i64> poison, i64 %b, i32 0
+  %vb = shufflevector <2 x i64> %elt.head, <2 x i64> poison, <2 x i32> zeroinitializer
+  %head = insertelement <2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i64> @llvm.vp.usub.sat.v2i64(<2 x i64> %va, <2 x i64> %vb, <2 x i1> %m, i32 %evl)
+  ret <2 x i64> %v
+}
+
+define <2 x i64> @vssubu_vi_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_v2i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <2 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <2 x i64> %elt.head, <2 x i64> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i64> @llvm.vp.usub.sat.v2i64(<2 x i64> %va, <2 x i64> %vb, <2 x i1> %m, i32 %evl)
+  ret <2 x i64> %v
+}
+
+define <2 x i64> @vssubu_vi_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_v2i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <2 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <2 x i64> %elt.head, <2 x i64> poison, <2 x i32> zeroinitializer
+  %head = insertelement <2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+  %v = call <2 x i64> @llvm.vp.usub.sat.v2i64(<2 x i64> %va, <2 x i64> %vb, <2 x i1> %m, i32 %evl)
+  ret <2 x i64> %v
+}
+
+declare <4 x i64> @llvm.vp.usub.sat.v4i64(<4 x i64>, <4 x i64>, <4 x i1>, i32)
+
+define <4 x i64> @vssubu_vv_v4i64(<4 x i64> %va, <4 x i64> %b, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_v4i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v10, v0.t
+; CHECK-NEXT:    ret
+  %v = call <4 x i64> @llvm.vp.usub.sat.v4i64(<4 x i64> %va, <4 x i64> %b, <4 x i1> %m, i32 %evl)
+  ret <4 x i64> %v
+}
+
+define <4 x i64> @vssubu_vv_v4i64_unmasked(<4 x i64> %va, <4 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_v4i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v10
+; CHECK-NEXT:    ret
+  %head = insertelement <4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i64> @llvm.vp.usub.sat.v4i64(<4 x i64> %va, <4 x i64> %b, <4 x i1> %m, i32 %evl)
+  ret <4 x i64> %v
+}
+
+define <4 x i64> @vssubu_vx_v4i64(<4 x i64> %va, i64 %b, <4 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vssubu_vx_v4i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
+; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vssubu.vv v8, v8, v10, v0.t
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vssubu_vx_v4i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; RV64-NEXT:    vssubu.vx v8, v8, a0, v0.t
+; RV64-NEXT:    ret
+  %elt.head = insertelement <4 x i64> poison, i64 %b, i32 0
+  %vb = shufflevector <4 x i64> %elt.head, <4 x i64> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i64> @llvm.vp.usub.sat.v4i64(<4 x i64> %va, <4 x i64> %vb, <4 x i1> %m, i32 %evl)
+  ret <4 x i64> %v
+}
+
+define <4 x i64> @vssubu_vx_v4i64_unmasked(<4 x i64> %va, i64 %b, i32 zeroext %evl) {
+; RV32-LABEL: vssubu_vx_v4i64_unmasked:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
+; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vssubu.vv v8, v8, v10
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vssubu_vx_v4i64_unmasked:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; RV64-NEXT:    vssubu.vx v8, v8, a0
+; RV64-NEXT:    ret
+  %elt.head = insertelement <4 x i64> poison, i64 %b, i32 0
+  %vb = shufflevector <4 x i64> %elt.head, <4 x i64> poison, <4 x i32> zeroinitializer
+  %head = insertelement <4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i64> @llvm.vp.usub.sat.v4i64(<4 x i64> %va, <4 x i64> %vb, <4 x i1> %m, i32 %evl)
+  ret <4 x i64> %v
+}
+
+define <4 x i64> @vssubu_vi_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_v4i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <4 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <4 x i64> %elt.head, <4 x i64> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i64> @llvm.vp.usub.sat.v4i64(<4 x i64> %va, <4 x i64> %vb, <4 x i1> %m, i32 %evl)
+  ret <4 x i64> %v
+}
+
+define <4 x i64> @vssubu_vi_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_v4i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <4 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <4 x i64> %elt.head, <4 x i64> poison, <4 x i32> zeroinitializer
+  %head = insertelement <4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+  %v = call <4 x i64> @llvm.vp.usub.sat.v4i64(<4 x i64> %va, <4 x i64> %vb, <4 x i1> %m, i32 %evl)
+  ret <4 x i64> %v
+}
+
+declare <8 x i64> @llvm.vp.usub.sat.v8i64(<8 x i64>, <8 x i64>, <8 x i1>, i32)
+
+define <8 x i64> @vssubu_vv_v8i64(<8 x i64> %va, <8 x i64> %b, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_v8i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v12, v0.t
+; CHECK-NEXT:    ret
+  %v = call <8 x i64> @llvm.vp.usub.sat.v8i64(<8 x i64> %va, <8 x i64> %b, <8 x i1> %m, i32 %evl)
+  ret <8 x i64> %v
+}
+
+define <8 x i64> @vssubu_vv_v8i64_unmasked(<8 x i64> %va, <8 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_v8i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v12
+; CHECK-NEXT:    ret
+  %head = insertelement <8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i64> @llvm.vp.usub.sat.v8i64(<8 x i64> %va, <8 x i64> %b, <8 x i1> %m, i32 %evl)
+  ret <8 x i64> %v
+}
+
+define <8 x i64> @vssubu_vx_v8i64(<8 x i64> %va, i64 %b, <8 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vssubu_vx_v8i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
+; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vssubu.vv v8, v8, v12, v0.t
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vssubu_vx_v8i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; RV64-NEXT:    vssubu.vx v8, v8, a0, v0.t
+; RV64-NEXT:    ret
+  %elt.head = insertelement <8 x i64> poison, i64 %b, i32 0
+  %vb = shufflevector <8 x i64> %elt.head, <8 x i64> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i64> @llvm.vp.usub.sat.v8i64(<8 x i64> %va, <8 x i64> %vb, <8 x i1> %m, i32 %evl)
+  ret <8 x i64> %v
+}
+
+define <8 x i64> @vssubu_vx_v8i64_unmasked(<8 x i64> %va, i64 %b, i32 zeroext %evl) {
+; RV32-LABEL: vssubu_vx_v8i64_unmasked:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
+; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vssubu.vv v8, v8, v12
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vssubu_vx_v8i64_unmasked:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; RV64-NEXT:    vssubu.vx v8, v8, a0
+; RV64-NEXT:    ret
+  %elt.head = insertelement <8 x i64> poison, i64 %b, i32 0
+  %vb = shufflevector <8 x i64> %elt.head, <8 x i64> poison, <8 x i32> zeroinitializer
+  %head = insertelement <8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i64> @llvm.vp.usub.sat.v8i64(<8 x i64> %va, <8 x i64> %vb, <8 x i1> %m, i32 %evl)
+  ret <8 x i64> %v
+}
+
+define <8 x i64> @vssubu_vi_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_v8i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <8 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <8 x i64> %elt.head, <8 x i64> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i64> @llvm.vp.usub.sat.v8i64(<8 x i64> %va, <8 x i64> %vb, <8 x i1> %m, i32 %evl)
+  ret <8 x i64> %v
+}
+
+define <8 x i64> @vssubu_vi_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_v8i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <8 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <8 x i64> %elt.head, <8 x i64> poison, <8 x i32> zeroinitializer
+  %head = insertelement <8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+  %v = call <8 x i64> @llvm.vp.usub.sat.v8i64(<8 x i64> %va, <8 x i64> %vb, <8 x i1> %m, i32 %evl)
+  ret <8 x i64> %v
+}
+
+declare <16 x i64> @llvm.vp.usub.sat.v16i64(<16 x i64>, <16 x i64>, <16 x i1>, i32)
+
+define <16 x i64> @vssubu_vv_v16i64(<16 x i64> %va, <16 x i64> %b, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_v16i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    ret
+  %v = call <16 x i64> @llvm.vp.usub.sat.v16i64(<16 x i64> %va, <16 x i64> %b, <16 x i1> %m, i32 %evl)
+  ret <16 x i64> %v
+}
+
+define <16 x i64> @vssubu_vv_v16i64_unmasked(<16 x i64> %va, <16 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_v16i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v16
+; CHECK-NEXT:    ret
+  %head = insertelement <16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i64> @llvm.vp.usub.sat.v16i64(<16 x i64> %va, <16 x i64> %b, <16 x i1> %m, i32 %evl)
+  ret <16 x i64> %v
+}
+
+define <16 x i64> @vssubu_vx_v16i64(<16 x i64> %va, i64 %b, <16 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vssubu_vx_v16i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
+; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vssubu.vv v8, v8, v16, v0.t
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vssubu_vx_v16i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vssubu.vx v8, v8, a0, v0.t
+; RV64-NEXT:    ret
+  %elt.head = insertelement <16 x i64> poison, i64 %b, i32 0
+  %vb = shufflevector <16 x i64> %elt.head, <16 x i64> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i64> @llvm.vp.usub.sat.v16i64(<16 x i64> %va, <16 x i64> %vb, <16 x i1> %m, i32 %evl)
+  ret <16 x i64> %v
+}
+
+define <16 x i64> @vssubu_vx_v16i64_unmasked(<16 x i64> %va, i64 %b, i32 zeroext %evl) {
+; RV32-LABEL: vssubu_vx_v16i64_unmasked:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
+; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vssubu.vv v8, v8, v16
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vssubu_vx_v16i64_unmasked:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vssubu.vx v8, v8, a0
+; RV64-NEXT:    ret
+  %elt.head = insertelement <16 x i64> poison, i64 %b, i32 0
+  %vb = shufflevector <16 x i64> %elt.head, <16 x i64> poison, <16 x i32> zeroinitializer
+  %head = insertelement <16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i64> @llvm.vp.usub.sat.v16i64(<16 x i64> %va, <16 x i64> %vb, <16 x i1> %m, i32 %evl)
+  ret <16 x i64> %v
+}
+
+define <16 x i64> @vssubu_vi_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_v16i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <16 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <16 x i64> %elt.head, <16 x i64> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i64> @llvm.vp.usub.sat.v16i64(<16 x i64> %va, <16 x i64> %vb, <16 x i1> %m, i32 %evl)
+  ret <16 x i64> %v
+}
+
+define <16 x i64> @vssubu_vi_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_v16i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <16 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <16 x i64> %elt.head, <16 x i64> poison, <16 x i32> zeroinitializer
+  %head = insertelement <16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i64> @llvm.vp.usub.sat.v16i64(<16 x i64> %va, <16 x i64> %vb, <16 x i1> %m, i32 %evl)
+  ret <16 x i64> %v
+}
+
+; Test that split-legalization works as expected.
+
+declare <32 x i64> @llvm.vp.usub.sat.v32i64(<32 x i64>, <32 x i64>, <32 x i1>, i32)
+
+define <32 x i64> @vssubu_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vssubu_vx_v32i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT:    vslidedown.vi v1, v0, 2
+; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT:    li a2, 16
+; RV32-NEXT:    vmv.v.i v24, -1
+; RV32-NEXT:    mv a1, a0
+; RV32-NEXT:    bltu a0, a2, .LBB108_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    li a1, 16
+; RV32-NEXT:  .LBB108_2:
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vssubu.vv v8, v8, v24, v0.t
+; RV32-NEXT:    addi a1, a0, -16
+; RV32-NEXT:    sltu a0, a0, a1
+; RV32-NEXT:    addi a0, a0, -1
+; RV32-NEXT:    and a0, a0, a1
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vmv1r.v v0, v1
+; RV32-NEXT:    vssubu.vv v16, v16, v24, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vssubu_vx_v32i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV64-NEXT:    li a2, 16
+; RV64-NEXT:    vslidedown.vi v24, v0, 2
+; RV64-NEXT:    mv a1, a0
+; RV64-NEXT:    bltu a0, a2, .LBB108_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    li a1, 16
+; RV64-NEXT:  .LBB108_2:
+; RV64-NEXT:    li a2, -1
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vssubu.vx v8, v8, a2, v0.t
+; RV64-NEXT:    addi a1, a0, -16
+; RV64-NEXT:    sltu a0, a0, a1
+; RV64-NEXT:    addi a0, a0, -1
+; RV64-NEXT:    and a0, a0, a1
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT:    vmv1r.v v0, v24
+; RV64-NEXT:    vssubu.vx v16, v16, a2, v0.t
+; RV64-NEXT:    ret
+  %elt.head = insertelement <32 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <32 x i64> %elt.head, <32 x i64> poison, <32 x i32> zeroinitializer
+  %v = call <32 x i64> @llvm.vp.usub.sat.v32i64(<32 x i64> %va, <32 x i64> %vb, <32 x i1> %m, i32 %evl)
+  ret <32 x i64> %v
+}
+
+define <32 x i64> @vssubu_vi_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
+; RV32-LABEL: vssubu_vi_v32i64_unmasked:
+; RV32:       # %bb.0:
+; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT:    li a2, 16
+; RV32-NEXT:    vmv.v.i v24, -1
+; RV32-NEXT:    mv a1, a0
+; RV32-NEXT:    bltu a0, a2, .LBB109_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    li a1, 16
+; RV32-NEXT:  .LBB109_2:
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vssubu.vv v8, v8, v24
+; RV32-NEXT:    addi a1, a0, -16
+; RV32-NEXT:    sltu a0, a0, a1
+; RV32-NEXT:    addi a0, a0, -1
+; RV32-NEXT:    and a0, a0, a1
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vssubu.vv v16, v16, v24
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vssubu_vi_v32i64_unmasked:
+; RV64:       # %bb.0:
+; RV64-NEXT:    li a2, 16
+; RV64-NEXT:    mv a1, a0
+; RV64-NEXT:    bltu a0, a2, .LBB109_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    li a1, 16
+; RV64-NEXT:  .LBB109_2:
+; RV64-NEXT:    li a2, -1
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vssubu.vx v8, v8, a2
+; RV64-NEXT:    addi a1, a0, -16
+; RV64-NEXT:    sltu a0, a0, a1
+; RV64-NEXT:    addi a0, a0, -1
+; RV64-NEXT:    and a0, a0, a1
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT:    vssubu.vx v16, v16, a2
+; RV64-NEXT:    ret
+  %elt.head = insertelement <32 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <32 x i64> %elt.head, <32 x i64> poison, <32 x i32> zeroinitializer
+  %head = insertelement <32 x i1> poison, i1 true, i32 0
+  %m = shufflevector <32 x i1> %head, <32 x i1> poison, <32 x i32> zeroinitializer
+  %v = call <32 x i64> @llvm.vp.usub.sat.v32i64(<32 x i64> %va, <32 x i64> %vb, <32 x i1> %m, i32 %evl)
+  ret <32 x i64> %v
+}
+
+; FIXME: We don't match vssubu.vi on RV32.
+
+define <32 x i64> @vssubu_vx_v32i64_evl12(<32 x i64> %va, <32 x i1> %m) {
+; RV32-LABEL: vssubu_vx_v32i64_evl12:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT:    vslidedown.vi v1, v0, 2
+; RV32-NEXT:    li a0, 32
+; RV32-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.i v24, -1
+; RV32-NEXT:    vsetivli zero, 12, e64, m8, ta, ma
+; RV32-NEXT:    vssubu.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vsetivli zero, 0, e64, m8, ta, ma
+; RV32-NEXT:    vmv1r.v v0, v1
+; RV32-NEXT:    vssubu.vv v16, v16, v24, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vssubu_vx_v32i64_evl12:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV64-NEXT:    vslidedown.vi v24, v0, 2
+; RV64-NEXT:    li a0, -1
+; RV64-NEXT:    vsetivli zero, 12, e64, m8, ta, ma
+; RV64-NEXT:    vssubu.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vsetivli zero, 0, e64, m8, ta, ma
+; RV64-NEXT:    vmv1r.v v0, v24
+; RV64-NEXT:    vssubu.vx v16, v16, a0, v0.t
+; RV64-NEXT:    ret
+  %elt.head = insertelement <32 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <32 x i64> %elt.head, <32 x i64> poison, <32 x i32> zeroinitializer
+  %v = call <32 x i64> @llvm.vp.usub.sat.v32i64(<32 x i64> %va, <32 x i64> %vb, <32 x i1> %m, i32 12)
+  ret <32 x i64> %v
+}
+
+define <32 x i64> @vssubu_vx_v32i64_evl27(<32 x i64> %va, <32 x i1> %m) {
+; RV32-LABEL: vssubu_vx_v32i64_evl27:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT:    vslidedown.vi v1, v0, 2
+; RV32-NEXT:    li a0, 32
+; RV32-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.i v24, -1
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vssubu.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vsetivli zero, 11, e64, m8, ta, ma
+; RV32-NEXT:    vmv1r.v v0, v1
+; RV32-NEXT:    vssubu.vv v16, v16, v24, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vssubu_vx_v32i64_evl27:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV64-NEXT:    vslidedown.vi v24, v0, 2
+; RV64-NEXT:    li a0, -1
+; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV64-NEXT:    vssubu.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vsetivli zero, 11, e64, m8, ta, ma
+; RV64-NEXT:    vmv1r.v v0, v24
+; RV64-NEXT:    vssubu.vx v16, v16, a0, v0.t
+; RV64-NEXT:    ret
+  %elt.head = insertelement <32 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <32 x i64> %elt.head, <32 x i64> poison, <32 x i32> zeroinitializer
+  %v = call <32 x i64> @llvm.vp.usub.sat.v32i64(<32 x i64> %va, <32 x i64> %vb, <32 x i1> %m, i32 27)
+  ret <32 x i64> %v
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll
index 574c265..a084b53 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll
@@ -385,12 +385,12 @@ define <32 x i64> @vwsubu_v32i64(ptr %x, ptr %y) nounwind {
 define <2 x i32> @vwsubu_v2i32_v2i8(ptr %x, ptr %y) {
 ; CHECK-LABEL: vwsubu_v2i32_v2i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
 ; CHECK-NEXT:    vle8.v v8, (a0)
 ; CHECK-NEXT:    vle8.v v9, (a1)
-; CHECK-NEXT:    vzext.vf2 v10, v8
-; CHECK-NEXT:    vzext.vf2 v11, v9
-; CHECK-NEXT:    vwsubu.vv v8, v10, v11
+; CHECK-NEXT:    vwsubu.vv v10, v8, v9
+; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vsext.vf2 v8, v10
 ; CHECK-NEXT:    ret
   %a = load <2 x i8>, ptr %x
   %b = load <2 x i8>, ptr %y
@@ -899,12 +899,12 @@ define <2 x i64> @vwsubu_vx_v2i64_i64(ptr %x, ptr %y) nounwind {
 define <2 x i32> @vwsubu_v2i32_of_v2i8(ptr %x, ptr %y) {
 ; CHECK-LABEL: vwsubu_v2i32_of_v2i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
 ; CHECK-NEXT:    vle8.v v8, (a0)
 ; CHECK-NEXT:    vle8.v v9, (a1)
-; CHECK-NEXT:    vzext.vf2 v10, v8
-; CHECK-NEXT:    vzext.vf2 v11, v9
-; CHECK-NEXT:    vwsubu.vv v8, v10, v11
+; CHECK-NEXT:    vwsubu.vv v10, v8, v9
+; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vsext.vf2 v8, v10
 ; CHECK-NEXT:    ret
   %a = load <2 x i8>, ptr %x
   %b = load <2 x i8>, ptr %y
@@ -917,12 +917,12 @@ define <2 x i32> @vwsubu_v2i32_of_v2i8(ptr %x, ptr %y) {
 define <2 x i64> @vwsubu_v2i64_of_v2i8(ptr %x, ptr %y) {
 ; CHECK-LABEL: vwsubu_v2i64_of_v2i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
 ; CHECK-NEXT:    vle8.v v8, (a0)
 ; CHECK-NEXT:    vle8.v v9, (a1)
-; CHECK-NEXT:    vzext.vf4 v10, v8
-; CHECK-NEXT:    vzext.vf4 v11, v9
-; CHECK-NEXT:    vwsubu.vv v8, v10, v11
+; CHECK-NEXT:    vwsubu.vv v10, v8, v9
+; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
+; CHECK-NEXT:    vsext.vf4 v8, v10
 ; CHECK-NEXT:    ret
   %a = load <2 x i8>, ptr %x
   %b = load <2 x i8>, ptr %y
@@ -935,12 +935,12 @@ define <2 x i64> @vwsubu_v2i64_of_v2i8(ptr %x, ptr %y) {
 define <2 x i64> @vwsubu_v2i64_of_v2i16(ptr %x, ptr %y) {
 ; CHECK-LABEL: vwsubu_v2i64_of_v2i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0)
 ; CHECK-NEXT:    vle16.v v9, (a1)
-; CHECK-NEXT:    vzext.vf2 v10, v8
-; CHECK-NEXT:    vzext.vf2 v11, v9
-; CHECK-NEXT:    vwsubu.vv v8, v10, v11
+; CHECK-NEXT:    vwsubu.vv v10, v8, v9
+; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
+; CHECK-NEXT:    vsext.vf2 v8, v10
 ; CHECK-NEXT:    ret
   %a = load <2 x i16>, ptr %x
   %b = load <2 x i16>, ptr %y
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vsadd-vp.ll
new file mode 100644
index 0000000..caaeae5
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vsadd-vp.ll
@@ -0,0 +1,2015 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s --check-prefixes=CHECK,RV64
+
+declare <vscale x 8 x i7> @llvm.vp.sadd.sat.nxv8i7(<vscale x 8 x i7>, <vscale x 8 x i7>, <vscale x 8 x i1>, i32)
+
+define <vscale x 8 x i7> @vsadd_vx_nxv8i7(<vscale x 8 x i7> %a, i7 signext %b, <vscale x 8 x i1> %mask, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv8i7:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a2, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vadd.vv v8, v8, v8
+; CHECK-NEXT:    vsra.vi v8, v8, 1
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    li a0, 63
+; CHECK-NEXT:    vmin.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    li a0, 192
+; CHECK-NEXT:    vmax.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i7> poison, i7 %b, i32 0
+  %vb = shufflevector <vscale x 8 x i7> %elt.head, <vscale x 8 x i7> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i7> @llvm.vp.sadd.sat.nxv8i7(<vscale x 8 x i7> %a, <vscale x 8 x i7> %vb, <vscale x 8 x i1> %mask, i32 %evl)
+  ret <vscale x 8 x i7> %v
+}
+
+declare <vscale x 1 x i8> @llvm.vp.sadd.sat.nxv1i8(<vscale x 1 x i8>, <vscale x 1 x i8>, <vscale x 1 x i1>, i32)
+
+define <vscale x 1 x i8> @vsadd_vv_nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i8> %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv1i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 1 x i8> @llvm.vp.sadd.sat.nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i8> %b, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i8> %v
+}
+
+define <vscale x 1 x i8> @vsadd_vv_nxv1i8_unmasked(<vscale x 1 x i8> %va, <vscale x 1 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv1i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i8> @llvm.vp.sadd.sat.nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i8> %b, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i8> %v
+}
+
+define <vscale x 1 x i8> @vsadd_vx_nxv1i8(<vscale x 1 x i8> %va, i8 %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv1i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT:    vsadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 1 x i8> %elt.head, <vscale x 1 x i8> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i8> @llvm.vp.sadd.sat.nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i8> %vb, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i8> %v
+}
+
+define <vscale x 1 x i8> @vsadd_vx_nxv1i8_commute(<vscale x 1 x i8> %va, i8 %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv1i8_commute:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT:    vsadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 1 x i8> %elt.head, <vscale x 1 x i8> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i8> @llvm.vp.sadd.sat.nxv1i8(<vscale x 1 x i8> %vb, <vscale x 1 x i8> %va, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i8> %v
+}
+
+define <vscale x 1 x i8> @vsadd_vx_nxv1i8_unmasked(<vscale x 1 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv1i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT:    vsadd.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 1 x i8> %elt.head, <vscale x 1 x i8> poison, <vscale x 1 x i32> zeroinitializer
+  %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i8> @llvm.vp.sadd.sat.nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i8> %vb, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i8> %v
+}
+
+define <vscale x 1 x i8> @vsadd_vi_nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv1i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <vscale x 1 x i8> %elt.head, <vscale x 1 x i8> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i8> @llvm.vp.sadd.sat.nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i8> %vb, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i8> %v
+}
+
+define <vscale x 1 x i8> @vsadd_vi_nxv1i8_unmasked(<vscale x 1 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv1i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <vscale x 1 x i8> %elt.head, <vscale x 1 x i8> poison, <vscale x 1 x i32> zeroinitializer
+  %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i8> @llvm.vp.sadd.sat.nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i8> %vb, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i8> %v
+}
+
+declare <vscale x 2 x i8> @llvm.vp.sadd.sat.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i8>, <vscale x 2 x i1>, i32)
+
+define <vscale x 2 x i8> @vsadd_vv_nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i8> %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv2i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 2 x i8> @llvm.vp.sadd.sat.nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i8> %b, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i8> %v
+}
+
+define <vscale x 2 x i8> @vsadd_vv_nxv2i8_unmasked(<vscale x 2 x i8> %va, <vscale x 2 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv2i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i8> @llvm.vp.sadd.sat.nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i8> %b, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i8> %v
+}
+
+define <vscale x 2 x i8> @vsadd_vx_nxv2i8(<vscale x 2 x i8> %va, i8 %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv2i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT:    vsadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 2 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 2 x i8> %elt.head, <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i8> @llvm.vp.sadd.sat.nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i8> %vb, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i8> %v
+}
+
+define <vscale x 2 x i8> @vsadd_vx_nxv2i8_unmasked(<vscale x 2 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv2i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT:    vsadd.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 2 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 2 x i8> %elt.head, <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer
+  %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i8> @llvm.vp.sadd.sat.nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i8> %vb, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i8> %v
+}
+
+define <vscale x 2 x i8> @vsadd_vi_nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv2i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 2 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <vscale x 2 x i8> %elt.head, <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i8> @llvm.vp.sadd.sat.nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i8> %vb, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i8> %v
+}
+
+define <vscale x 2 x i8> @vsadd_vi_nxv2i8_unmasked(<vscale x 2 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv2i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 2 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <vscale x 2 x i8> %elt.head, <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer
+  %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i8> @llvm.vp.sadd.sat.nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i8> %vb, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i8> %v
+}
+
+declare <vscale x 3 x i8> @llvm.vp.sadd.sat.nxv3i8(<vscale x 3 x i8>, <vscale x 3 x i8>, <vscale x 3 x i1>, i32)
+
+define <vscale x 3 x i8> @vsadd_vv_nxv3i8(<vscale x 3 x i8> %va, <vscale x 3 x i8> %b, <vscale x 3 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv3i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 3 x i8> @llvm.vp.sadd.sat.nxv3i8(<vscale x 3 x i8> %va, <vscale x 3 x i8> %b, <vscale x 3 x i1> %m, i32 %evl)
+  ret <vscale x 3 x i8> %v
+}
+
+define <vscale x 3 x i8> @vsadd_vv_nxv3i8_unmasked(<vscale x 3 x i8> %va, <vscale x 3 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv3i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 3 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 3 x i1> %head, <vscale x 3 x i1> poison, <vscale x 3 x i32> zeroinitializer
+  %v = call <vscale x 3 x i8> @llvm.vp.sadd.sat.nxv3i8(<vscale x 3 x i8> %va, <vscale x 3 x i8> %b, <vscale x 3 x i1> %m, i32 %evl)
+  ret <vscale x 3 x i8> %v
+}
+
+define <vscale x 3 x i8> @vsadd_vx_nxv3i8(<vscale x 3 x i8> %va, i8 %b, <vscale x 3 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv3i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vsadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 3 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 3 x i8> %elt.head, <vscale x 3 x i8> poison, <vscale x 3 x i32> zeroinitializer
+  %v = call <vscale x 3 x i8> @llvm.vp.sadd.sat.nxv3i8(<vscale x 3 x i8> %va, <vscale x 3 x i8> %vb, <vscale x 3 x i1> %m, i32 %evl)
+  ret <vscale x 3 x i8> %v
+}
+
+define <vscale x 3 x i8> @vsadd_vx_nxv3i8_unmasked(<vscale x 3 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv3i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vsadd.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 3 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 3 x i8> %elt.head, <vscale x 3 x i8> poison, <vscale x 3 x i32> zeroinitializer
+  %head = insertelement <vscale x 3 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 3 x i1> %head, <vscale x 3 x i1> poison, <vscale x 3 x i32> zeroinitializer
+  %v = call <vscale x 3 x i8> @llvm.vp.sadd.sat.nxv3i8(<vscale x 3 x i8> %va, <vscale x 3 x i8> %vb, <vscale x 3 x i1> %m, i32 %evl)
+  ret <vscale x 3 x i8> %v
+}
+
+define <vscale x 3 x i8> @vsadd_vi_nxv3i8(<vscale x 3 x i8> %va, <vscale x 3 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv3i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 3 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <vscale x 3 x i8> %elt.head, <vscale x 3 x i8> poison, <vscale x 3 x i32> zeroinitializer
+  %v = call <vscale x 3 x i8> @llvm.vp.sadd.sat.nxv3i8(<vscale x 3 x i8> %va, <vscale x 3 x i8> %vb, <vscale x 3 x i1> %m, i32 %evl)
+  ret <vscale x 3 x i8> %v
+}
+
+define <vscale x 3 x i8> @vsadd_vi_nxv3i8_unmasked(<vscale x 3 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv3i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 3 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <vscale x 3 x i8> %elt.head, <vscale x 3 x i8> poison, <vscale x 3 x i32> zeroinitializer
+  %head = insertelement <vscale x 3 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 3 x i1> %head, <vscale x 3 x i1> poison, <vscale x 3 x i32> zeroinitializer
+  %v = call <vscale x 3 x i8> @llvm.vp.sadd.sat.nxv3i8(<vscale x 3 x i8> %va, <vscale x 3 x i8> %vb, <vscale x 3 x i1> %m, i32 %evl)
+  ret <vscale x 3 x i8> %v
+}
+
+declare <vscale x 4 x i8> @llvm.vp.sadd.sat.nxv4i8(<vscale x 4 x i8>, <vscale x 4 x i8>, <vscale x 4 x i1>, i32)
+
+define <vscale x 4 x i8> @vsadd_vv_nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i8> %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv4i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 4 x i8> @llvm.vp.sadd.sat.nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i8> %b, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i8> %v
+}
+
+define <vscale x 4 x i8> @vsadd_vv_nxv4i8_unmasked(<vscale x 4 x i8> %va, <vscale x 4 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv4i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i8> @llvm.vp.sadd.sat.nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i8> %b, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i8> %v
+}
+
+define <vscale x 4 x i8> @vsadd_vx_nxv4i8(<vscale x 4 x i8> %va, i8 %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv4i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vsadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 4 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 4 x i8> %elt.head, <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i8> @llvm.vp.sadd.sat.nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i8> %vb, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i8> %v
+}
+
+define <vscale x 4 x i8> @vsadd_vx_nxv4i8_unmasked(<vscale x 4 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv4i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vsadd.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 4 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 4 x i8> %elt.head, <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer
+  %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i8> @llvm.vp.sadd.sat.nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i8> %vb, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i8> %v
+}
+
+define <vscale x 4 x i8> @vsadd_vi_nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv4i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 4 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <vscale x 4 x i8> %elt.head, <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i8> @llvm.vp.sadd.sat.nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i8> %vb, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i8> %v
+}
+
+define <vscale x 4 x i8> @vsadd_vi_nxv4i8_unmasked(<vscale x 4 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv4i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 4 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <vscale x 4 x i8> %elt.head, <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer
+  %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i8> @llvm.vp.sadd.sat.nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i8> %vb, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i8> %v
+}
+
+declare <vscale x 8 x i8> @llvm.vp.sadd.sat.nxv8i8(<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i1>, i32)
+
+define <vscale x 8 x i8> @vsadd_vv_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv8i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 8 x i8> @llvm.vp.sadd.sat.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %b, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i8> %v
+}
+
+define <vscale x 8 x i8> @vsadd_vv_nxv8i8_unmasked(<vscale x 8 x i8> %va, <vscale x 8 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv8i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i8> @llvm.vp.sadd.sat.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %b, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i8> %v
+}
+
+define <vscale x 8 x i8> @vsadd_vx_nxv8i8(<vscale x 8 x i8> %va, i8 %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv8i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vsadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 8 x i8> %elt.head, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i8> @llvm.vp.sadd.sat.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %vb, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i8> %v
+}
+
+define <vscale x 8 x i8> @vsadd_vx_nxv8i8_unmasked(<vscale x 8 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv8i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vsadd.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 8 x i8> %elt.head, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
+  %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i8> @llvm.vp.sadd.sat.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %vb, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i8> %v
+}
+
+define <vscale x 8 x i8> @vsadd_vi_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv8i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <vscale x 8 x i8> %elt.head, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i8> @llvm.vp.sadd.sat.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %vb, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i8> %v
+}
+
+define <vscale x 8 x i8> @vsadd_vi_nxv8i8_unmasked(<vscale x 8 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv8i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <vscale x 8 x i8> %elt.head, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
+  %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i8> @llvm.vp.sadd.sat.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %vb, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i8> %v
+}
+
+declare <vscale x 16 x i8> @llvm.vp.sadd.sat.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i1>, i32)
+
+define <vscale x 16 x i8> @vsadd_vv_nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i8> %b, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv16i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v10, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 16 x i8> @llvm.vp.sadd.sat.nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i8> %b, <vscale x 16 x i1> %m, i32 %evl)
+  ret <vscale x 16 x i8> %v
+}
+
+define <vscale x 16 x i8> @vsadd_vv_nxv16i8_unmasked(<vscale x 16 x i8> %va, <vscale x 16 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv16i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v10
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+  %v = call <vscale x 16 x i8> @llvm.vp.sadd.sat.nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i8> %b, <vscale x 16 x i1> %m, i32 %evl)
+  ret <vscale x 16 x i8> %v
+}
+
+define <vscale x 16 x i8> @vsadd_vx_nxv16i8(<vscale x 16 x i8> %va, i8 %b, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv16i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-NEXT:    vsadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 16 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 16 x i8> %elt.head, <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+  %v = call <vscale x 16 x i8> @llvm.vp.sadd.sat.nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i8> %vb, <vscale x 16 x i1> %m, i32 %evl)
+  ret <vscale x 16 x i8> %v
+}
+
+define <vscale x 16 x i8> @vsadd_vx_nxv16i8_unmasked(<vscale x 16 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv16i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-NEXT:    vsadd.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 16 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 16 x i8> %elt.head, <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+  %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+  %v = call <vscale x 16 x i8> @llvm.vp.sadd.sat.nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i8> %vb, <vscale x 16 x i1> %m, i32 %evl)
+  ret <vscale x 16 x i8> %v
+}
+
+define <vscale x 16 x i8> @vsadd_vi_nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv16i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 16 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <vscale x 16 x i8> %elt.head, <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+  %v = call <vscale x 16 x i8> @llvm.vp.sadd.sat.nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i8> %vb, <vscale x 16 x i1> %m, i32 %evl)
+  ret <vscale x 16 x i8> %v
+}
+
+define <vscale x 16 x i8> @vsadd_vi_nxv16i8_unmasked(<vscale x 16 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv16i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 16 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <vscale x 16 x i8> %elt.head, <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+  %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+  %v = call <vscale x 16 x i8> @llvm.vp.sadd.sat.nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i8> %vb, <vscale x 16 x i1> %m, i32 %evl)
+  ret <vscale x 16 x i8> %v
+}
+
+declare <vscale x 32 x i8> @llvm.vp.sadd.sat.nxv32i8(<vscale x 32 x i8>, <vscale x 32 x i8>, <vscale x 32 x i1>, i32)
+
+define <vscale x 32 x i8> @vsadd_vv_nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i8> %b, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv32i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v12, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 32 x i8> @llvm.vp.sadd.sat.nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i8> %b, <vscale x 32 x i1> %m, i32 %evl)
+  ret <vscale x 32 x i8> %v
+}
+
+define <vscale x 32 x i8> @vsadd_vv_nxv32i8_unmasked(<vscale x 32 x i8> %va, <vscale x 32 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv32i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v12
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 32 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 32 x i1> %head, <vscale x 32 x i1> poison, <vscale x 32 x i32> zeroinitializer
+  %v = call <vscale x 32 x i8> @llvm.vp.sadd.sat.nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i8> %b, <vscale x 32 x i1> %m, i32 %evl)
+  ret <vscale x 32 x i8> %v
+}
+
+define <vscale x 32 x i8> @vsadd_vx_nxv32i8(<vscale x 32 x i8> %va, i8 %b, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv32i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
+; CHECK-NEXT:    vsadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 32 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 32 x i8> %elt.head, <vscale x 32 x i8> poison, <vscale x 32 x i32> zeroinitializer
+  %v = call <vscale x 32 x i8> @llvm.vp.sadd.sat.nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i8> %vb, <vscale x 32 x i1> %m, i32 %evl)
+  ret <vscale x 32 x i8> %v
+}
+
+define <vscale x 32 x i8> @vsadd_vx_nxv32i8_unmasked(<vscale x 32 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv32i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
+; CHECK-NEXT:    vsadd.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 32 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 32 x i8> %elt.head, <vscale x 32 x i8> poison, <vscale x 32 x i32> zeroinitializer
+  %head = insertelement <vscale x 32 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 32 x i1> %head, <vscale x 32 x i1> poison, <vscale x 32 x i32> zeroinitializer
+  %v = call <vscale x 32 x i8> @llvm.vp.sadd.sat.nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i8> %vb, <vscale x 32 x i1> %m, i32 %evl)
+  ret <vscale x 32 x i8> %v
+}
+
+define <vscale x 32 x i8> @vsadd_vi_nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv32i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 32 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <vscale x 32 x i8> %elt.head, <vscale x 32 x i8> poison, <vscale x 32 x i32> zeroinitializer
+  %v = call <vscale x 32 x i8> @llvm.vp.sadd.sat.nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i8> %vb, <vscale x 32 x i1> %m, i32 %evl)
+  ret <vscale x 32 x i8> %v
+}
+
+define <vscale x 32 x i8> @vsadd_vi_nxv32i8_unmasked(<vscale x 32 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv32i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 32 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <vscale x 32 x i8> %elt.head, <vscale x 32 x i8> poison, <vscale x 32 x i32> zeroinitializer
+  %head = insertelement <vscale x 32 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 32 x i1> %head, <vscale x 32 x i1> poison, <vscale x 32 x i32> zeroinitializer
+  %v = call <vscale x 32 x i8> @llvm.vp.sadd.sat.nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i8> %vb, <vscale x 32 x i1> %m, i32 %evl)
+  ret <vscale x 32 x i8> %v
+}
+
+declare <vscale x 64 x i8> @llvm.vp.sadd.sat.nxv64i8(<vscale x 64 x i8>, <vscale x 64 x i8>, <vscale x 64 x i1>, i32)
+
+define <vscale x 64 x i8> @vsadd_vv_nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i8> %b, <vscale x 64 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv64i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 64 x i8> @llvm.vp.sadd.sat.nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i8> %b, <vscale x 64 x i1> %m, i32 %evl)
+  ret <vscale x 64 x i8> %v
+}
+
+define <vscale x 64 x i8> @vsadd_vv_nxv64i8_unmasked(<vscale x 64 x i8> %va, <vscale x 64 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv64i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v16
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 64 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 64 x i1> %head, <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer
+  %v = call <vscale x 64 x i8> @llvm.vp.sadd.sat.nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i8> %b, <vscale x 64 x i1> %m, i32 %evl)
+  ret <vscale x 64 x i8> %v
+}
+
+define <vscale x 64 x i8> @vsadd_vx_nxv64i8(<vscale x 64 x i8> %va, i8 %b, <vscale x 64 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv64i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT:    vsadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 64 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 64 x i8> %elt.head, <vscale x 64 x i8> poison, <vscale x 64 x i32> zeroinitializer
+  %v = call <vscale x 64 x i8> @llvm.vp.sadd.sat.nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i8> %vb, <vscale x 64 x i1> %m, i32 %evl)
+  ret <vscale x 64 x i8> %v
+}
+
+define <vscale x 64 x i8> @vsadd_vx_nxv64i8_unmasked(<vscale x 64 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv64i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT:    vsadd.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 64 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 64 x i8> %elt.head, <vscale x 64 x i8> poison, <vscale x 64 x i32> zeroinitializer
+  %head = insertelement <vscale x 64 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 64 x i1> %head, <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer
+  %v = call <vscale x 64 x i8> @llvm.vp.sadd.sat.nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i8> %vb, <vscale x 64 x i1> %m, i32 %evl)
+  ret <vscale x 64 x i8> %v
+}
+
+define <vscale x 64 x i8> @vsadd_vi_nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv64i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 64 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <vscale x 64 x i8> %elt.head, <vscale x 64 x i8> poison, <vscale x 64 x i32> zeroinitializer
+  %v = call <vscale x 64 x i8> @llvm.vp.sadd.sat.nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i8> %vb, <vscale x 64 x i1> %m, i32 %evl)
+  ret <vscale x 64 x i8> %v
+}
+
+define <vscale x 64 x i8> @vsadd_vi_nxv64i8_unmasked(<vscale x 64 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv64i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 64 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <vscale x 64 x i8> %elt.head, <vscale x 64 x i8> poison, <vscale x 64 x i32> zeroinitializer
+  %head = insertelement <vscale x 64 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 64 x i1> %head, <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer
+  %v = call <vscale x 64 x i8> @llvm.vp.sadd.sat.nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i8> %vb, <vscale x 64 x i1> %m, i32 %evl)
+  ret <vscale x 64 x i8> %v
+}
+
+; Test that split-legalization works when the mask itself needs splitting.
+
+declare <vscale x 128 x i8> @llvm.vp.sadd.sat.nxv128i8(<vscale x 128 x i8>, <vscale x 128 x i8>, <vscale x 128 x i1>, i32)
+
+define <vscale x 128 x i8> @vsadd_vi_nxv128i8(<vscale x 128 x i8> %va, <vscale x 128 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv128i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v24, v0
+; CHECK-NEXT:    vsetvli a2, zero, e8, m8, ta, ma
+; CHECK-NEXT:    vlm.v v0, (a0)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    sub a2, a1, a0
+; CHECK-NEXT:    sltu a3, a1, a2
+; CHECK-NEXT:    addi a3, a3, -1
+; CHECK-NEXT:    and a2, a3, a2
+; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
+; CHECK-NEXT:    vsadd.vi v16, v16, -1, v0.t
+; CHECK-NEXT:    bltu a1, a0, .LBB50_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    mv a1, a0
+; CHECK-NEXT:  .LBB50_2:
+; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 128 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <vscale x 128 x i8> %elt.head, <vscale x 128 x i8> poison, <vscale x 128 x i32> zeroinitializer
+  %v = call <vscale x 128 x i8> @llvm.vp.sadd.sat.nxv128i8(<vscale x 128 x i8> %va, <vscale x 128 x i8> %vb, <vscale x 128 x i1> %m, i32 %evl)
+  ret <vscale x 128 x i8> %v
+}
+
+define <vscale x 128 x i8> @vsadd_vi_nxv128i8_unmasked(<vscale x 128 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv128i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    sub a2, a0, a1
+; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    addi a3, a3, -1
+; CHECK-NEXT:    and a2, a3, a2
+; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
+; CHECK-NEXT:    vsadd.vi v16, v16, -1
+; CHECK-NEXT:    bltu a0, a1, .LBB51_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    mv a0, a1
+; CHECK-NEXT:  .LBB51_2:
+; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 128 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <vscale x 128 x i8> %elt.head, <vscale x 128 x i8> poison, <vscale x 128 x i32> zeroinitializer
+  %head = insertelement <vscale x 128 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 128 x i1> %head, <vscale x 128 x i1> poison, <vscale x 128 x i32> zeroinitializer
+  %v = call <vscale x 128 x i8> @llvm.vp.sadd.sat.nxv128i8(<vscale x 128 x i8> %va, <vscale x 128 x i8> %vb, <vscale x 128 x i1> %m, i32 %evl)
+  ret <vscale x 128 x i8> %v
+}
+
+declare <vscale x 1 x i16> @llvm.vp.sadd.sat.nxv1i16(<vscale x 1 x i16>, <vscale x 1 x i16>, <vscale x 1 x i1>, i32)
+
+define <vscale x 1 x i16> @vsadd_vv_nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i16> %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv1i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 1 x i16> @llvm.vp.sadd.sat.nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i16> %b, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i16> %v
+}
+
+define <vscale x 1 x i16> @vsadd_vv_nxv1i16_unmasked(<vscale x 1 x i16> %va, <vscale x 1 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv1i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i16> @llvm.vp.sadd.sat.nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i16> %b, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i16> %v
+}
+
+define <vscale x 1 x i16> @vsadd_vx_nxv1i16(<vscale x 1 x i16> %va, i16 %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv1i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT:    vsadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 1 x i16> %elt.head, <vscale x 1 x i16> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i16> @llvm.vp.sadd.sat.nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i16> %vb, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i16> %v
+}
+
+define <vscale x 1 x i16> @vsadd_vx_nxv1i16_unmasked(<vscale x 1 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv1i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT:    vsadd.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 1 x i16> %elt.head, <vscale x 1 x i16> poison, <vscale x 1 x i32> zeroinitializer
+  %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i16> @llvm.vp.sadd.sat.nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i16> %vb, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i16> %v
+}
+
+define <vscale x 1 x i16> @vsadd_vi_nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv1i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <vscale x 1 x i16> %elt.head, <vscale x 1 x i16> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i16> @llvm.vp.sadd.sat.nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i16> %vb, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i16> %v
+}
+
+define <vscale x 1 x i16> @vsadd_vi_nxv1i16_unmasked(<vscale x 1 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv1i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <vscale x 1 x i16> %elt.head, <vscale x 1 x i16> poison, <vscale x 1 x i32> zeroinitializer
+  %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i16> @llvm.vp.sadd.sat.nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i16> %vb, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i16> %v
+}
+
+declare <vscale x 2 x i16> @llvm.vp.sadd.sat.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i16>, <vscale x 2 x i1>, i32)
+
+define <vscale x 2 x i16> @vsadd_vv_nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i16> %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv2i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 2 x i16> @llvm.vp.sadd.sat.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i16> %b, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i16> %v
+}
+
+define <vscale x 2 x i16> @vsadd_vv_nxv2i16_unmasked(<vscale x 2 x i16> %va, <vscale x 2 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv2i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i16> @llvm.vp.sadd.sat.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i16> %b, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i16> %v
+}
+
+define <vscale x 2 x i16> @vsadd_vx_nxv2i16(<vscale x 2 x i16> %va, i16 %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv2i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT:    vsadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 2 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 2 x i16> %elt.head, <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i16> @llvm.vp.sadd.sat.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i16> %vb, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i16> %v
+}
+
+define <vscale x 2 x i16> @vsadd_vx_nxv2i16_unmasked(<vscale x 2 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv2i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT:    vsadd.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 2 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 2 x i16> %elt.head, <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
+  %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i16> @llvm.vp.sadd.sat.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i16> %vb, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i16> %v
+}
+
+define <vscale x 2 x i16> @vsadd_vi_nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv2i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 2 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <vscale x 2 x i16> %elt.head, <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i16> @llvm.vp.sadd.sat.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i16> %vb, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i16> %v
+}
+
+define <vscale x 2 x i16> @vsadd_vi_nxv2i16_unmasked(<vscale x 2 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv2i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 2 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <vscale x 2 x i16> %elt.head, <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
+  %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i16> @llvm.vp.sadd.sat.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i16> %vb, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i16> %v
+}
+
+declare <vscale x 4 x i16> @llvm.vp.sadd.sat.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i16>, <vscale x 4 x i1>, i32)
+
+define <vscale x 4 x i16> @vsadd_vv_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv4i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 4 x i16> @llvm.vp.sadd.sat.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %b, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i16> %v
+}
+
+define <vscale x 4 x i16> @vsadd_vv_nxv4i16_unmasked(<vscale x 4 x i16> %va, <vscale x 4 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv4i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i16> @llvm.vp.sadd.sat.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %b, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i16> %v
+}
+
+define <vscale x 4 x i16> @vsadd_vx_nxv4i16(<vscale x 4 x i16> %va, i16 %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv4i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vsadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 4 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 4 x i16> %elt.head, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i16> @llvm.vp.sadd.sat.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %vb, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i16> %v
+}
+
+define <vscale x 4 x i16> @vsadd_vx_nxv4i16_unmasked(<vscale x 4 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv4i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vsadd.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 4 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 4 x i16> %elt.head, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
+  %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i16> @llvm.vp.sadd.sat.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %vb, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i16> %v
+}
+
+define <vscale x 4 x i16> @vsadd_vi_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv4i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 4 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <vscale x 4 x i16> %elt.head, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i16> @llvm.vp.sadd.sat.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %vb, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i16> %v
+}
+
+define <vscale x 4 x i16> @vsadd_vi_nxv4i16_unmasked(<vscale x 4 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv4i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 4 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <vscale x 4 x i16> %elt.head, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
+  %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i16> @llvm.vp.sadd.sat.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %vb, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i16> %v
+}
+
+declare <vscale x 8 x i16> @llvm.vp.sadd.sat.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i1>, i32)
+
+define <vscale x 8 x i16> @vsadd_vv_nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i16> %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv8i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v10, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 8 x i16> @llvm.vp.sadd.sat.nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i16> %b, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i16> %v
+}
+
+define <vscale x 8 x i16> @vsadd_vv_nxv8i16_unmasked(<vscale x 8 x i16> %va, <vscale x 8 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv8i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v10
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i16> @llvm.vp.sadd.sat.nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i16> %b, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i16> %v
+}
+
+define <vscale x 8 x i16> @vsadd_vx_nxv8i16(<vscale x 8 x i16> %va, i16 %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv8i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT:    vsadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 8 x i16> %elt.head, <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i16> @llvm.vp.sadd.sat.nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i16> %vb, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i16> %v
+}
+
+define <vscale x 8 x i16> @vsadd_vx_nxv8i16_unmasked(<vscale x 8 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv8i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT:    vsadd.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 8 x i16> %elt.head, <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
+  %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i16> @llvm.vp.sadd.sat.nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i16> %vb, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i16> %v
+}
+
+define <vscale x 8 x i16> @vsadd_vi_nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv8i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <vscale x 8 x i16> %elt.head, <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i16> @llvm.vp.sadd.sat.nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i16> %vb, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i16> %v
+}
+
+define <vscale x 8 x i16> @vsadd_vi_nxv8i16_unmasked(<vscale x 8 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv8i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <vscale x 8 x i16> %elt.head, <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
+  %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i16> @llvm.vp.sadd.sat.nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i16> %vb, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i16> %v
+}
+
+declare <vscale x 16 x i16> @llvm.vp.sadd.sat.nxv16i16(<vscale x 16 x i16>, <vscale x 16 x i16>, <vscale x 16 x i1>, i32)
+
+define <vscale x 16 x i16> @vsadd_vv_nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i16> %b, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv16i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v12, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 16 x i16> @llvm.vp.sadd.sat.nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i16> %b, <vscale x 16 x i1> %m, i32 %evl)
+  ret <vscale x 16 x i16> %v
+}
+
+define <vscale x 16 x i16> @vsadd_vv_nxv16i16_unmasked(<vscale x 16 x i16> %va, <vscale x 16 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv16i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v12
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+  %v = call <vscale x 16 x i16> @llvm.vp.sadd.sat.nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i16> %b, <vscale x 16 x i1> %m, i32 %evl)
+  ret <vscale x 16 x i16> %v
+}
+
+define <vscale x 16 x i16> @vsadd_vx_nxv16i16(<vscale x 16 x i16> %va, i16 %b, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv16i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT:    vsadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 16 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 16 x i16> %elt.head, <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer
+  %v = call <vscale x 16 x i16> @llvm.vp.sadd.sat.nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i16> %vb, <vscale x 16 x i1> %m, i32 %evl)
+  ret <vscale x 16 x i16> %v
+}
+
+define <vscale x 16 x i16> @vsadd_vx_nxv16i16_unmasked(<vscale x 16 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv16i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT:    vsadd.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 16 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 16 x i16> %elt.head, <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer
+  %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+  %v = call <vscale x 16 x i16> @llvm.vp.sadd.sat.nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i16> %vb, <vscale x 16 x i1> %m, i32 %evl)
+  ret <vscale x 16 x i16> %v
+}
+
+define <vscale x 16 x i16> @vsadd_vi_nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv16i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 16 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <vscale x 16 x i16> %elt.head, <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer
+  %v = call <vscale x 16 x i16> @llvm.vp.sadd.sat.nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i16> %vb, <vscale x 16 x i1> %m, i32 %evl)
+  ret <vscale x 16 x i16> %v
+}
+
+define <vscale x 16 x i16> @vsadd_vi_nxv16i16_unmasked(<vscale x 16 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv16i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 16 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <vscale x 16 x i16> %elt.head, <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer
+  %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+  %v = call <vscale x 16 x i16> @llvm.vp.sadd.sat.nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i16> %vb, <vscale x 16 x i1> %m, i32 %evl)
+  ret <vscale x 16 x i16> %v
+}
+
+declare <vscale x 32 x i16> @llvm.vp.sadd.sat.nxv32i16(<vscale x 32 x i16>, <vscale x 32 x i16>, <vscale x 32 x i1>, i32)
+
+define <vscale x 32 x i16> @vsadd_vv_nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i16> %b, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv32i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 32 x i16> @llvm.vp.sadd.sat.nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i16> %b, <vscale x 32 x i1> %m, i32 %evl)
+  ret <vscale x 32 x i16> %v
+}
+
+define <vscale x 32 x i16> @vsadd_vv_nxv32i16_unmasked(<vscale x 32 x i16> %va, <vscale x 32 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv32i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v16
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 32 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 32 x i1> %head, <vscale x 32 x i1> poison, <vscale x 32 x i32> zeroinitializer
+  %v = call <vscale x 32 x i16> @llvm.vp.sadd.sat.nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i16> %b, <vscale x 32 x i1> %m, i32 %evl)
+  ret <vscale x 32 x i16> %v
+}
+
+define <vscale x 32 x i16> @vsadd_vx_nxv32i16(<vscale x 32 x i16> %va, i16 %b, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv32i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT:    vsadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 32 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 32 x i16> %elt.head, <vscale x 32 x i16> poison, <vscale x 32 x i32> zeroinitializer
+  %v = call <vscale x 32 x i16> @llvm.vp.sadd.sat.nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i16> %vb, <vscale x 32 x i1> %m, i32 %evl)
+  ret <vscale x 32 x i16> %v
+}
+
+define <vscale x 32 x i16> @vsadd_vx_nxv32i16_unmasked(<vscale x 32 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv32i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT:    vsadd.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 32 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 32 x i16> %elt.head, <vscale x 32 x i16> poison, <vscale x 32 x i32> zeroinitializer
+  %head = insertelement <vscale x 32 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 32 x i1> %head, <vscale x 32 x i1> poison, <vscale x 32 x i32> zeroinitializer
+  %v = call <vscale x 32 x i16> @llvm.vp.sadd.sat.nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i16> %vb, <vscale x 32 x i1> %m, i32 %evl)
+  ret <vscale x 32 x i16> %v
+}
+
+define <vscale x 32 x i16> @vsadd_vi_nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv32i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 32 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <vscale x 32 x i16> %elt.head, <vscale x 32 x i16> poison, <vscale x 32 x i32> zeroinitializer
+  %v = call <vscale x 32 x i16> @llvm.vp.sadd.sat.nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i16> %vb, <vscale x 32 x i1> %m, i32 %evl)
+  ret <vscale x 32 x i16> %v
+}
+
+define <vscale x 32 x i16> @vsadd_vi_nxv32i16_unmasked(<vscale x 32 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv32i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 32 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <vscale x 32 x i16> %elt.head, <vscale x 32 x i16> poison, <vscale x 32 x i32> zeroinitializer
+  %head = insertelement <vscale x 32 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 32 x i1> %head, <vscale x 32 x i1> poison, <vscale x 32 x i32> zeroinitializer
+  %v = call <vscale x 32 x i16> @llvm.vp.sadd.sat.nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i16> %vb, <vscale x 32 x i1> %m, i32 %evl)
+  ret <vscale x 32 x i16> %v
+}
+
+declare <vscale x 1 x i32> @llvm.vp.sadd.sat.nxv1i32(<vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i1>, i32)
+
+define <vscale x 1 x i32> @vsadd_vv_nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i32> %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv1i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 1 x i32> @llvm.vp.sadd.sat.nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i32> %b, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i32> %v
+}
+
+define <vscale x 1 x i32> @vsadd_vv_nxv1i32_unmasked(<vscale x 1 x i32> %va, <vscale x 1 x i32> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv1i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i32> @llvm.vp.sadd.sat.nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i32> %b, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i32> %v
+}
+
+define <vscale x 1 x i32> @vsadd_vx_nxv1i32(<vscale x 1 x i32> %va, i32 %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv1i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT:    vsadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <vscale x 1 x i32> %elt.head, <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i32> @llvm.vp.sadd.sat.nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i32> %vb, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i32> %v
+}
+
+define <vscale x 1 x i32> @vsadd_vx_nxv1i32_unmasked(<vscale x 1 x i32> %va, i32 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv1i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT:    vsadd.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <vscale x 1 x i32> %elt.head, <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer
+  %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i32> @llvm.vp.sadd.sat.nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i32> %vb, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i32> %v
+}
+
+define <vscale x 1 x i32> @vsadd_vi_nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv1i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <vscale x 1 x i32> %elt.head, <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i32> @llvm.vp.sadd.sat.nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i32> %vb, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i32> %v
+}
+
+define <vscale x 1 x i32> @vsadd_vi_nxv1i32_unmasked(<vscale x 1 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv1i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <vscale x 1 x i32> %elt.head, <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer
+  %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i32> @llvm.vp.sadd.sat.nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i32> %vb, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i32> %v
+}
+
+declare <vscale x 2 x i32> @llvm.vp.sadd.sat.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i1>, i32)
+
+define <vscale x 2 x i32> @vsadd_vv_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv2i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 2 x i32> @llvm.vp.sadd.sat.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %b, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 2 x i32> @vsadd_vv_nxv2i32_unmasked(<vscale x 2 x i32> %va, <vscale x 2 x i32> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv2i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i32> @llvm.vp.sadd.sat.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %b, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 2 x i32> @vsadd_vx_nxv2i32(<vscale x 2 x i32> %va, i32 %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv2i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT:    vsadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 2 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <vscale x 2 x i32> %elt.head, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i32> @llvm.vp.sadd.sat.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %vb, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 2 x i32> @vsadd_vx_nxv2i32_unmasked(<vscale x 2 x i32> %va, i32 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv2i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT:    vsadd.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 2 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <vscale x 2 x i32> %elt.head, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+  %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i32> @llvm.vp.sadd.sat.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %vb, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 2 x i32> @vsadd_vi_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv2i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 2 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <vscale x 2 x i32> %elt.head, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i32> @llvm.vp.sadd.sat.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %vb, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 2 x i32> @vsadd_vi_nxv2i32_unmasked(<vscale x 2 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv2i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 2 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <vscale x 2 x i32> %elt.head, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+  %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i32> @llvm.vp.sadd.sat.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %vb, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i32> %v
+}
+
+declare <vscale x 4 x i32> @llvm.vp.sadd.sat.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, i32)
+
+define <vscale x 4 x i32> @vsadd_vv_nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i32> %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v10, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 4 x i32> @llvm.vp.sadd.sat.nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i32> %b, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i32> %v
+}
+
+define <vscale x 4 x i32> @vsadd_vv_nxv4i32_unmasked(<vscale x 4 x i32> %va, <vscale x 4 x i32> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv4i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v10
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i32> @llvm.vp.sadd.sat.nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i32> %b, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i32> %v
+}
+
+define <vscale x 4 x i32> @vsadd_vx_nxv4i32(<vscale x 4 x i32> %va, i32 %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT:    vsadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 4 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <vscale x 4 x i32> %elt.head, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i32> @llvm.vp.sadd.sat.nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i32> %vb, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i32> %v
+}
+
+define <vscale x 4 x i32> @vsadd_vx_nxv4i32_unmasked(<vscale x 4 x i32> %va, i32 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv4i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT:    vsadd.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 4 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <vscale x 4 x i32> %elt.head, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+  %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i32> @llvm.vp.sadd.sat.nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i32> %vb, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i32> %v
+}
+
+define <vscale x 4 x i32> @vsadd_vi_nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 4 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <vscale x 4 x i32> %elt.head, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i32> @llvm.vp.sadd.sat.nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i32> %vb, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i32> %v
+}
+
+define <vscale x 4 x i32> @vsadd_vi_nxv4i32_unmasked(<vscale x 4 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv4i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 4 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <vscale x 4 x i32> %elt.head, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+  %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i32> @llvm.vp.sadd.sat.nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i32> %vb, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i32> %v
+}
+
+declare <vscale x 8 x i32> @llvm.vp.sadd.sat.nxv8i32(<vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i1>, i32)
+
+define <vscale x 8 x i32> @vsadd_vv_nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i32> %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv8i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v12, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 8 x i32> @llvm.vp.sadd.sat.nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i32> %b, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i32> %v
+}
+
+define <vscale x 8 x i32> @vsadd_vv_nxv8i32_unmasked(<vscale x 8 x i32> %va, <vscale x 8 x i32> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv8i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v12
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i32> @llvm.vp.sadd.sat.nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i32> %b, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i32> %v
+}
+
+define <vscale x 8 x i32> @vsadd_vx_nxv8i32(<vscale x 8 x i32> %va, i32 %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv8i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT:    vsadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <vscale x 8 x i32> %elt.head, <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i32> @llvm.vp.sadd.sat.nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i32> %vb, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i32> %v
+}
+
+define <vscale x 8 x i32> @vsadd_vx_nxv8i32_unmasked(<vscale x 8 x i32> %va, i32 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv8i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT:    vsadd.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <vscale x 8 x i32> %elt.head, <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
+  %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i32> @llvm.vp.sadd.sat.nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i32> %vb, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i32> %v
+}
+
+define <vscale x 8 x i32> @vsadd_vi_nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv8i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <vscale x 8 x i32> %elt.head, <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i32> @llvm.vp.sadd.sat.nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i32> %vb, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i32> %v
+}
+
+define <vscale x 8 x i32> @vsadd_vi_nxv8i32_unmasked(<vscale x 8 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv8i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <vscale x 8 x i32> %elt.head, <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
+  %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i32> @llvm.vp.sadd.sat.nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i32> %vb, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i32> %v
+}
+
+declare <vscale x 16 x i32> @llvm.vp.sadd.sat.nxv16i32(<vscale x 16 x i32>, <vscale x 16 x i32>, <vscale x 16 x i1>, i32)
+
+define <vscale x 16 x i32> @vsadd_vv_nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i32> %b, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv16i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 16 x i32> @llvm.vp.sadd.sat.nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i32> %b, <vscale x 16 x i1> %m, i32 %evl)
+  ret <vscale x 16 x i32> %v
+}
+
+define <vscale x 16 x i32> @vsadd_vv_nxv16i32_unmasked(<vscale x 16 x i32> %va, <vscale x 16 x i32> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv16i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v16
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+  %v = call <vscale x 16 x i32> @llvm.vp.sadd.sat.nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i32> %b, <vscale x 16 x i1> %m, i32 %evl)
+  ret <vscale x 16 x i32> %v
+}
+
+define <vscale x 16 x i32> @vsadd_vx_nxv16i32(<vscale x 16 x i32> %va, i32 %b, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv16i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT:    vsadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 16 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <vscale x 16 x i32> %elt.head, <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+  %v = call <vscale x 16 x i32> @llvm.vp.sadd.sat.nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i32> %vb, <vscale x 16 x i1> %m, i32 %evl)
+  ret <vscale x 16 x i32> %v
+}
+
+define <vscale x 16 x i32> @vsadd_vx_nxv16i32_unmasked(<vscale x 16 x i32> %va, i32 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv16i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT:    vsadd.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 16 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <vscale x 16 x i32> %elt.head, <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+  %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+  %v = call <vscale x 16 x i32> @llvm.vp.sadd.sat.nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i32> %vb, <vscale x 16 x i1> %m, i32 %evl)
+  ret <vscale x 16 x i32> %v
+}
+
+define <vscale x 16 x i32> @vsadd_vi_nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv16i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 16 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <vscale x 16 x i32> %elt.head, <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+  %v = call <vscale x 16 x i32> @llvm.vp.sadd.sat.nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i32> %vb, <vscale x 16 x i1> %m, i32 %evl)
+  ret <vscale x 16 x i32> %v
+}
+
+define <vscale x 16 x i32> @vsadd_vi_nxv16i32_unmasked(<vscale x 16 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv16i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 16 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <vscale x 16 x i32> %elt.head, <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+  %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+  %v = call <vscale x 16 x i32> @llvm.vp.sadd.sat.nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i32> %vb, <vscale x 16 x i1> %m, i32 %evl)
+  ret <vscale x 16 x i32> %v
+}
+
+; Test that split-legalization works then the mask needs manual splitting.
+
+declare <vscale x 32 x i32> @llvm.vp.sadd.sat.nxv32i32(<vscale x 32 x i32>, <vscale x 32 x i32>, <vscale x 32 x i1>, i32)
+
+define <vscale x 32 x i32> @vsadd_vi_nxv32i32(<vscale x 32 x i32> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv32i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v24, v0
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    srli a2, a1, 2
+; CHECK-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vslidedown.vx v0, v0, a2
+; CHECK-NEXT:    slli a1, a1, 1
+; CHECK-NEXT:    sub a2, a0, a1
+; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    addi a3, a3, -1
+; CHECK-NEXT:    and a2, a3, a2
+; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; CHECK-NEXT:    vsadd.vi v16, v16, -1, v0.t
+; CHECK-NEXT:    bltu a0, a1, .LBB118_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    mv a0, a1
+; CHECK-NEXT:  .LBB118_2:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 32 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <vscale x 32 x i32> %elt.head, <vscale x 32 x i32> poison, <vscale x 32 x i32> zeroinitializer
+  %v = call <vscale x 32 x i32> @llvm.vp.sadd.sat.nxv32i32(<vscale x 32 x i32> %va, <vscale x 32 x i32> %vb, <vscale x 32 x i1> %m, i32 %evl)
+  ret <vscale x 32 x i32> %v
+}
+
+define <vscale x 32 x i32> @vsadd_vi_nxv32i32_unmasked(<vscale x 32 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv32i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 1
+; CHECK-NEXT:    sub a2, a0, a1
+; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    addi a3, a3, -1
+; CHECK-NEXT:    and a2, a3, a2
+; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; CHECK-NEXT:    vsadd.vi v16, v16, -1
+; CHECK-NEXT:    bltu a0, a1, .LBB119_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    mv a0, a1
+; CHECK-NEXT:  .LBB119_2:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 32 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <vscale x 32 x i32> %elt.head, <vscale x 32 x i32> poison, <vscale x 32 x i32> zeroinitializer
+  %head = insertelement <vscale x 32 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 32 x i1> %head, <vscale x 32 x i1> poison, <vscale x 32 x i32> zeroinitializer
+  %v = call <vscale x 32 x i32> @llvm.vp.sadd.sat.nxv32i32(<vscale x 32 x i32> %va, <vscale x 32 x i32> %vb, <vscale x 32 x i1> %m, i32 %evl)
+  ret <vscale x 32 x i32> %v
+}
+
+declare <vscale x 1 x i64> @llvm.vp.sadd.sat.nxv1i64(<vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i1>, i32)
+
+define <vscale x 1 x i64> @vsadd_vv_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv1i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 1 x i64> @llvm.vp.sadd.sat.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %b, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i64> %v
+}
+
+define <vscale x 1 x i64> @vsadd_vv_nxv1i64_unmasked(<vscale x 1 x i64> %va, <vscale x 1 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv1i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i64> @llvm.vp.sadd.sat.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %b, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i64> %v
+}
+
+define <vscale x 1 x i64> @vsadd_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vsadd_vx_nxv1i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
+; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vsadd.vv v8, v8, v9, v0.t
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vsadd_vx_nxv1i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; RV64-NEXT:    vsadd.vx v8, v8, a0, v0.t
+; RV64-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i64> poison, i64 %b, i32 0
+  %vb = shufflevector <vscale x 1 x i64> %elt.head, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i64> @llvm.vp.sadd.sat.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %vb, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i64> %v
+}
+
+define <vscale x 1 x i64> @vsadd_vx_nxv1i64_unmasked(<vscale x 1 x i64> %va, i64 %b, i32 zeroext %evl) {
+; RV32-LABEL: vsadd_vx_nxv1i64_unmasked:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
+; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vsadd.vv v8, v8, v9
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vsadd_vx_nxv1i64_unmasked:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; RV64-NEXT:    vsadd.vx v8, v8, a0
+; RV64-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i64> poison, i64 %b, i32 0
+  %vb = shufflevector <vscale x 1 x i64> %elt.head, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+  %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i64> @llvm.vp.sadd.sat.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %vb, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i64> %v
+}
+
+define <vscale x 1 x i64> @vsadd_vi_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv1i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <vscale x 1 x i64> %elt.head, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i64> @llvm.vp.sadd.sat.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %vb, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i64> %v
+}
+
+define <vscale x 1 x i64> @vsadd_vi_nxv1i64_unmasked(<vscale x 1 x i64> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv1i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <vscale x 1 x i64> %elt.head, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+  %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i64> @llvm.vp.sadd.sat.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %vb, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i64> %v
+}
+
+declare <vscale x 2 x i64> @llvm.vp.sadd.sat.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i1>, i32)
+
+define <vscale x 2 x i64> @vsadd_vv_nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i64> %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv2i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v10, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 2 x i64> @llvm.vp.sadd.sat.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i64> %b, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i64> %v
+}
+
+define <vscale x 2 x i64> @vsadd_vv_nxv2i64_unmasked(<vscale x 2 x i64> %va, <vscale x 2 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv2i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v10
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i64> @llvm.vp.sadd.sat.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i64> %b, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i64> %v
+}
+
+define <vscale x 2 x i64> @vsadd_vx_nxv2i64(<vscale x 2 x i64> %va, i64 %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vsadd_vx_nxv2i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
+; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vsadd.vv v8, v8, v10, v0.t
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vsadd_vx_nxv2i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; RV64-NEXT:    vsadd.vx v8, v8, a0, v0.t
+; RV64-NEXT:    ret
+  %elt.head = insertelement <vscale x 2 x i64> poison, i64 %b, i32 0
+  %vb = shufflevector <vscale x 2 x i64> %elt.head, <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i64> @llvm.vp.sadd.sat.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i64> %vb, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i64> %v
+}
+
+define <vscale x 2 x i64> @vsadd_vx_nxv2i64_unmasked(<vscale x 2 x i64> %va, i64 %b, i32 zeroext %evl) {
+; RV32-LABEL: vsadd_vx_nxv2i64_unmasked:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
+; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vsadd.vv v8, v8, v10
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vsadd_vx_nxv2i64_unmasked:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; RV64-NEXT:    vsadd.vx v8, v8, a0
+; RV64-NEXT:    ret
+  %elt.head = insertelement <vscale x 2 x i64> poison, i64 %b, i32 0
+  %vb = shufflevector <vscale x 2 x i64> %elt.head, <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+  %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i64> @llvm.vp.sadd.sat.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i64> %vb, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i64> %v
+}
+
+define <vscale x 2 x i64> @vsadd_vi_nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv2i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 2 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <vscale x 2 x i64> %elt.head, <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i64> @llvm.vp.sadd.sat.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i64> %vb, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i64> %v
+}
+
+define <vscale x 2 x i64> @vsadd_vi_nxv2i64_unmasked(<vscale x 2 x i64> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv2i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 2 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <vscale x 2 x i64> %elt.head, <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+  %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i64> @llvm.vp.sadd.sat.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i64> %vb, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i64> %v
+}
+
+declare <vscale x 4 x i64> @llvm.vp.sadd.sat.nxv4i64(<vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i1>, i32)
+
+define <vscale x 4 x i64> @vsadd_vv_nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i64> %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv4i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v12, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 4 x i64> @llvm.vp.sadd.sat.nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i64> %b, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i64> %v
+}
+
+define <vscale x 4 x i64> @vsadd_vv_nxv4i64_unmasked(<vscale x 4 x i64> %va, <vscale x 4 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv4i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v12
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i64> @llvm.vp.sadd.sat.nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i64> %b, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i64> %v
+}
+
+define <vscale x 4 x i64> @vsadd_vx_nxv4i64(<vscale x 4 x i64> %va, i64 %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vsadd_vx_nxv4i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
+; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vsadd.vv v8, v8, v12, v0.t
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vsadd_vx_nxv4i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; RV64-NEXT:    vsadd.vx v8, v8, a0, v0.t
+; RV64-NEXT:    ret
+  %elt.head = insertelement <vscale x 4 x i64> poison, i64 %b, i32 0
+  %vb = shufflevector <vscale x 4 x i64> %elt.head, <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i64> @llvm.vp.sadd.sat.nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i64> %vb, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i64> %v
+}
+
+define <vscale x 4 x i64> @vsadd_vx_nxv4i64_unmasked(<vscale x 4 x i64> %va, i64 %b, i32 zeroext %evl) {
+; RV32-LABEL: vsadd_vx_nxv4i64_unmasked:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
+; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vsadd.vv v8, v8, v12
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vsadd_vx_nxv4i64_unmasked:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; RV64-NEXT:    vsadd.vx v8, v8, a0
+; RV64-NEXT:    ret
+  %elt.head = insertelement <vscale x 4 x i64> poison, i64 %b, i32 0
+  %vb = shufflevector <vscale x 4 x i64> %elt.head, <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+  %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i64> @llvm.vp.sadd.sat.nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i64> %vb, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i64> %v
+}
+
+define <vscale x 4 x i64> @vsadd_vi_nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv4i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 4 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <vscale x 4 x i64> %elt.head, <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i64> @llvm.vp.sadd.sat.nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i64> %vb, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i64> %v
+}
+
+define <vscale x 4 x i64> @vsadd_vi_nxv4i64_unmasked(<vscale x 4 x i64> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv4i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 4 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <vscale x 4 x i64> %elt.head, <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+  %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i64> @llvm.vp.sadd.sat.nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i64> %vb, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i64> %v
+}
+
+declare <vscale x 8 x i64> @llvm.vp.sadd.sat.nxv8i64(<vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i1>, i32)
+
+define <vscale x 8 x i64> @vsadd_vv_nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i64> %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv8i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 8 x i64> @llvm.vp.sadd.sat.nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i64> %b, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i64> %v
+}
+
+define <vscale x 8 x i64> @vsadd_vv_nxv8i64_unmasked(<vscale x 8 x i64> %va, <vscale x 8 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv8i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v16
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i64> @llvm.vp.sadd.sat.nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i64> %b, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i64> %v
+}
+
+define <vscale x 8 x i64> @vsadd_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vsadd_vx_nxv8i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
+; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vsadd.vv v8, v8, v16, v0.t
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vsadd_vx_nxv8i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vsadd.vx v8, v8, a0, v0.t
+; RV64-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i64> poison, i64 %b, i32 0
+  %vb = shufflevector <vscale x 8 x i64> %elt.head, <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i64> @llvm.vp.sadd.sat.nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i64> %vb, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i64> %v
+}
+
+define <vscale x 8 x i64> @vsadd_vx_nxv8i64_unmasked(<vscale x 8 x i64> %va, i64 %b, i32 zeroext %evl) {
+; RV32-LABEL: vsadd_vx_nxv8i64_unmasked:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
+; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vsadd.vv v8, v8, v16
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vsadd_vx_nxv8i64_unmasked:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vsadd.vx v8, v8, a0
+; RV64-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i64> poison, i64 %b, i32 0
+  %vb = shufflevector <vscale x 8 x i64> %elt.head, <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
+  %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i64> @llvm.vp.sadd.sat.nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i64> %vb, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i64> %v
+}
+
+define <vscale x 8 x i64> @vsadd_vi_nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv8i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <vscale x 8 x i64> %elt.head, <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i64> @llvm.vp.sadd.sat.nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i64> %vb, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i64> %v
+}
+
+define <vscale x 8 x i64> @vsadd_vi_nxv8i64_unmasked(<vscale x 8 x i64> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv8i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT:    vsadd.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <vscale x 8 x i64> %elt.head, <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
+  %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i64> @llvm.vp.sadd.sat.nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i64> %vb, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i64> %v
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsaddu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vsaddu-vp.ll
new file mode 100644
index 0000000..c0779e5
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vsaddu-vp.ll
@@ -0,0 +1,2014 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s --check-prefixes=CHECK,RV64
+
+declare <vscale x 8 x i7> @llvm.vp.uadd.sat.nxv8i7(<vscale x 8 x i7>, <vscale x 8 x i7>, <vscale x 8 x i1>, i32)
+
+define <vscale x 8 x i7> @vsaddu_vx_nxv8i7(<vscale x 8 x i7> %a, i7 signext %b, <vscale x 8 x i1> %mask, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv8i7:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a2, 127
+; CHECK-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vand.vx v8, v8, a2
+; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vand.vx v9, v9, a2
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    vminu.vx v8, v8, a2, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i7> poison, i7 %b, i32 0
+  %vb = shufflevector <vscale x 8 x i7> %elt.head, <vscale x 8 x i7> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i7> @llvm.vp.uadd.sat.nxv8i7(<vscale x 8 x i7> %a, <vscale x 8 x i7> %vb, <vscale x 8 x i1> %mask, i32 %evl)
+  ret <vscale x 8 x i7> %v
+}
+
+declare <vscale x 1 x i8> @llvm.vp.uadd.sat.nxv1i8(<vscale x 1 x i8>, <vscale x 1 x i8>, <vscale x 1 x i1>, i32)
+
+define <vscale x 1 x i8> @vsaddu_vv_nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i8> %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv1i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 1 x i8> @llvm.vp.uadd.sat.nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i8> %b, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i8> %v
+}
+
+define <vscale x 1 x i8> @vsaddu_vv_nxv1i8_unmasked(<vscale x 1 x i8> %va, <vscale x 1 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv1i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i8> @llvm.vp.uadd.sat.nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i8> %b, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i8> %v
+}
+
+define <vscale x 1 x i8> @vsaddu_vx_nxv1i8(<vscale x 1 x i8> %va, i8 %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv1i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT:    vsaddu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 1 x i8> %elt.head, <vscale x 1 x i8> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i8> @llvm.vp.uadd.sat.nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i8> %vb, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i8> %v
+}
+
+define <vscale x 1 x i8> @vsaddu_vx_nxv1i8_commute(<vscale x 1 x i8> %va, i8 %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv1i8_commute:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT:    vsaddu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 1 x i8> %elt.head, <vscale x 1 x i8> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i8> @llvm.vp.uadd.sat.nxv1i8(<vscale x 1 x i8> %vb, <vscale x 1 x i8> %va, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i8> %v
+}
+
+define <vscale x 1 x i8> @vsaddu_vx_nxv1i8_unmasked(<vscale x 1 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv1i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT:    vsaddu.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 1 x i8> %elt.head, <vscale x 1 x i8> poison, <vscale x 1 x i32> zeroinitializer
+  %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i8> @llvm.vp.uadd.sat.nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i8> %vb, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i8> %v
+}
+
+define <vscale x 1 x i8> @vsaddu_vi_nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv1i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <vscale x 1 x i8> %elt.head, <vscale x 1 x i8> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i8> @llvm.vp.uadd.sat.nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i8> %vb, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i8> %v
+}
+
+define <vscale x 1 x i8> @vsaddu_vi_nxv1i8_unmasked(<vscale x 1 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv1i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <vscale x 1 x i8> %elt.head, <vscale x 1 x i8> poison, <vscale x 1 x i32> zeroinitializer
+  %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i8> @llvm.vp.uadd.sat.nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i8> %vb, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i8> %v
+}
+
+declare <vscale x 2 x i8> @llvm.vp.uadd.sat.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i8>, <vscale x 2 x i1>, i32)
+
+define <vscale x 2 x i8> @vsaddu_vv_nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i8> %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv2i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 2 x i8> @llvm.vp.uadd.sat.nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i8> %b, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i8> %v
+}
+
+define <vscale x 2 x i8> @vsaddu_vv_nxv2i8_unmasked(<vscale x 2 x i8> %va, <vscale x 2 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv2i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i8> @llvm.vp.uadd.sat.nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i8> %b, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i8> %v
+}
+
+define <vscale x 2 x i8> @vsaddu_vx_nxv2i8(<vscale x 2 x i8> %va, i8 %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv2i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT:    vsaddu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 2 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 2 x i8> %elt.head, <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i8> @llvm.vp.uadd.sat.nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i8> %vb, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i8> %v
+}
+
+define <vscale x 2 x i8> @vsaddu_vx_nxv2i8_unmasked(<vscale x 2 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv2i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT:    vsaddu.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 2 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 2 x i8> %elt.head, <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer
+  %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i8> @llvm.vp.uadd.sat.nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i8> %vb, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i8> %v
+}
+
+define <vscale x 2 x i8> @vsaddu_vi_nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv2i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 2 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <vscale x 2 x i8> %elt.head, <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i8> @llvm.vp.uadd.sat.nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i8> %vb, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i8> %v
+}
+
+define <vscale x 2 x i8> @vsaddu_vi_nxv2i8_unmasked(<vscale x 2 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv2i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 2 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <vscale x 2 x i8> %elt.head, <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer
+  %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i8> @llvm.vp.uadd.sat.nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i8> %vb, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i8> %v
+}
+
+declare <vscale x 3 x i8> @llvm.vp.uadd.sat.nxv3i8(<vscale x 3 x i8>, <vscale x 3 x i8>, <vscale x 3 x i1>, i32)
+
+define <vscale x 3 x i8> @vsaddu_vv_nxv3i8(<vscale x 3 x i8> %va, <vscale x 3 x i8> %b, <vscale x 3 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv3i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 3 x i8> @llvm.vp.uadd.sat.nxv3i8(<vscale x 3 x i8> %va, <vscale x 3 x i8> %b, <vscale x 3 x i1> %m, i32 %evl)
+  ret <vscale x 3 x i8> %v
+}
+
+define <vscale x 3 x i8> @vsaddu_vv_nxv3i8_unmasked(<vscale x 3 x i8> %va, <vscale x 3 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv3i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 3 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 3 x i1> %head, <vscale x 3 x i1> poison, <vscale x 3 x i32> zeroinitializer
+  %v = call <vscale x 3 x i8> @llvm.vp.uadd.sat.nxv3i8(<vscale x 3 x i8> %va, <vscale x 3 x i8> %b, <vscale x 3 x i1> %m, i32 %evl)
+  ret <vscale x 3 x i8> %v
+}
+
+define <vscale x 3 x i8> @vsaddu_vx_nxv3i8(<vscale x 3 x i8> %va, i8 %b, <vscale x 3 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv3i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vsaddu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 3 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 3 x i8> %elt.head, <vscale x 3 x i8> poison, <vscale x 3 x i32> zeroinitializer
+  %v = call <vscale x 3 x i8> @llvm.vp.uadd.sat.nxv3i8(<vscale x 3 x i8> %va, <vscale x 3 x i8> %vb, <vscale x 3 x i1> %m, i32 %evl)
+  ret <vscale x 3 x i8> %v
+}
+
+define <vscale x 3 x i8> @vsaddu_vx_nxv3i8_unmasked(<vscale x 3 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv3i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vsaddu.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 3 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 3 x i8> %elt.head, <vscale x 3 x i8> poison, <vscale x 3 x i32> zeroinitializer
+  %head = insertelement <vscale x 3 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 3 x i1> %head, <vscale x 3 x i1> poison, <vscale x 3 x i32> zeroinitializer
+  %v = call <vscale x 3 x i8> @llvm.vp.uadd.sat.nxv3i8(<vscale x 3 x i8> %va, <vscale x 3 x i8> %vb, <vscale x 3 x i1> %m, i32 %evl)
+  ret <vscale x 3 x i8> %v
+}
+
+define <vscale x 3 x i8> @vsaddu_vi_nxv3i8(<vscale x 3 x i8> %va, <vscale x 3 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv3i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 3 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <vscale x 3 x i8> %elt.head, <vscale x 3 x i8> poison, <vscale x 3 x i32> zeroinitializer
+  %v = call <vscale x 3 x i8> @llvm.vp.uadd.sat.nxv3i8(<vscale x 3 x i8> %va, <vscale x 3 x i8> %vb, <vscale x 3 x i1> %m, i32 %evl)
+  ret <vscale x 3 x i8> %v
+}
+
+define <vscale x 3 x i8> @vsaddu_vi_nxv3i8_unmasked(<vscale x 3 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv3i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 3 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <vscale x 3 x i8> %elt.head, <vscale x 3 x i8> poison, <vscale x 3 x i32> zeroinitializer
+  %head = insertelement <vscale x 3 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 3 x i1> %head, <vscale x 3 x i1> poison, <vscale x 3 x i32> zeroinitializer
+  %v = call <vscale x 3 x i8> @llvm.vp.uadd.sat.nxv3i8(<vscale x 3 x i8> %va, <vscale x 3 x i8> %vb, <vscale x 3 x i1> %m, i32 %evl)
+  ret <vscale x 3 x i8> %v
+}
+
+declare <vscale x 4 x i8> @llvm.vp.uadd.sat.nxv4i8(<vscale x 4 x i8>, <vscale x 4 x i8>, <vscale x 4 x i1>, i32)
+
+define <vscale x 4 x i8> @vsaddu_vv_nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i8> %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv4i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 4 x i8> @llvm.vp.uadd.sat.nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i8> %b, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i8> %v
+}
+
+define <vscale x 4 x i8> @vsaddu_vv_nxv4i8_unmasked(<vscale x 4 x i8> %va, <vscale x 4 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv4i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i8> @llvm.vp.uadd.sat.nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i8> %b, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i8> %v
+}
+
+define <vscale x 4 x i8> @vsaddu_vx_nxv4i8(<vscale x 4 x i8> %va, i8 %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv4i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vsaddu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 4 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 4 x i8> %elt.head, <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i8> @llvm.vp.uadd.sat.nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i8> %vb, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i8> %v
+}
+
+define <vscale x 4 x i8> @vsaddu_vx_nxv4i8_unmasked(<vscale x 4 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv4i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vsaddu.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 4 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 4 x i8> %elt.head, <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer
+  %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i8> @llvm.vp.uadd.sat.nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i8> %vb, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i8> %v
+}
+
+define <vscale x 4 x i8> @vsaddu_vi_nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv4i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 4 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <vscale x 4 x i8> %elt.head, <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i8> @llvm.vp.uadd.sat.nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i8> %vb, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i8> %v
+}
+
+define <vscale x 4 x i8> @vsaddu_vi_nxv4i8_unmasked(<vscale x 4 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv4i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 4 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <vscale x 4 x i8> %elt.head, <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer
+  %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i8> @llvm.vp.uadd.sat.nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i8> %vb, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i8> %v
+}
+
+declare <vscale x 8 x i8> @llvm.vp.uadd.sat.nxv8i8(<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i1>, i32)
+
+define <vscale x 8 x i8> @vsaddu_vv_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv8i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 8 x i8> @llvm.vp.uadd.sat.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %b, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i8> %v
+}
+
+define <vscale x 8 x i8> @vsaddu_vv_nxv8i8_unmasked(<vscale x 8 x i8> %va, <vscale x 8 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv8i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i8> @llvm.vp.uadd.sat.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %b, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i8> %v
+}
+
+define <vscale x 8 x i8> @vsaddu_vx_nxv8i8(<vscale x 8 x i8> %va, i8 %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv8i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vsaddu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 8 x i8> %elt.head, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i8> @llvm.vp.uadd.sat.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %vb, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i8> %v
+}
+
+define <vscale x 8 x i8> @vsaddu_vx_nxv8i8_unmasked(<vscale x 8 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv8i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vsaddu.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 8 x i8> %elt.head, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
+  %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i8> @llvm.vp.uadd.sat.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %vb, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i8> %v
+}
+
+define <vscale x 8 x i8> @vsaddu_vi_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv8i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <vscale x 8 x i8> %elt.head, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i8> @llvm.vp.uadd.sat.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %vb, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i8> %v
+}
+
+define <vscale x 8 x i8> @vsaddu_vi_nxv8i8_unmasked(<vscale x 8 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv8i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <vscale x 8 x i8> %elt.head, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
+  %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i8> @llvm.vp.uadd.sat.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %vb, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i8> %v
+}
+
+declare <vscale x 16 x i8> @llvm.vp.uadd.sat.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i1>, i32)
+
+define <vscale x 16 x i8> @vsaddu_vv_nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i8> %b, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv16i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v10, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 16 x i8> @llvm.vp.uadd.sat.nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i8> %b, <vscale x 16 x i1> %m, i32 %evl)
+  ret <vscale x 16 x i8> %v
+}
+
+define <vscale x 16 x i8> @vsaddu_vv_nxv16i8_unmasked(<vscale x 16 x i8> %va, <vscale x 16 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv16i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v10
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+  %v = call <vscale x 16 x i8> @llvm.vp.uadd.sat.nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i8> %b, <vscale x 16 x i1> %m, i32 %evl)
+  ret <vscale x 16 x i8> %v
+}
+
+define <vscale x 16 x i8> @vsaddu_vx_nxv16i8(<vscale x 16 x i8> %va, i8 %b, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv16i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-NEXT:    vsaddu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 16 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 16 x i8> %elt.head, <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+  %v = call <vscale x 16 x i8> @llvm.vp.uadd.sat.nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i8> %vb, <vscale x 16 x i1> %m, i32 %evl)
+  ret <vscale x 16 x i8> %v
+}
+
+define <vscale x 16 x i8> @vsaddu_vx_nxv16i8_unmasked(<vscale x 16 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv16i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-NEXT:    vsaddu.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 16 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 16 x i8> %elt.head, <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+  %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+  %v = call <vscale x 16 x i8> @llvm.vp.uadd.sat.nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i8> %vb, <vscale x 16 x i1> %m, i32 %evl)
+  ret <vscale x 16 x i8> %v
+}
+
+define <vscale x 16 x i8> @vsaddu_vi_nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv16i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 16 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <vscale x 16 x i8> %elt.head, <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+  %v = call <vscale x 16 x i8> @llvm.vp.uadd.sat.nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i8> %vb, <vscale x 16 x i1> %m, i32 %evl)
+  ret <vscale x 16 x i8> %v
+}
+
+define <vscale x 16 x i8> @vsaddu_vi_nxv16i8_unmasked(<vscale x 16 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv16i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 16 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <vscale x 16 x i8> %elt.head, <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+  %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+  %v = call <vscale x 16 x i8> @llvm.vp.uadd.sat.nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i8> %vb, <vscale x 16 x i1> %m, i32 %evl)
+  ret <vscale x 16 x i8> %v
+}
+
+declare <vscale x 32 x i8> @llvm.vp.uadd.sat.nxv32i8(<vscale x 32 x i8>, <vscale x 32 x i8>, <vscale x 32 x i1>, i32)
+
+define <vscale x 32 x i8> @vsaddu_vv_nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i8> %b, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv32i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v12, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 32 x i8> @llvm.vp.uadd.sat.nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i8> %b, <vscale x 32 x i1> %m, i32 %evl)
+  ret <vscale x 32 x i8> %v
+}
+
+define <vscale x 32 x i8> @vsaddu_vv_nxv32i8_unmasked(<vscale x 32 x i8> %va, <vscale x 32 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv32i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v12
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 32 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 32 x i1> %head, <vscale x 32 x i1> poison, <vscale x 32 x i32> zeroinitializer
+  %v = call <vscale x 32 x i8> @llvm.vp.uadd.sat.nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i8> %b, <vscale x 32 x i1> %m, i32 %evl)
+  ret <vscale x 32 x i8> %v
+}
+
+define <vscale x 32 x i8> @vsaddu_vx_nxv32i8(<vscale x 32 x i8> %va, i8 %b, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv32i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
+; CHECK-NEXT:    vsaddu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 32 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 32 x i8> %elt.head, <vscale x 32 x i8> poison, <vscale x 32 x i32> zeroinitializer
+  %v = call <vscale x 32 x i8> @llvm.vp.uadd.sat.nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i8> %vb, <vscale x 32 x i1> %m, i32 %evl)
+  ret <vscale x 32 x i8> %v
+}
+
+define <vscale x 32 x i8> @vsaddu_vx_nxv32i8_unmasked(<vscale x 32 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv32i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
+; CHECK-NEXT:    vsaddu.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 32 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 32 x i8> %elt.head, <vscale x 32 x i8> poison, <vscale x 32 x i32> zeroinitializer
+  %head = insertelement <vscale x 32 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 32 x i1> %head, <vscale x 32 x i1> poison, <vscale x 32 x i32> zeroinitializer
+  %v = call <vscale x 32 x i8> @llvm.vp.uadd.sat.nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i8> %vb, <vscale x 32 x i1> %m, i32 %evl)
+  ret <vscale x 32 x i8> %v
+}
+
+define <vscale x 32 x i8> @vsaddu_vi_nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv32i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 32 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <vscale x 32 x i8> %elt.head, <vscale x 32 x i8> poison, <vscale x 32 x i32> zeroinitializer
+  %v = call <vscale x 32 x i8> @llvm.vp.uadd.sat.nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i8> %vb, <vscale x 32 x i1> %m, i32 %evl)
+  ret <vscale x 32 x i8> %v
+}
+
+define <vscale x 32 x i8> @vsaddu_vi_nxv32i8_unmasked(<vscale x 32 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv32i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 32 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <vscale x 32 x i8> %elt.head, <vscale x 32 x i8> poison, <vscale x 32 x i32> zeroinitializer
+  %head = insertelement <vscale x 32 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 32 x i1> %head, <vscale x 32 x i1> poison, <vscale x 32 x i32> zeroinitializer
+  %v = call <vscale x 32 x i8> @llvm.vp.uadd.sat.nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i8> %vb, <vscale x 32 x i1> %m, i32 %evl)
+  ret <vscale x 32 x i8> %v
+}
+
+declare <vscale x 64 x i8> @llvm.vp.uadd.sat.nxv64i8(<vscale x 64 x i8>, <vscale x 64 x i8>, <vscale x 64 x i1>, i32)
+
+define <vscale x 64 x i8> @vsaddu_vv_nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i8> %b, <vscale x 64 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv64i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 64 x i8> @llvm.vp.uadd.sat.nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i8> %b, <vscale x 64 x i1> %m, i32 %evl)
+  ret <vscale x 64 x i8> %v
+}
+
+define <vscale x 64 x i8> @vsaddu_vv_nxv64i8_unmasked(<vscale x 64 x i8> %va, <vscale x 64 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv64i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v16
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 64 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 64 x i1> %head, <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer
+  %v = call <vscale x 64 x i8> @llvm.vp.uadd.sat.nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i8> %b, <vscale x 64 x i1> %m, i32 %evl)
+  ret <vscale x 64 x i8> %v
+}
+
+define <vscale x 64 x i8> @vsaddu_vx_nxv64i8(<vscale x 64 x i8> %va, i8 %b, <vscale x 64 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv64i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT:    vsaddu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 64 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 64 x i8> %elt.head, <vscale x 64 x i8> poison, <vscale x 64 x i32> zeroinitializer
+  %v = call <vscale x 64 x i8> @llvm.vp.uadd.sat.nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i8> %vb, <vscale x 64 x i1> %m, i32 %evl)
+  ret <vscale x 64 x i8> %v
+}
+
+define <vscale x 64 x i8> @vsaddu_vx_nxv64i8_unmasked(<vscale x 64 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv64i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT:    vsaddu.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 64 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 64 x i8> %elt.head, <vscale x 64 x i8> poison, <vscale x 64 x i32> zeroinitializer
+  %head = insertelement <vscale x 64 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 64 x i1> %head, <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer
+  %v = call <vscale x 64 x i8> @llvm.vp.uadd.sat.nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i8> %vb, <vscale x 64 x i1> %m, i32 %evl)
+  ret <vscale x 64 x i8> %v
+}
+
+define <vscale x 64 x i8> @vsaddu_vi_nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv64i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 64 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <vscale x 64 x i8> %elt.head, <vscale x 64 x i8> poison, <vscale x 64 x i32> zeroinitializer
+  %v = call <vscale x 64 x i8> @llvm.vp.uadd.sat.nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i8> %vb, <vscale x 64 x i1> %m, i32 %evl)
+  ret <vscale x 64 x i8> %v
+}
+
+define <vscale x 64 x i8> @vsaddu_vi_nxv64i8_unmasked(<vscale x 64 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv64i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 64 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <vscale x 64 x i8> %elt.head, <vscale x 64 x i8> poison, <vscale x 64 x i32> zeroinitializer
+  %head = insertelement <vscale x 64 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 64 x i1> %head, <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer
+  %v = call <vscale x 64 x i8> @llvm.vp.uadd.sat.nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i8> %vb, <vscale x 64 x i1> %m, i32 %evl)
+  ret <vscale x 64 x i8> %v
+}
+
+; Test that split-legalization works when the mask itself needs splitting.
+
+declare <vscale x 128 x i8> @llvm.vp.uadd.sat.nxv128i8(<vscale x 128 x i8>, <vscale x 128 x i8>, <vscale x 128 x i1>, i32)
+
+define <vscale x 128 x i8> @vsaddu_vi_nxv128i8(<vscale x 128 x i8> %va, <vscale x 128 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv128i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v24, v0
+; CHECK-NEXT:    vsetvli a2, zero, e8, m8, ta, ma
+; CHECK-NEXT:    vlm.v v0, (a0)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    sub a2, a1, a0
+; CHECK-NEXT:    sltu a3, a1, a2
+; CHECK-NEXT:    addi a3, a3, -1
+; CHECK-NEXT:    and a2, a3, a2
+; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
+; CHECK-NEXT:    vsaddu.vi v16, v16, -1, v0.t
+; CHECK-NEXT:    bltu a1, a0, .LBB50_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    mv a1, a0
+; CHECK-NEXT:  .LBB50_2:
+; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 128 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <vscale x 128 x i8> %elt.head, <vscale x 128 x i8> poison, <vscale x 128 x i32> zeroinitializer
+  %v = call <vscale x 128 x i8> @llvm.vp.uadd.sat.nxv128i8(<vscale x 128 x i8> %va, <vscale x 128 x i8> %vb, <vscale x 128 x i1> %m, i32 %evl)
+  ret <vscale x 128 x i8> %v
+}
+
+define <vscale x 128 x i8> @vsaddu_vi_nxv128i8_unmasked(<vscale x 128 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv128i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    sub a2, a0, a1
+; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    addi a3, a3, -1
+; CHECK-NEXT:    and a2, a3, a2
+; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
+; CHECK-NEXT:    vsaddu.vi v16, v16, -1
+; CHECK-NEXT:    bltu a0, a1, .LBB51_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    mv a0, a1
+; CHECK-NEXT:  .LBB51_2:
+; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 128 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <vscale x 128 x i8> %elt.head, <vscale x 128 x i8> poison, <vscale x 128 x i32> zeroinitializer
+  %head = insertelement <vscale x 128 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 128 x i1> %head, <vscale x 128 x i1> poison, <vscale x 128 x i32> zeroinitializer
+  %v = call <vscale x 128 x i8> @llvm.vp.uadd.sat.nxv128i8(<vscale x 128 x i8> %va, <vscale x 128 x i8> %vb, <vscale x 128 x i1> %m, i32 %evl)
+  ret <vscale x 128 x i8> %v
+}
+
+declare <vscale x 1 x i16> @llvm.vp.uadd.sat.nxv1i16(<vscale x 1 x i16>, <vscale x 1 x i16>, <vscale x 1 x i1>, i32)
+
+define <vscale x 1 x i16> @vsaddu_vv_nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i16> %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv1i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 1 x i16> @llvm.vp.uadd.sat.nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i16> %b, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i16> %v
+}
+
+define <vscale x 1 x i16> @vsaddu_vv_nxv1i16_unmasked(<vscale x 1 x i16> %va, <vscale x 1 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv1i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i16> @llvm.vp.uadd.sat.nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i16> %b, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i16> %v
+}
+
+define <vscale x 1 x i16> @vsaddu_vx_nxv1i16(<vscale x 1 x i16> %va, i16 %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv1i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT:    vsaddu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 1 x i16> %elt.head, <vscale x 1 x i16> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i16> @llvm.vp.uadd.sat.nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i16> %vb, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i16> %v
+}
+
+define <vscale x 1 x i16> @vsaddu_vx_nxv1i16_unmasked(<vscale x 1 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv1i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT:    vsaddu.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 1 x i16> %elt.head, <vscale x 1 x i16> poison, <vscale x 1 x i32> zeroinitializer
+  %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i16> @llvm.vp.uadd.sat.nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i16> %vb, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i16> %v
+}
+
+define <vscale x 1 x i16> @vsaddu_vi_nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv1i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <vscale x 1 x i16> %elt.head, <vscale x 1 x i16> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i16> @llvm.vp.uadd.sat.nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i16> %vb, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i16> %v
+}
+
+define <vscale x 1 x i16> @vsaddu_vi_nxv1i16_unmasked(<vscale x 1 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv1i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <vscale x 1 x i16> %elt.head, <vscale x 1 x i16> poison, <vscale x 1 x i32> zeroinitializer
+  %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i16> @llvm.vp.uadd.sat.nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i16> %vb, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i16> %v
+}
+
+declare <vscale x 2 x i16> @llvm.vp.uadd.sat.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i16>, <vscale x 2 x i1>, i32)
+
+define <vscale x 2 x i16> @vsaddu_vv_nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i16> %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv2i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 2 x i16> @llvm.vp.uadd.sat.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i16> %b, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i16> %v
+}
+
+define <vscale x 2 x i16> @vsaddu_vv_nxv2i16_unmasked(<vscale x 2 x i16> %va, <vscale x 2 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv2i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i16> @llvm.vp.uadd.sat.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i16> %b, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i16> %v
+}
+
+define <vscale x 2 x i16> @vsaddu_vx_nxv2i16(<vscale x 2 x i16> %va, i16 %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv2i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT:    vsaddu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 2 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 2 x i16> %elt.head, <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i16> @llvm.vp.uadd.sat.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i16> %vb, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i16> %v
+}
+
+define <vscale x 2 x i16> @vsaddu_vx_nxv2i16_unmasked(<vscale x 2 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv2i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT:    vsaddu.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 2 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 2 x i16> %elt.head, <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
+  %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i16> @llvm.vp.uadd.sat.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i16> %vb, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i16> %v
+}
+
+define <vscale x 2 x i16> @vsaddu_vi_nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv2i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 2 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <vscale x 2 x i16> %elt.head, <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i16> @llvm.vp.uadd.sat.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i16> %vb, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i16> %v
+}
+
+define <vscale x 2 x i16> @vsaddu_vi_nxv2i16_unmasked(<vscale x 2 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv2i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 2 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <vscale x 2 x i16> %elt.head, <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
+  %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i16> @llvm.vp.uadd.sat.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i16> %vb, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i16> %v
+}
+
+declare <vscale x 4 x i16> @llvm.vp.uadd.sat.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i16>, <vscale x 4 x i1>, i32)
+
+define <vscale x 4 x i16> @vsaddu_vv_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv4i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 4 x i16> @llvm.vp.uadd.sat.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %b, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i16> %v
+}
+
+define <vscale x 4 x i16> @vsaddu_vv_nxv4i16_unmasked(<vscale x 4 x i16> %va, <vscale x 4 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv4i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i16> @llvm.vp.uadd.sat.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %b, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i16> %v
+}
+
+define <vscale x 4 x i16> @vsaddu_vx_nxv4i16(<vscale x 4 x i16> %va, i16 %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv4i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vsaddu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 4 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 4 x i16> %elt.head, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i16> @llvm.vp.uadd.sat.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %vb, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i16> %v
+}
+
+define <vscale x 4 x i16> @vsaddu_vx_nxv4i16_unmasked(<vscale x 4 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv4i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vsaddu.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 4 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 4 x i16> %elt.head, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
+  %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i16> @llvm.vp.uadd.sat.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %vb, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i16> %v
+}
+
+define <vscale x 4 x i16> @vsaddu_vi_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv4i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 4 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <vscale x 4 x i16> %elt.head, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i16> @llvm.vp.uadd.sat.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %vb, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i16> %v
+}
+
+define <vscale x 4 x i16> @vsaddu_vi_nxv4i16_unmasked(<vscale x 4 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv4i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 4 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <vscale x 4 x i16> %elt.head, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
+  %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i16> @llvm.vp.uadd.sat.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %vb, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i16> %v
+}
+
+declare <vscale x 8 x i16> @llvm.vp.uadd.sat.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i1>, i32)
+
+define <vscale x 8 x i16> @vsaddu_vv_nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i16> %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv8i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v10, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 8 x i16> @llvm.vp.uadd.sat.nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i16> %b, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i16> %v
+}
+
+define <vscale x 8 x i16> @vsaddu_vv_nxv8i16_unmasked(<vscale x 8 x i16> %va, <vscale x 8 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv8i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v10
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i16> @llvm.vp.uadd.sat.nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i16> %b, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i16> %v
+}
+
+define <vscale x 8 x i16> @vsaddu_vx_nxv8i16(<vscale x 8 x i16> %va, i16 %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv8i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT:    vsaddu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 8 x i16> %elt.head, <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i16> @llvm.vp.uadd.sat.nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i16> %vb, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i16> %v
+}
+
+define <vscale x 8 x i16> @vsaddu_vx_nxv8i16_unmasked(<vscale x 8 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv8i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT:    vsaddu.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 8 x i16> %elt.head, <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
+  %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i16> @llvm.vp.uadd.sat.nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i16> %vb, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i16> %v
+}
+
+define <vscale x 8 x i16> @vsaddu_vi_nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv8i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <vscale x 8 x i16> %elt.head, <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i16> @llvm.vp.uadd.sat.nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i16> %vb, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i16> %v
+}
+
+define <vscale x 8 x i16> @vsaddu_vi_nxv8i16_unmasked(<vscale x 8 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv8i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <vscale x 8 x i16> %elt.head, <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
+  %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i16> @llvm.vp.uadd.sat.nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i16> %vb, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i16> %v
+}
+
+declare <vscale x 16 x i16> @llvm.vp.uadd.sat.nxv16i16(<vscale x 16 x i16>, <vscale x 16 x i16>, <vscale x 16 x i1>, i32)
+
+define <vscale x 16 x i16> @vsaddu_vv_nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i16> %b, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv16i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v12, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 16 x i16> @llvm.vp.uadd.sat.nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i16> %b, <vscale x 16 x i1> %m, i32 %evl)
+  ret <vscale x 16 x i16> %v
+}
+
+define <vscale x 16 x i16> @vsaddu_vv_nxv16i16_unmasked(<vscale x 16 x i16> %va, <vscale x 16 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv16i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v12
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+  %v = call <vscale x 16 x i16> @llvm.vp.uadd.sat.nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i16> %b, <vscale x 16 x i1> %m, i32 %evl)
+  ret <vscale x 16 x i16> %v
+}
+
+define <vscale x 16 x i16> @vsaddu_vx_nxv16i16(<vscale x 16 x i16> %va, i16 %b, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv16i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT:    vsaddu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 16 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 16 x i16> %elt.head, <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer
+  %v = call <vscale x 16 x i16> @llvm.vp.uadd.sat.nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i16> %vb, <vscale x 16 x i1> %m, i32 %evl)
+  ret <vscale x 16 x i16> %v
+}
+
+define <vscale x 16 x i16> @vsaddu_vx_nxv16i16_unmasked(<vscale x 16 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv16i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT:    vsaddu.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 16 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 16 x i16> %elt.head, <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer
+  %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+  %v = call <vscale x 16 x i16> @llvm.vp.uadd.sat.nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i16> %vb, <vscale x 16 x i1> %m, i32 %evl)
+  ret <vscale x 16 x i16> %v
+}
+
+define <vscale x 16 x i16> @vsaddu_vi_nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv16i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 16 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <vscale x 16 x i16> %elt.head, <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer
+  %v = call <vscale x 16 x i16> @llvm.vp.uadd.sat.nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i16> %vb, <vscale x 16 x i1> %m, i32 %evl)
+  ret <vscale x 16 x i16> %v
+}
+
+define <vscale x 16 x i16> @vsaddu_vi_nxv16i16_unmasked(<vscale x 16 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv16i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 16 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <vscale x 16 x i16> %elt.head, <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer
+  %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+  %v = call <vscale x 16 x i16> @llvm.vp.uadd.sat.nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i16> %vb, <vscale x 16 x i1> %m, i32 %evl)
+  ret <vscale x 16 x i16> %v
+}
+
+declare <vscale x 32 x i16> @llvm.vp.uadd.sat.nxv32i16(<vscale x 32 x i16>, <vscale x 32 x i16>, <vscale x 32 x i1>, i32)
+
+define <vscale x 32 x i16> @vsaddu_vv_nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i16> %b, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv32i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 32 x i16> @llvm.vp.uadd.sat.nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i16> %b, <vscale x 32 x i1> %m, i32 %evl)
+  ret <vscale x 32 x i16> %v
+}
+
+define <vscale x 32 x i16> @vsaddu_vv_nxv32i16_unmasked(<vscale x 32 x i16> %va, <vscale x 32 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv32i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v16
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 32 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 32 x i1> %head, <vscale x 32 x i1> poison, <vscale x 32 x i32> zeroinitializer
+  %v = call <vscale x 32 x i16> @llvm.vp.uadd.sat.nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i16> %b, <vscale x 32 x i1> %m, i32 %evl)
+  ret <vscale x 32 x i16> %v
+}
+
+define <vscale x 32 x i16> @vsaddu_vx_nxv32i16(<vscale x 32 x i16> %va, i16 %b, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv32i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT:    vsaddu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 32 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 32 x i16> %elt.head, <vscale x 32 x i16> poison, <vscale x 32 x i32> zeroinitializer
+  %v = call <vscale x 32 x i16> @llvm.vp.uadd.sat.nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i16> %vb, <vscale x 32 x i1> %m, i32 %evl)
+  ret <vscale x 32 x i16> %v
+}
+
+define <vscale x 32 x i16> @vsaddu_vx_nxv32i16_unmasked(<vscale x 32 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv32i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT:    vsaddu.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 32 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 32 x i16> %elt.head, <vscale x 32 x i16> poison, <vscale x 32 x i32> zeroinitializer
+  %head = insertelement <vscale x 32 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 32 x i1> %head, <vscale x 32 x i1> poison, <vscale x 32 x i32> zeroinitializer
+  %v = call <vscale x 32 x i16> @llvm.vp.uadd.sat.nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i16> %vb, <vscale x 32 x i1> %m, i32 %evl)
+  ret <vscale x 32 x i16> %v
+}
+
+define <vscale x 32 x i16> @vsaddu_vi_nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv32i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 32 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <vscale x 32 x i16> %elt.head, <vscale x 32 x i16> poison, <vscale x 32 x i32> zeroinitializer
+  %v = call <vscale x 32 x i16> @llvm.vp.uadd.sat.nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i16> %vb, <vscale x 32 x i1> %m, i32 %evl)
+  ret <vscale x 32 x i16> %v
+}
+
+define <vscale x 32 x i16> @vsaddu_vi_nxv32i16_unmasked(<vscale x 32 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv32i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 32 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <vscale x 32 x i16> %elt.head, <vscale x 32 x i16> poison, <vscale x 32 x i32> zeroinitializer
+  %head = insertelement <vscale x 32 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 32 x i1> %head, <vscale x 32 x i1> poison, <vscale x 32 x i32> zeroinitializer
+  %v = call <vscale x 32 x i16> @llvm.vp.uadd.sat.nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i16> %vb, <vscale x 32 x i1> %m, i32 %evl)
+  ret <vscale x 32 x i16> %v
+}
+
+declare <vscale x 1 x i32> @llvm.vp.uadd.sat.nxv1i32(<vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i1>, i32)
+
+define <vscale x 1 x i32> @vsaddu_vv_nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i32> %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv1i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 1 x i32> @llvm.vp.uadd.sat.nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i32> %b, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i32> %v
+}
+
+define <vscale x 1 x i32> @vsaddu_vv_nxv1i32_unmasked(<vscale x 1 x i32> %va, <vscale x 1 x i32> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv1i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i32> @llvm.vp.uadd.sat.nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i32> %b, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i32> %v
+}
+
+define <vscale x 1 x i32> @vsaddu_vx_nxv1i32(<vscale x 1 x i32> %va, i32 %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv1i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT:    vsaddu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <vscale x 1 x i32> %elt.head, <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i32> @llvm.vp.uadd.sat.nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i32> %vb, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i32> %v
+}
+
+define <vscale x 1 x i32> @vsaddu_vx_nxv1i32_unmasked(<vscale x 1 x i32> %va, i32 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv1i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT:    vsaddu.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <vscale x 1 x i32> %elt.head, <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer
+  %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i32> @llvm.vp.uadd.sat.nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i32> %vb, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i32> %v
+}
+
+define <vscale x 1 x i32> @vsaddu_vi_nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv1i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <vscale x 1 x i32> %elt.head, <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i32> @llvm.vp.uadd.sat.nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i32> %vb, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i32> %v
+}
+
+define <vscale x 1 x i32> @vsaddu_vi_nxv1i32_unmasked(<vscale x 1 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv1i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <vscale x 1 x i32> %elt.head, <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer
+  %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i32> @llvm.vp.uadd.sat.nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i32> %vb, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i32> %v
+}
+
+declare <vscale x 2 x i32> @llvm.vp.uadd.sat.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i1>, i32)
+
+define <vscale x 2 x i32> @vsaddu_vv_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv2i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 2 x i32> @llvm.vp.uadd.sat.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %b, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 2 x i32> @vsaddu_vv_nxv2i32_unmasked(<vscale x 2 x i32> %va, <vscale x 2 x i32> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv2i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i32> @llvm.vp.uadd.sat.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %b, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 2 x i32> @vsaddu_vx_nxv2i32(<vscale x 2 x i32> %va, i32 %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv2i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT:    vsaddu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 2 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <vscale x 2 x i32> %elt.head, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i32> @llvm.vp.uadd.sat.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %vb, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 2 x i32> @vsaddu_vx_nxv2i32_unmasked(<vscale x 2 x i32> %va, i32 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv2i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT:    vsaddu.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 2 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <vscale x 2 x i32> %elt.head, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+  %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i32> @llvm.vp.uadd.sat.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %vb, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 2 x i32> @vsaddu_vi_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv2i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 2 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <vscale x 2 x i32> %elt.head, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i32> @llvm.vp.uadd.sat.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %vb, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 2 x i32> @vsaddu_vi_nxv2i32_unmasked(<vscale x 2 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv2i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 2 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <vscale x 2 x i32> %elt.head, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+  %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i32> @llvm.vp.uadd.sat.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %vb, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i32> %v
+}
+
+declare <vscale x 4 x i32> @llvm.vp.uadd.sat.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, i32)
+
+define <vscale x 4 x i32> @vsaddu_vv_nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i32> %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v10, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 4 x i32> @llvm.vp.uadd.sat.nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i32> %b, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i32> %v
+}
+
+define <vscale x 4 x i32> @vsaddu_vv_nxv4i32_unmasked(<vscale x 4 x i32> %va, <vscale x 4 x i32> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv4i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v10
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i32> @llvm.vp.uadd.sat.nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i32> %b, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i32> %v
+}
+
+define <vscale x 4 x i32> @vsaddu_vx_nxv4i32(<vscale x 4 x i32> %va, i32 %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT:    vsaddu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 4 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <vscale x 4 x i32> %elt.head, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i32> @llvm.vp.uadd.sat.nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i32> %vb, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i32> %v
+}
+
+define <vscale x 4 x i32> @vsaddu_vx_nxv4i32_unmasked(<vscale x 4 x i32> %va, i32 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv4i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT:    vsaddu.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 4 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <vscale x 4 x i32> %elt.head, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+  %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i32> @llvm.vp.uadd.sat.nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i32> %vb, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i32> %v
+}
+
+define <vscale x 4 x i32> @vsaddu_vi_nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 4 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <vscale x 4 x i32> %elt.head, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i32> @llvm.vp.uadd.sat.nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i32> %vb, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i32> %v
+}
+
+define <vscale x 4 x i32> @vsaddu_vi_nxv4i32_unmasked(<vscale x 4 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv4i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 4 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <vscale x 4 x i32> %elt.head, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+  %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i32> @llvm.vp.uadd.sat.nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i32> %vb, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i32> %v
+}
+
+declare <vscale x 8 x i32> @llvm.vp.uadd.sat.nxv8i32(<vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i1>, i32)
+
+define <vscale x 8 x i32> @vsaddu_vv_nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i32> %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv8i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v12, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 8 x i32> @llvm.vp.uadd.sat.nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i32> %b, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i32> %v
+}
+
+define <vscale x 8 x i32> @vsaddu_vv_nxv8i32_unmasked(<vscale x 8 x i32> %va, <vscale x 8 x i32> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv8i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v12
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i32> @llvm.vp.uadd.sat.nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i32> %b, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i32> %v
+}
+
+define <vscale x 8 x i32> @vsaddu_vx_nxv8i32(<vscale x 8 x i32> %va, i32 %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv8i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT:    vsaddu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <vscale x 8 x i32> %elt.head, <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i32> @llvm.vp.uadd.sat.nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i32> %vb, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i32> %v
+}
+
+define <vscale x 8 x i32> @vsaddu_vx_nxv8i32_unmasked(<vscale x 8 x i32> %va, i32 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv8i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT:    vsaddu.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <vscale x 8 x i32> %elt.head, <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
+  %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i32> @llvm.vp.uadd.sat.nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i32> %vb, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i32> %v
+}
+
+define <vscale x 8 x i32> @vsaddu_vi_nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv8i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <vscale x 8 x i32> %elt.head, <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i32> @llvm.vp.uadd.sat.nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i32> %vb, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i32> %v
+}
+
+define <vscale x 8 x i32> @vsaddu_vi_nxv8i32_unmasked(<vscale x 8 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv8i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <vscale x 8 x i32> %elt.head, <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
+  %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i32> @llvm.vp.uadd.sat.nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i32> %vb, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i32> %v
+}
+
+declare <vscale x 16 x i32> @llvm.vp.uadd.sat.nxv16i32(<vscale x 16 x i32>, <vscale x 16 x i32>, <vscale x 16 x i1>, i32)
+
+define <vscale x 16 x i32> @vsaddu_vv_nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i32> %b, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv16i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 16 x i32> @llvm.vp.uadd.sat.nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i32> %b, <vscale x 16 x i1> %m, i32 %evl)
+  ret <vscale x 16 x i32> %v
+}
+
+define <vscale x 16 x i32> @vsaddu_vv_nxv16i32_unmasked(<vscale x 16 x i32> %va, <vscale x 16 x i32> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv16i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v16
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+  %v = call <vscale x 16 x i32> @llvm.vp.uadd.sat.nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i32> %b, <vscale x 16 x i1> %m, i32 %evl)
+  ret <vscale x 16 x i32> %v
+}
+
+define <vscale x 16 x i32> @vsaddu_vx_nxv16i32(<vscale x 16 x i32> %va, i32 %b, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv16i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT:    vsaddu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 16 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <vscale x 16 x i32> %elt.head, <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+  %v = call <vscale x 16 x i32> @llvm.vp.uadd.sat.nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i32> %vb, <vscale x 16 x i1> %m, i32 %evl)
+  ret <vscale x 16 x i32> %v
+}
+
+define <vscale x 16 x i32> @vsaddu_vx_nxv16i32_unmasked(<vscale x 16 x i32> %va, i32 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv16i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT:    vsaddu.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 16 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <vscale x 16 x i32> %elt.head, <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+  %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+  %v = call <vscale x 16 x i32> @llvm.vp.uadd.sat.nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i32> %vb, <vscale x 16 x i1> %m, i32 %evl)
+  ret <vscale x 16 x i32> %v
+}
+
+define <vscale x 16 x i32> @vsaddu_vi_nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv16i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 16 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <vscale x 16 x i32> %elt.head, <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+  %v = call <vscale x 16 x i32> @llvm.vp.uadd.sat.nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i32> %vb, <vscale x 16 x i1> %m, i32 %evl)
+  ret <vscale x 16 x i32> %v
+}
+
+define <vscale x 16 x i32> @vsaddu_vi_nxv16i32_unmasked(<vscale x 16 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv16i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 16 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <vscale x 16 x i32> %elt.head, <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+  %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+  %v = call <vscale x 16 x i32> @llvm.vp.uadd.sat.nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i32> %vb, <vscale x 16 x i1> %m, i32 %evl)
+  ret <vscale x 16 x i32> %v
+}
+
+; Test that split-legalization works then the mask needs manual splitting.
+
+declare <vscale x 32 x i32> @llvm.vp.uadd.sat.nxv32i32(<vscale x 32 x i32>, <vscale x 32 x i32>, <vscale x 32 x i1>, i32)
+
+define <vscale x 32 x i32> @vsaddu_vi_nxv32i32(<vscale x 32 x i32> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv32i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v24, v0
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    srli a2, a1, 2
+; CHECK-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vslidedown.vx v0, v0, a2
+; CHECK-NEXT:    slli a1, a1, 1
+; CHECK-NEXT:    sub a2, a0, a1
+; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    addi a3, a3, -1
+; CHECK-NEXT:    and a2, a3, a2
+; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; CHECK-NEXT:    vsaddu.vi v16, v16, -1, v0.t
+; CHECK-NEXT:    bltu a0, a1, .LBB118_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    mv a0, a1
+; CHECK-NEXT:  .LBB118_2:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 32 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <vscale x 32 x i32> %elt.head, <vscale x 32 x i32> poison, <vscale x 32 x i32> zeroinitializer
+  %v = call <vscale x 32 x i32> @llvm.vp.uadd.sat.nxv32i32(<vscale x 32 x i32> %va, <vscale x 32 x i32> %vb, <vscale x 32 x i1> %m, i32 %evl)
+  ret <vscale x 32 x i32> %v
+}
+
+define <vscale x 32 x i32> @vsaddu_vi_nxv32i32_unmasked(<vscale x 32 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv32i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 1
+; CHECK-NEXT:    sub a2, a0, a1
+; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    addi a3, a3, -1
+; CHECK-NEXT:    and a2, a3, a2
+; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; CHECK-NEXT:    vsaddu.vi v16, v16, -1
+; CHECK-NEXT:    bltu a0, a1, .LBB119_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    mv a0, a1
+; CHECK-NEXT:  .LBB119_2:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 32 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <vscale x 32 x i32> %elt.head, <vscale x 32 x i32> poison, <vscale x 32 x i32> zeroinitializer
+  %head = insertelement <vscale x 32 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 32 x i1> %head, <vscale x 32 x i1> poison, <vscale x 32 x i32> zeroinitializer
+  %v = call <vscale x 32 x i32> @llvm.vp.uadd.sat.nxv32i32(<vscale x 32 x i32> %va, <vscale x 32 x i32> %vb, <vscale x 32 x i1> %m, i32 %evl)
+  ret <vscale x 32 x i32> %v
+}
+
+declare <vscale x 1 x i64> @llvm.vp.uadd.sat.nxv1i64(<vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i1>, i32)
+
+define <vscale x 1 x i64> @vsaddu_vv_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv1i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 1 x i64> @llvm.vp.uadd.sat.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %b, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i64> %v
+}
+
+define <vscale x 1 x i64> @vsaddu_vv_nxv1i64_unmasked(<vscale x 1 x i64> %va, <vscale x 1 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv1i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i64> @llvm.vp.uadd.sat.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %b, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i64> %v
+}
+
+define <vscale x 1 x i64> @vsaddu_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vsaddu_vx_nxv1i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
+; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vsaddu.vv v8, v8, v9, v0.t
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vsaddu_vx_nxv1i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; RV64-NEXT:    vsaddu.vx v8, v8, a0, v0.t
+; RV64-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i64> poison, i64 %b, i32 0
+  %vb = shufflevector <vscale x 1 x i64> %elt.head, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i64> @llvm.vp.uadd.sat.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %vb, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i64> %v
+}
+
+define <vscale x 1 x i64> @vsaddu_vx_nxv1i64_unmasked(<vscale x 1 x i64> %va, i64 %b, i32 zeroext %evl) {
+; RV32-LABEL: vsaddu_vx_nxv1i64_unmasked:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
+; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vsaddu.vv v8, v8, v9
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vsaddu_vx_nxv1i64_unmasked:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; RV64-NEXT:    vsaddu.vx v8, v8, a0
+; RV64-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i64> poison, i64 %b, i32 0
+  %vb = shufflevector <vscale x 1 x i64> %elt.head, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+  %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i64> @llvm.vp.uadd.sat.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %vb, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i64> %v
+}
+
+define <vscale x 1 x i64> @vsaddu_vi_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv1i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <vscale x 1 x i64> %elt.head, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i64> @llvm.vp.uadd.sat.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %vb, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i64> %v
+}
+
+define <vscale x 1 x i64> @vsaddu_vi_nxv1i64_unmasked(<vscale x 1 x i64> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv1i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <vscale x 1 x i64> %elt.head, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+  %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i64> @llvm.vp.uadd.sat.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %vb, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i64> %v
+}
+
+declare <vscale x 2 x i64> @llvm.vp.uadd.sat.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i1>, i32)
+
+define <vscale x 2 x i64> @vsaddu_vv_nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i64> %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv2i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v10, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 2 x i64> @llvm.vp.uadd.sat.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i64> %b, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i64> %v
+}
+
+define <vscale x 2 x i64> @vsaddu_vv_nxv2i64_unmasked(<vscale x 2 x i64> %va, <vscale x 2 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv2i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v10
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i64> @llvm.vp.uadd.sat.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i64> %b, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i64> %v
+}
+
+define <vscale x 2 x i64> @vsaddu_vx_nxv2i64(<vscale x 2 x i64> %va, i64 %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vsaddu_vx_nxv2i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
+; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vsaddu.vv v8, v8, v10, v0.t
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vsaddu_vx_nxv2i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; RV64-NEXT:    vsaddu.vx v8, v8, a0, v0.t
+; RV64-NEXT:    ret
+  %elt.head = insertelement <vscale x 2 x i64> poison, i64 %b, i32 0
+  %vb = shufflevector <vscale x 2 x i64> %elt.head, <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i64> @llvm.vp.uadd.sat.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i64> %vb, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i64> %v
+}
+
+define <vscale x 2 x i64> @vsaddu_vx_nxv2i64_unmasked(<vscale x 2 x i64> %va, i64 %b, i32 zeroext %evl) {
+; RV32-LABEL: vsaddu_vx_nxv2i64_unmasked:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
+; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vsaddu.vv v8, v8, v10
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vsaddu_vx_nxv2i64_unmasked:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; RV64-NEXT:    vsaddu.vx v8, v8, a0
+; RV64-NEXT:    ret
+  %elt.head = insertelement <vscale x 2 x i64> poison, i64 %b, i32 0
+  %vb = shufflevector <vscale x 2 x i64> %elt.head, <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+  %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i64> @llvm.vp.uadd.sat.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i64> %vb, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i64> %v
+}
+
+define <vscale x 2 x i64> @vsaddu_vi_nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv2i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 2 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <vscale x 2 x i64> %elt.head, <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i64> @llvm.vp.uadd.sat.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i64> %vb, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i64> %v
+}
+
+define <vscale x 2 x i64> @vsaddu_vi_nxv2i64_unmasked(<vscale x 2 x i64> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv2i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 2 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <vscale x 2 x i64> %elt.head, <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+  %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i64> @llvm.vp.uadd.sat.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i64> %vb, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i64> %v
+}
+
+declare <vscale x 4 x i64> @llvm.vp.uadd.sat.nxv4i64(<vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i1>, i32)
+
+define <vscale x 4 x i64> @vsaddu_vv_nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i64> %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv4i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v12, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 4 x i64> @llvm.vp.uadd.sat.nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i64> %b, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i64> %v
+}
+
+define <vscale x 4 x i64> @vsaddu_vv_nxv4i64_unmasked(<vscale x 4 x i64> %va, <vscale x 4 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv4i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v12
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i64> @llvm.vp.uadd.sat.nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i64> %b, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i64> %v
+}
+
+define <vscale x 4 x i64> @vsaddu_vx_nxv4i64(<vscale x 4 x i64> %va, i64 %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vsaddu_vx_nxv4i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
+; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vsaddu.vv v8, v8, v12, v0.t
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vsaddu_vx_nxv4i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; RV64-NEXT:    vsaddu.vx v8, v8, a0, v0.t
+; RV64-NEXT:    ret
+  %elt.head = insertelement <vscale x 4 x i64> poison, i64 %b, i32 0
+  %vb = shufflevector <vscale x 4 x i64> %elt.head, <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i64> @llvm.vp.uadd.sat.nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i64> %vb, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i64> %v
+}
+
+define <vscale x 4 x i64> @vsaddu_vx_nxv4i64_unmasked(<vscale x 4 x i64> %va, i64 %b, i32 zeroext %evl) {
+; RV32-LABEL: vsaddu_vx_nxv4i64_unmasked:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
+; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vsaddu.vv v8, v8, v12
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vsaddu_vx_nxv4i64_unmasked:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; RV64-NEXT:    vsaddu.vx v8, v8, a0
+; RV64-NEXT:    ret
+  %elt.head = insertelement <vscale x 4 x i64> poison, i64 %b, i32 0
+  %vb = shufflevector <vscale x 4 x i64> %elt.head, <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+  %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i64> @llvm.vp.uadd.sat.nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i64> %vb, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i64> %v
+}
+
+define <vscale x 4 x i64> @vsaddu_vi_nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv4i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 4 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <vscale x 4 x i64> %elt.head, <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i64> @llvm.vp.uadd.sat.nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i64> %vb, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i64> %v
+}
+
+define <vscale x 4 x i64> @vsaddu_vi_nxv4i64_unmasked(<vscale x 4 x i64> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv4i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 4 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <vscale x 4 x i64> %elt.head, <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+  %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i64> @llvm.vp.uadd.sat.nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i64> %vb, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i64> %v
+}
+
+declare <vscale x 8 x i64> @llvm.vp.uadd.sat.nxv8i64(<vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i1>, i32)
+
+define <vscale x 8 x i64> @vsaddu_vv_nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i64> %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv8i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 8 x i64> @llvm.vp.uadd.sat.nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i64> %b, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i64> %v
+}
+
+define <vscale x 8 x i64> @vsaddu_vv_nxv8i64_unmasked(<vscale x 8 x i64> %va, <vscale x 8 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv8i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v16
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i64> @llvm.vp.uadd.sat.nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i64> %b, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i64> %v
+}
+
+define <vscale x 8 x i64> @vsaddu_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vsaddu_vx_nxv8i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
+; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vsaddu.vv v8, v8, v16, v0.t
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vsaddu_vx_nxv8i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vsaddu.vx v8, v8, a0, v0.t
+; RV64-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i64> poison, i64 %b, i32 0
+  %vb = shufflevector <vscale x 8 x i64> %elt.head, <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i64> @llvm.vp.uadd.sat.nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i64> %vb, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i64> %v
+}
+
+define <vscale x 8 x i64> @vsaddu_vx_nxv8i64_unmasked(<vscale x 8 x i64> %va, i64 %b, i32 zeroext %evl) {
+; RV32-LABEL: vsaddu_vx_nxv8i64_unmasked:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
+; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vsaddu.vv v8, v8, v16
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vsaddu_vx_nxv8i64_unmasked:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vsaddu.vx v8, v8, a0
+; RV64-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i64> poison, i64 %b, i32 0
+  %vb = shufflevector <vscale x 8 x i64> %elt.head, <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
+  %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i64> @llvm.vp.uadd.sat.nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i64> %vb, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i64> %v
+}
+
+define <vscale x 8 x i64> @vsaddu_vi_nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv8i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <vscale x 8 x i64> %elt.head, <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i64> @llvm.vp.uadd.sat.nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i64> %vb, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i64> %v
+}
+
+define <vscale x 8 x i64> @vsaddu_vi_nxv8i64_unmasked(<vscale x 8 x i64> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv8i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT:    vsaddu.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <vscale x 8 x i64> %elt.head, <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
+  %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i64> @llvm.vp.uadd.sat.nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i64> %vb, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i64> %v
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vssub-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vssub-vp.ll
new file mode 100644
index 0000000..2d51a2e
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vssub-vp.ll
@@ -0,0 +1,2067 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s --check-prefixes=CHECK,RV64
+
+declare <vscale x 8 x i7> @llvm.vp.ssub.sat.nxv8i7(<vscale x 8 x i7>, <vscale x 8 x i7>, <vscale x 8 x i1>, i32)
+
+define <vscale x 8 x i7> @vssub_vx_nxv8i7(<vscale x 8 x i7> %a, i7 signext %b, <vscale x 8 x i1> %mask, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv8i7:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a2, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vadd.vv v8, v8, v8
+; CHECK-NEXT:    vsra.vi v8, v8, 1
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vsub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    li a0, 63
+; CHECK-NEXT:    vmin.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    li a0, 192
+; CHECK-NEXT:    vmax.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i7> poison, i7 %b, i32 0
+  %vb = shufflevector <vscale x 8 x i7> %elt.head, <vscale x 8 x i7> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i7> @llvm.vp.ssub.sat.nxv8i7(<vscale x 8 x i7> %a, <vscale x 8 x i7> %vb, <vscale x 8 x i1> %mask, i32 %evl)
+  ret <vscale x 8 x i7> %v
+}
+
+declare <vscale x 1 x i8> @llvm.vp.ssub.sat.nxv1i8(<vscale x 1 x i8>, <vscale x 1 x i8>, <vscale x 1 x i1>, i32)
+
+define <vscale x 1 x i8> @vssub_vv_nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i8> %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv1i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 1 x i8> @llvm.vp.ssub.sat.nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i8> %b, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i8> %v
+}
+
+define <vscale x 1 x i8> @vssub_vv_nxv1i8_unmasked(<vscale x 1 x i8> %va, <vscale x 1 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv1i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i8> @llvm.vp.ssub.sat.nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i8> %b, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i8> %v
+}
+
+define <vscale x 1 x i8> @vssub_vx_nxv1i8(<vscale x 1 x i8> %va, i8 %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv1i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 1 x i8> %elt.head, <vscale x 1 x i8> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i8> @llvm.vp.ssub.sat.nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i8> %vb, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i8> %v
+}
+
+define <vscale x 1 x i8> @vssub_vx_nxv1i8_commute(<vscale x 1 x i8> %va, i8 %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv1i8_commute:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a2, zero, e8, mf8, ta, ma
+; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v9, v8, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 1 x i8> %elt.head, <vscale x 1 x i8> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i8> @llvm.vp.ssub.sat.nxv1i8(<vscale x 1 x i8> %vb, <vscale x 1 x i8> %va, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i8> %v
+}
+
+define <vscale x 1 x i8> @vssub_vx_nxv1i8_unmasked(<vscale x 1 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv1i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 1 x i8> %elt.head, <vscale x 1 x i8> poison, <vscale x 1 x i32> zeroinitializer
+  %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i8> @llvm.vp.ssub.sat.nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i8> %vb, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i8> %v
+}
+
+define <vscale x 1 x i8> @vssub_vi_nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv1i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <vscale x 1 x i8> %elt.head, <vscale x 1 x i8> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i8> @llvm.vp.ssub.sat.nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i8> %vb, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i8> %v
+}
+
+define <vscale x 1 x i8> @vssub_vi_nxv1i8_unmasked(<vscale x 1 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv1i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <vscale x 1 x i8> %elt.head, <vscale x 1 x i8> poison, <vscale x 1 x i32> zeroinitializer
+  %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i8> @llvm.vp.ssub.sat.nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i8> %vb, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i8> %v
+}
+
+declare <vscale x 2 x i8> @llvm.vp.ssub.sat.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i8>, <vscale x 2 x i1>, i32)
+
+define <vscale x 2 x i8> @vssub_vv_nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i8> %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv2i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 2 x i8> @llvm.vp.ssub.sat.nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i8> %b, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i8> %v
+}
+
+define <vscale x 2 x i8> @vssub_vv_nxv2i8_unmasked(<vscale x 2 x i8> %va, <vscale x 2 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv2i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i8> @llvm.vp.ssub.sat.nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i8> %b, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i8> %v
+}
+
+define <vscale x 2 x i8> @vssub_vx_nxv2i8(<vscale x 2 x i8> %va, i8 %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv2i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 2 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 2 x i8> %elt.head, <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i8> @llvm.vp.ssub.sat.nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i8> %vb, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i8> %v
+}
+
+define <vscale x 2 x i8> @vssub_vx_nxv2i8_unmasked(<vscale x 2 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv2i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 2 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 2 x i8> %elt.head, <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer
+  %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i8> @llvm.vp.ssub.sat.nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i8> %vb, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i8> %v
+}
+
+define <vscale x 2 x i8> @vssub_vi_nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv2i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 2 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <vscale x 2 x i8> %elt.head, <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i8> @llvm.vp.ssub.sat.nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i8> %vb, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i8> %v
+}
+
+define <vscale x 2 x i8> @vssub_vi_nxv2i8_unmasked(<vscale x 2 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv2i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 2 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <vscale x 2 x i8> %elt.head, <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer
+  %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i8> @llvm.vp.ssub.sat.nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i8> %vb, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i8> %v
+}
+
+declare <vscale x 3 x i8> @llvm.vp.ssub.sat.nxv3i8(<vscale x 3 x i8>, <vscale x 3 x i8>, <vscale x 3 x i1>, i32)
+
+define <vscale x 3 x i8> @vssub_vv_nxv3i8(<vscale x 3 x i8> %va, <vscale x 3 x i8> %b, <vscale x 3 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv3i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 3 x i8> @llvm.vp.ssub.sat.nxv3i8(<vscale x 3 x i8> %va, <vscale x 3 x i8> %b, <vscale x 3 x i1> %m, i32 %evl)
+  ret <vscale x 3 x i8> %v
+}
+
+define <vscale x 3 x i8> @vssub_vv_nxv3i8_unmasked(<vscale x 3 x i8> %va, <vscale x 3 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv3i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 3 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 3 x i1> %head, <vscale x 3 x i1> poison, <vscale x 3 x i32> zeroinitializer
+  %v = call <vscale x 3 x i8> @llvm.vp.ssub.sat.nxv3i8(<vscale x 3 x i8> %va, <vscale x 3 x i8> %b, <vscale x 3 x i1> %m, i32 %evl)
+  ret <vscale x 3 x i8> %v
+}
+
+define <vscale x 3 x i8> @vssub_vx_nxv3i8(<vscale x 3 x i8> %va, i8 %b, <vscale x 3 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv3i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 3 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 3 x i8> %elt.head, <vscale x 3 x i8> poison, <vscale x 3 x i32> zeroinitializer
+  %v = call <vscale x 3 x i8> @llvm.vp.ssub.sat.nxv3i8(<vscale x 3 x i8> %va, <vscale x 3 x i8> %vb, <vscale x 3 x i1> %m, i32 %evl)
+  ret <vscale x 3 x i8> %v
+}
+
+define <vscale x 3 x i8> @vssub_vx_nxv3i8_unmasked(<vscale x 3 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv3i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 3 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 3 x i8> %elt.head, <vscale x 3 x i8> poison, <vscale x 3 x i32> zeroinitializer
+  %head = insertelement <vscale x 3 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 3 x i1> %head, <vscale x 3 x i1> poison, <vscale x 3 x i32> zeroinitializer
+  %v = call <vscale x 3 x i8> @llvm.vp.ssub.sat.nxv3i8(<vscale x 3 x i8> %va, <vscale x 3 x i8> %vb, <vscale x 3 x i1> %m, i32 %evl)
+  ret <vscale x 3 x i8> %v
+}
+
+define <vscale x 3 x i8> @vssub_vi_nxv3i8(<vscale x 3 x i8> %va, <vscale x 3 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv3i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 3 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <vscale x 3 x i8> %elt.head, <vscale x 3 x i8> poison, <vscale x 3 x i32> zeroinitializer
+  %v = call <vscale x 3 x i8> @llvm.vp.ssub.sat.nxv3i8(<vscale x 3 x i8> %va, <vscale x 3 x i8> %vb, <vscale x 3 x i1> %m, i32 %evl)
+  ret <vscale x 3 x i8> %v
+}
+
+define <vscale x 3 x i8> @vssub_vi_nxv3i8_unmasked(<vscale x 3 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv3i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 3 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <vscale x 3 x i8> %elt.head, <vscale x 3 x i8> poison, <vscale x 3 x i32> zeroinitializer
+  %head = insertelement <vscale x 3 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 3 x i1> %head, <vscale x 3 x i1> poison, <vscale x 3 x i32> zeroinitializer
+  %v = call <vscale x 3 x i8> @llvm.vp.ssub.sat.nxv3i8(<vscale x 3 x i8> %va, <vscale x 3 x i8> %vb, <vscale x 3 x i1> %m, i32 %evl)
+  ret <vscale x 3 x i8> %v
+}
+
+declare <vscale x 4 x i8> @llvm.vp.ssub.sat.nxv4i8(<vscale x 4 x i8>, <vscale x 4 x i8>, <vscale x 4 x i1>, i32)
+
+define <vscale x 4 x i8> @vssub_vv_nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i8> %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv4i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 4 x i8> @llvm.vp.ssub.sat.nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i8> %b, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i8> %v
+}
+
+define <vscale x 4 x i8> @vssub_vv_nxv4i8_unmasked(<vscale x 4 x i8> %va, <vscale x 4 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv4i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i8> @llvm.vp.ssub.sat.nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i8> %b, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i8> %v
+}
+
+define <vscale x 4 x i8> @vssub_vx_nxv4i8(<vscale x 4 x i8> %va, i8 %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv4i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 4 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 4 x i8> %elt.head, <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i8> @llvm.vp.ssub.sat.nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i8> %vb, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i8> %v
+}
+
+define <vscale x 4 x i8> @vssub_vx_nxv4i8_unmasked(<vscale x 4 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv4i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 4 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 4 x i8> %elt.head, <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer
+  %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i8> @llvm.vp.ssub.sat.nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i8> %vb, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i8> %v
+}
+
+define <vscale x 4 x i8> @vssub_vi_nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv4i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 4 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <vscale x 4 x i8> %elt.head, <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i8> @llvm.vp.ssub.sat.nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i8> %vb, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i8> %v
+}
+
+define <vscale x 4 x i8> @vssub_vi_nxv4i8_unmasked(<vscale x 4 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv4i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 4 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <vscale x 4 x i8> %elt.head, <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer
+  %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i8> @llvm.vp.ssub.sat.nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i8> %vb, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i8> %v
+}
+
+declare <vscale x 8 x i8> @llvm.vp.ssub.sat.nxv8i8(<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i1>, i32)
+
+define <vscale x 8 x i8> @vssub_vv_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv8i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 8 x i8> @llvm.vp.ssub.sat.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %b, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i8> %v
+}
+
+define <vscale x 8 x i8> @vssub_vv_nxv8i8_unmasked(<vscale x 8 x i8> %va, <vscale x 8 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv8i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i8> @llvm.vp.ssub.sat.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %b, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i8> %v
+}
+
+define <vscale x 8 x i8> @vssub_vx_nxv8i8(<vscale x 8 x i8> %va, i8 %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv8i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 8 x i8> %elt.head, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i8> @llvm.vp.ssub.sat.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %vb, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i8> %v
+}
+
+define <vscale x 8 x i8> @vssub_vx_nxv8i8_unmasked(<vscale x 8 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv8i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 8 x i8> %elt.head, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
+  %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i8> @llvm.vp.ssub.sat.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %vb, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i8> %v
+}
+
+define <vscale x 8 x i8> @vssub_vi_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv8i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <vscale x 8 x i8> %elt.head, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i8> @llvm.vp.ssub.sat.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %vb, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i8> %v
+}
+
+define <vscale x 8 x i8> @vssub_vi_nxv8i8_unmasked(<vscale x 8 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv8i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <vscale x 8 x i8> %elt.head, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
+  %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i8> @llvm.vp.ssub.sat.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %vb, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i8> %v
+}
+
+declare <vscale x 16 x i8> @llvm.vp.ssub.sat.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i1>, i32)
+
+define <vscale x 16 x i8> @vssub_vv_nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i8> %b, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv16i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v10, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 16 x i8> @llvm.vp.ssub.sat.nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i8> %b, <vscale x 16 x i1> %m, i32 %evl)
+  ret <vscale x 16 x i8> %v
+}
+
+define <vscale x 16 x i8> @vssub_vv_nxv16i8_unmasked(<vscale x 16 x i8> %va, <vscale x 16 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv16i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v10
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+  %v = call <vscale x 16 x i8> @llvm.vp.ssub.sat.nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i8> %b, <vscale x 16 x i1> %m, i32 %evl)
+  ret <vscale x 16 x i8> %v
+}
+
+define <vscale x 16 x i8> @vssub_vx_nxv16i8(<vscale x 16 x i8> %va, i8 %b, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv16i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 16 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 16 x i8> %elt.head, <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+  %v = call <vscale x 16 x i8> @llvm.vp.ssub.sat.nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i8> %vb, <vscale x 16 x i1> %m, i32 %evl)
+  ret <vscale x 16 x i8> %v
+}
+
+define <vscale x 16 x i8> @vssub_vx_nxv16i8_unmasked(<vscale x 16 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv16i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 16 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 16 x i8> %elt.head, <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+  %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+  %v = call <vscale x 16 x i8> @llvm.vp.ssub.sat.nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i8> %vb, <vscale x 16 x i1> %m, i32 %evl)
+  ret <vscale x 16 x i8> %v
+}
+
+define <vscale x 16 x i8> @vssub_vi_nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv16i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 16 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <vscale x 16 x i8> %elt.head, <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+  %v = call <vscale x 16 x i8> @llvm.vp.ssub.sat.nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i8> %vb, <vscale x 16 x i1> %m, i32 %evl)
+  ret <vscale x 16 x i8> %v
+}
+
+define <vscale x 16 x i8> @vssub_vi_nxv16i8_unmasked(<vscale x 16 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv16i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 16 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <vscale x 16 x i8> %elt.head, <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+  %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+  %v = call <vscale x 16 x i8> @llvm.vp.ssub.sat.nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i8> %vb, <vscale x 16 x i1> %m, i32 %evl)
+  ret <vscale x 16 x i8> %v
+}
+
+declare <vscale x 32 x i8> @llvm.vp.ssub.sat.nxv32i8(<vscale x 32 x i8>, <vscale x 32 x i8>, <vscale x 32 x i1>, i32)
+
+define <vscale x 32 x i8> @vssub_vv_nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i8> %b, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv32i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v12, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 32 x i8> @llvm.vp.ssub.sat.nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i8> %b, <vscale x 32 x i1> %m, i32 %evl)
+  ret <vscale x 32 x i8> %v
+}
+
+define <vscale x 32 x i8> @vssub_vv_nxv32i8_unmasked(<vscale x 32 x i8> %va, <vscale x 32 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv32i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v12
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 32 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 32 x i1> %head, <vscale x 32 x i1> poison, <vscale x 32 x i32> zeroinitializer
+  %v = call <vscale x 32 x i8> @llvm.vp.ssub.sat.nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i8> %b, <vscale x 32 x i1> %m, i32 %evl)
+  ret <vscale x 32 x i8> %v
+}
+
+define <vscale x 32 x i8> @vssub_vx_nxv32i8(<vscale x 32 x i8> %va, i8 %b, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv32i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 32 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 32 x i8> %elt.head, <vscale x 32 x i8> poison, <vscale x 32 x i32> zeroinitializer
+  %v = call <vscale x 32 x i8> @llvm.vp.ssub.sat.nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i8> %vb, <vscale x 32 x i1> %m, i32 %evl)
+  ret <vscale x 32 x i8> %v
+}
+
+define <vscale x 32 x i8> @vssub_vx_nxv32i8_unmasked(<vscale x 32 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv32i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 32 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 32 x i8> %elt.head, <vscale x 32 x i8> poison, <vscale x 32 x i32> zeroinitializer
+  %head = insertelement <vscale x 32 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 32 x i1> %head, <vscale x 32 x i1> poison, <vscale x 32 x i32> zeroinitializer
+  %v = call <vscale x 32 x i8> @llvm.vp.ssub.sat.nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i8> %vb, <vscale x 32 x i1> %m, i32 %evl)
+  ret <vscale x 32 x i8> %v
+}
+
+define <vscale x 32 x i8> @vssub_vi_nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv32i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 32 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <vscale x 32 x i8> %elt.head, <vscale x 32 x i8> poison, <vscale x 32 x i32> zeroinitializer
+  %v = call <vscale x 32 x i8> @llvm.vp.ssub.sat.nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i8> %vb, <vscale x 32 x i1> %m, i32 %evl)
+  ret <vscale x 32 x i8> %v
+}
+
+define <vscale x 32 x i8> @vssub_vi_nxv32i8_unmasked(<vscale x 32 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv32i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 32 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <vscale x 32 x i8> %elt.head, <vscale x 32 x i8> poison, <vscale x 32 x i32> zeroinitializer
+  %head = insertelement <vscale x 32 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 32 x i1> %head, <vscale x 32 x i1> poison, <vscale x 32 x i32> zeroinitializer
+  %v = call <vscale x 32 x i8> @llvm.vp.ssub.sat.nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i8> %vb, <vscale x 32 x i1> %m, i32 %evl)
+  ret <vscale x 32 x i8> %v
+}
+
+declare <vscale x 64 x i8> @llvm.vp.ssub.sat.nxv64i8(<vscale x 64 x i8>, <vscale x 64 x i8>, <vscale x 64 x i1>, i32)
+
+define <vscale x 64 x i8> @vssub_vv_nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i8> %b, <vscale x 64 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv64i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 64 x i8> @llvm.vp.ssub.sat.nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i8> %b, <vscale x 64 x i1> %m, i32 %evl)
+  ret <vscale x 64 x i8> %v
+}
+
+define <vscale x 64 x i8> @vssub_vv_nxv64i8_unmasked(<vscale x 64 x i8> %va, <vscale x 64 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv64i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v16
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 64 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 64 x i1> %head, <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer
+  %v = call <vscale x 64 x i8> @llvm.vp.ssub.sat.nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i8> %b, <vscale x 64 x i1> %m, i32 %evl)
+  ret <vscale x 64 x i8> %v
+}
+
+define <vscale x 64 x i8> @vssub_vx_nxv64i8(<vscale x 64 x i8> %va, i8 %b, <vscale x 64 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv64i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 64 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 64 x i8> %elt.head, <vscale x 64 x i8> poison, <vscale x 64 x i32> zeroinitializer
+  %v = call <vscale x 64 x i8> @llvm.vp.ssub.sat.nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i8> %vb, <vscale x 64 x i1> %m, i32 %evl)
+  ret <vscale x 64 x i8> %v
+}
+
+define <vscale x 64 x i8> @vssub_vx_nxv64i8_unmasked(<vscale x 64 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv64i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 64 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 64 x i8> %elt.head, <vscale x 64 x i8> poison, <vscale x 64 x i32> zeroinitializer
+  %head = insertelement <vscale x 64 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 64 x i1> %head, <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer
+  %v = call <vscale x 64 x i8> @llvm.vp.ssub.sat.nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i8> %vb, <vscale x 64 x i1> %m, i32 %evl)
+  ret <vscale x 64 x i8> %v
+}
+
+define <vscale x 64 x i8> @vssub_vi_nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv64i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 64 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <vscale x 64 x i8> %elt.head, <vscale x 64 x i8> poison, <vscale x 64 x i32> zeroinitializer
+  %v = call <vscale x 64 x i8> @llvm.vp.ssub.sat.nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i8> %vb, <vscale x 64 x i1> %m, i32 %evl)
+  ret <vscale x 64 x i8> %v
+}
+
+define <vscale x 64 x i8> @vssub_vi_nxv64i8_unmasked(<vscale x 64 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv64i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 64 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <vscale x 64 x i8> %elt.head, <vscale x 64 x i8> poison, <vscale x 64 x i32> zeroinitializer
+  %head = insertelement <vscale x 64 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 64 x i1> %head, <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer
+  %v = call <vscale x 64 x i8> @llvm.vp.ssub.sat.nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i8> %vb, <vscale x 64 x i1> %m, i32 %evl)
+  ret <vscale x 64 x i8> %v
+}
+
+; Test that split-legalization works when the mask itself needs splitting.
+
+declare <vscale x 128 x i8> @llvm.vp.ssub.sat.nxv128i8(<vscale x 128 x i8>, <vscale x 128 x i8>, <vscale x 128 x i1>, i32)
+
+define <vscale x 128 x i8> @vssub_vi_nxv128i8(<vscale x 128 x i8> %va, <vscale x 128 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv128i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v24, v0
+; CHECK-NEXT:    vsetvli a2, zero, e8, m8, ta, ma
+; CHECK-NEXT:    vlm.v v0, (a0)
+; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    slli a2, a2, 3
+; CHECK-NEXT:    sub a0, a1, a2
+; CHECK-NEXT:    sltu a3, a1, a0
+; CHECK-NEXT:    addi a3, a3, -1
+; CHECK-NEXT:    and a3, a3, a0
+; CHECK-NEXT:    li a0, -1
+; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
+; CHECK-NEXT:    vssub.vx v16, v16, a0, v0.t
+; CHECK-NEXT:    bltu a1, a2, .LBB50_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    mv a1, a2
+; CHECK-NEXT:  .LBB50_2:
+; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vssub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 128 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <vscale x 128 x i8> %elt.head, <vscale x 128 x i8> poison, <vscale x 128 x i32> zeroinitializer
+  %v = call <vscale x 128 x i8> @llvm.vp.ssub.sat.nxv128i8(<vscale x 128 x i8> %va, <vscale x 128 x i8> %vb, <vscale x 128 x i1> %m, i32 %evl)
+  ret <vscale x 128 x i8> %v
+}
+
+define <vscale x 128 x i8> @vssub_vi_nxv128i8_unmasked(<vscale x 128 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv128i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    sub a2, a0, a1
+; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    addi a3, a3, -1
+; CHECK-NEXT:    and a3, a3, a2
+; CHECK-NEXT:    li a2, -1
+; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
+; CHECK-NEXT:    vssub.vx v16, v16, a2
+; CHECK-NEXT:    bltu a0, a1, .LBB51_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    mv a0, a1
+; CHECK-NEXT:  .LBB51_2:
+; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a2
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 128 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <vscale x 128 x i8> %elt.head, <vscale x 128 x i8> poison, <vscale x 128 x i32> zeroinitializer
+  %head = insertelement <vscale x 128 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 128 x i1> %head, <vscale x 128 x i1> poison, <vscale x 128 x i32> zeroinitializer
+  %v = call <vscale x 128 x i8> @llvm.vp.ssub.sat.nxv128i8(<vscale x 128 x i8> %va, <vscale x 128 x i8> %vb, <vscale x 128 x i1> %m, i32 %evl)
+  ret <vscale x 128 x i8> %v
+}
+
+declare <vscale x 1 x i16> @llvm.vp.ssub.sat.nxv1i16(<vscale x 1 x i16>, <vscale x 1 x i16>, <vscale x 1 x i1>, i32)
+
+define <vscale x 1 x i16> @vssub_vv_nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i16> %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv1i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 1 x i16> @llvm.vp.ssub.sat.nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i16> %b, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i16> %v
+}
+
+define <vscale x 1 x i16> @vssub_vv_nxv1i16_unmasked(<vscale x 1 x i16> %va, <vscale x 1 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv1i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i16> @llvm.vp.ssub.sat.nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i16> %b, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i16> %v
+}
+
+define <vscale x 1 x i16> @vssub_vx_nxv1i16(<vscale x 1 x i16> %va, i16 %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv1i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 1 x i16> %elt.head, <vscale x 1 x i16> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i16> @llvm.vp.ssub.sat.nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i16> %vb, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i16> %v
+}
+
+define <vscale x 1 x i16> @vssub_vx_nxv1i16_unmasked(<vscale x 1 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv1i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 1 x i16> %elt.head, <vscale x 1 x i16> poison, <vscale x 1 x i32> zeroinitializer
+  %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i16> @llvm.vp.ssub.sat.nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i16> %vb, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i16> %v
+}
+
+define <vscale x 1 x i16> @vssub_vi_nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv1i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <vscale x 1 x i16> %elt.head, <vscale x 1 x i16> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i16> @llvm.vp.ssub.sat.nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i16> %vb, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i16> %v
+}
+
+define <vscale x 1 x i16> @vssub_vi_nxv1i16_unmasked(<vscale x 1 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv1i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <vscale x 1 x i16> %elt.head, <vscale x 1 x i16> poison, <vscale x 1 x i32> zeroinitializer
+  %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i16> @llvm.vp.ssub.sat.nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i16> %vb, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i16> %v
+}
+
+declare <vscale x 2 x i16> @llvm.vp.ssub.sat.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i16>, <vscale x 2 x i1>, i32)
+
+define <vscale x 2 x i16> @vssub_vv_nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i16> %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv2i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 2 x i16> @llvm.vp.ssub.sat.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i16> %b, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i16> %v
+}
+
+define <vscale x 2 x i16> @vssub_vv_nxv2i16_unmasked(<vscale x 2 x i16> %va, <vscale x 2 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv2i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i16> @llvm.vp.ssub.sat.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i16> %b, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i16> %v
+}
+
+define <vscale x 2 x i16> @vssub_vx_nxv2i16(<vscale x 2 x i16> %va, i16 %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv2i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 2 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 2 x i16> %elt.head, <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i16> @llvm.vp.ssub.sat.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i16> %vb, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i16> %v
+}
+
+define <vscale x 2 x i16> @vssub_vx_nxv2i16_unmasked(<vscale x 2 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv2i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 2 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 2 x i16> %elt.head, <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
+  %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i16> @llvm.vp.ssub.sat.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i16> %vb, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i16> %v
+}
+
+define <vscale x 2 x i16> @vssub_vi_nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv2i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 2 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <vscale x 2 x i16> %elt.head, <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i16> @llvm.vp.ssub.sat.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i16> %vb, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i16> %v
+}
+
+define <vscale x 2 x i16> @vssub_vi_nxv2i16_unmasked(<vscale x 2 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv2i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 2 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <vscale x 2 x i16> %elt.head, <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
+  %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i16> @llvm.vp.ssub.sat.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i16> %vb, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i16> %v
+}
+
+declare <vscale x 4 x i16> @llvm.vp.ssub.sat.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i16>, <vscale x 4 x i1>, i32)
+
+define <vscale x 4 x i16> @vssub_vv_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv4i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 4 x i16> @llvm.vp.ssub.sat.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %b, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i16> %v
+}
+
+define <vscale x 4 x i16> @vssub_vv_nxv4i16_unmasked(<vscale x 4 x i16> %va, <vscale x 4 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv4i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i16> @llvm.vp.ssub.sat.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %b, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i16> %v
+}
+
+define <vscale x 4 x i16> @vssub_vx_nxv4i16(<vscale x 4 x i16> %va, i16 %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv4i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 4 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 4 x i16> %elt.head, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i16> @llvm.vp.ssub.sat.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %vb, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i16> %v
+}
+
+define <vscale x 4 x i16> @vssub_vx_nxv4i16_unmasked(<vscale x 4 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv4i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 4 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 4 x i16> %elt.head, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
+  %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i16> @llvm.vp.ssub.sat.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %vb, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i16> %v
+}
+
+define <vscale x 4 x i16> @vssub_vi_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv4i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 4 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <vscale x 4 x i16> %elt.head, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i16> @llvm.vp.ssub.sat.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %vb, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i16> %v
+}
+
+define <vscale x 4 x i16> @vssub_vi_nxv4i16_unmasked(<vscale x 4 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv4i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 4 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <vscale x 4 x i16> %elt.head, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
+  %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i16> @llvm.vp.ssub.sat.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %vb, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i16> %v
+}
+
+declare <vscale x 8 x i16> @llvm.vp.ssub.sat.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i1>, i32)
+
+define <vscale x 8 x i16> @vssub_vv_nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i16> %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv8i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v10, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 8 x i16> @llvm.vp.ssub.sat.nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i16> %b, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i16> %v
+}
+
+define <vscale x 8 x i16> @vssub_vv_nxv8i16_unmasked(<vscale x 8 x i16> %va, <vscale x 8 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv8i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v10
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i16> @llvm.vp.ssub.sat.nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i16> %b, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i16> %v
+}
+
+define <vscale x 8 x i16> @vssub_vx_nxv8i16(<vscale x 8 x i16> %va, i16 %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv8i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 8 x i16> %elt.head, <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i16> @llvm.vp.ssub.sat.nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i16> %vb, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i16> %v
+}
+
+define <vscale x 8 x i16> @vssub_vx_nxv8i16_unmasked(<vscale x 8 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv8i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 8 x i16> %elt.head, <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
+  %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i16> @llvm.vp.ssub.sat.nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i16> %vb, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i16> %v
+}
+
+define <vscale x 8 x i16> @vssub_vi_nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv8i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <vscale x 8 x i16> %elt.head, <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i16> @llvm.vp.ssub.sat.nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i16> %vb, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i16> %v
+}
+
+define <vscale x 8 x i16> @vssub_vi_nxv8i16_unmasked(<vscale x 8 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv8i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <vscale x 8 x i16> %elt.head, <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
+  %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i16> @llvm.vp.ssub.sat.nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i16> %vb, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i16> %v
+}
+
+declare <vscale x 16 x i16> @llvm.vp.ssub.sat.nxv16i16(<vscale x 16 x i16>, <vscale x 16 x i16>, <vscale x 16 x i1>, i32)
+
+define <vscale x 16 x i16> @vssub_vv_nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i16> %b, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv16i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v12, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 16 x i16> @llvm.vp.ssub.sat.nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i16> %b, <vscale x 16 x i1> %m, i32 %evl)
+  ret <vscale x 16 x i16> %v
+}
+
+define <vscale x 16 x i16> @vssub_vv_nxv16i16_unmasked(<vscale x 16 x i16> %va, <vscale x 16 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv16i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v12
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+  %v = call <vscale x 16 x i16> @llvm.vp.ssub.sat.nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i16> %b, <vscale x 16 x i1> %m, i32 %evl)
+  ret <vscale x 16 x i16> %v
+}
+
+define <vscale x 16 x i16> @vssub_vx_nxv16i16(<vscale x 16 x i16> %va, i16 %b, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv16i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 16 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 16 x i16> %elt.head, <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer
+  %v = call <vscale x 16 x i16> @llvm.vp.ssub.sat.nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i16> %vb, <vscale x 16 x i1> %m, i32 %evl)
+  ret <vscale x 16 x i16> %v
+}
+
+define <vscale x 16 x i16> @vssub_vx_nxv16i16_unmasked(<vscale x 16 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv16i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 16 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 16 x i16> %elt.head, <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer
+  %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+  %v = call <vscale x 16 x i16> @llvm.vp.ssub.sat.nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i16> %vb, <vscale x 16 x i1> %m, i32 %evl)
+  ret <vscale x 16 x i16> %v
+}
+
+define <vscale x 16 x i16> @vssub_vi_nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv16i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 16 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <vscale x 16 x i16> %elt.head, <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer
+  %v = call <vscale x 16 x i16> @llvm.vp.ssub.sat.nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i16> %vb, <vscale x 16 x i1> %m, i32 %evl)
+  ret <vscale x 16 x i16> %v
+}
+
+define <vscale x 16 x i16> @vssub_vi_nxv16i16_unmasked(<vscale x 16 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv16i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 16 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <vscale x 16 x i16> %elt.head, <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer
+  %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+  %v = call <vscale x 16 x i16> @llvm.vp.ssub.sat.nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i16> %vb, <vscale x 16 x i1> %m, i32 %evl)
+  ret <vscale x 16 x i16> %v
+}
+
+declare <vscale x 32 x i16> @llvm.vp.ssub.sat.nxv32i16(<vscale x 32 x i16>, <vscale x 32 x i16>, <vscale x 32 x i1>, i32)
+
+define <vscale x 32 x i16> @vssub_vv_nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i16> %b, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv32i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 32 x i16> @llvm.vp.ssub.sat.nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i16> %b, <vscale x 32 x i1> %m, i32 %evl)
+  ret <vscale x 32 x i16> %v
+}
+
+define <vscale x 32 x i16> @vssub_vv_nxv32i16_unmasked(<vscale x 32 x i16> %va, <vscale x 32 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv32i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v16
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 32 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 32 x i1> %head, <vscale x 32 x i1> poison, <vscale x 32 x i32> zeroinitializer
+  %v = call <vscale x 32 x i16> @llvm.vp.ssub.sat.nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i16> %b, <vscale x 32 x i1> %m, i32 %evl)
+  ret <vscale x 32 x i16> %v
+}
+
+define <vscale x 32 x i16> @vssub_vx_nxv32i16(<vscale x 32 x i16> %va, i16 %b, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv32i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 32 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 32 x i16> %elt.head, <vscale x 32 x i16> poison, <vscale x 32 x i32> zeroinitializer
+  %v = call <vscale x 32 x i16> @llvm.vp.ssub.sat.nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i16> %vb, <vscale x 32 x i1> %m, i32 %evl)
+  ret <vscale x 32 x i16> %v
+}
+
+define <vscale x 32 x i16> @vssub_vx_nxv32i16_unmasked(<vscale x 32 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv32i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 32 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 32 x i16> %elt.head, <vscale x 32 x i16> poison, <vscale x 32 x i32> zeroinitializer
+  %head = insertelement <vscale x 32 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 32 x i1> %head, <vscale x 32 x i1> poison, <vscale x 32 x i32> zeroinitializer
+  %v = call <vscale x 32 x i16> @llvm.vp.ssub.sat.nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i16> %vb, <vscale x 32 x i1> %m, i32 %evl)
+  ret <vscale x 32 x i16> %v
+}
+
+define <vscale x 32 x i16> @vssub_vi_nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv32i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 32 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <vscale x 32 x i16> %elt.head, <vscale x 32 x i16> poison, <vscale x 32 x i32> zeroinitializer
+  %v = call <vscale x 32 x i16> @llvm.vp.ssub.sat.nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i16> %vb, <vscale x 32 x i1> %m, i32 %evl)
+  ret <vscale x 32 x i16> %v
+}
+
+define <vscale x 32 x i16> @vssub_vi_nxv32i16_unmasked(<vscale x 32 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv32i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 32 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <vscale x 32 x i16> %elt.head, <vscale x 32 x i16> poison, <vscale x 32 x i32> zeroinitializer
+  %head = insertelement <vscale x 32 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 32 x i1> %head, <vscale x 32 x i1> poison, <vscale x 32 x i32> zeroinitializer
+  %v = call <vscale x 32 x i16> @llvm.vp.ssub.sat.nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i16> %vb, <vscale x 32 x i1> %m, i32 %evl)
+  ret <vscale x 32 x i16> %v
+}
+
+declare <vscale x 1 x i32> @llvm.vp.ssub.sat.nxv1i32(<vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i1>, i32)
+
+define <vscale x 1 x i32> @vssub_vv_nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i32> %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv1i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 1 x i32> @llvm.vp.ssub.sat.nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i32> %b, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i32> %v
+}
+
+define <vscale x 1 x i32> @vssub_vv_nxv1i32_unmasked(<vscale x 1 x i32> %va, <vscale x 1 x i32> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv1i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i32> @llvm.vp.ssub.sat.nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i32> %b, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i32> %v
+}
+
+define <vscale x 1 x i32> @vssub_vx_nxv1i32(<vscale x 1 x i32> %va, i32 %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv1i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <vscale x 1 x i32> %elt.head, <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i32> @llvm.vp.ssub.sat.nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i32> %vb, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i32> %v
+}
+
+define <vscale x 1 x i32> @vssub_vx_nxv1i32_unmasked(<vscale x 1 x i32> %va, i32 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv1i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <vscale x 1 x i32> %elt.head, <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer
+  %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i32> @llvm.vp.ssub.sat.nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i32> %vb, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i32> %v
+}
+
+define <vscale x 1 x i32> @vssub_vi_nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv1i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <vscale x 1 x i32> %elt.head, <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i32> @llvm.vp.ssub.sat.nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i32> %vb, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i32> %v
+}
+
+define <vscale x 1 x i32> @vssub_vi_nxv1i32_unmasked(<vscale x 1 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv1i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <vscale x 1 x i32> %elt.head, <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer
+  %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i32> @llvm.vp.ssub.sat.nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i32> %vb, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i32> %v
+}
+
+declare <vscale x 2 x i32> @llvm.vp.ssub.sat.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i1>, i32)
+
+define <vscale x 2 x i32> @vssub_vv_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv2i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 2 x i32> @llvm.vp.ssub.sat.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %b, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 2 x i32> @vssub_vv_nxv2i32_unmasked(<vscale x 2 x i32> %va, <vscale x 2 x i32> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv2i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i32> @llvm.vp.ssub.sat.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %b, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 2 x i32> @vssub_vx_nxv2i32(<vscale x 2 x i32> %va, i32 %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv2i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 2 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <vscale x 2 x i32> %elt.head, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i32> @llvm.vp.ssub.sat.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %vb, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 2 x i32> @vssub_vx_nxv2i32_unmasked(<vscale x 2 x i32> %va, i32 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv2i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 2 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <vscale x 2 x i32> %elt.head, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+  %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i32> @llvm.vp.ssub.sat.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %vb, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 2 x i32> @vssub_vi_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv2i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 2 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <vscale x 2 x i32> %elt.head, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i32> @llvm.vp.ssub.sat.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %vb, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 2 x i32> @vssub_vi_nxv2i32_unmasked(<vscale x 2 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv2i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 2 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <vscale x 2 x i32> %elt.head, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+  %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i32> @llvm.vp.ssub.sat.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %vb, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i32> %v
+}
+
+declare <vscale x 4 x i32> @llvm.vp.ssub.sat.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, i32)
+
+define <vscale x 4 x i32> @vssub_vv_nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i32> %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v10, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 4 x i32> @llvm.vp.ssub.sat.nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i32> %b, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i32> %v
+}
+
+define <vscale x 4 x i32> @vssub_vv_nxv4i32_unmasked(<vscale x 4 x i32> %va, <vscale x 4 x i32> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv4i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v10
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i32> @llvm.vp.ssub.sat.nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i32> %b, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i32> %v
+}
+
+define <vscale x 4 x i32> @vssub_vx_nxv4i32(<vscale x 4 x i32> %va, i32 %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 4 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <vscale x 4 x i32> %elt.head, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i32> @llvm.vp.ssub.sat.nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i32> %vb, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i32> %v
+}
+
+define <vscale x 4 x i32> @vssub_vx_nxv4i32_unmasked(<vscale x 4 x i32> %va, i32 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv4i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 4 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <vscale x 4 x i32> %elt.head, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+  %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i32> @llvm.vp.ssub.sat.nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i32> %vb, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i32> %v
+}
+
+define <vscale x 4 x i32> @vssub_vi_nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 4 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <vscale x 4 x i32> %elt.head, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i32> @llvm.vp.ssub.sat.nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i32> %vb, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i32> %v
+}
+
+define <vscale x 4 x i32> @vssub_vi_nxv4i32_unmasked(<vscale x 4 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv4i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 4 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <vscale x 4 x i32> %elt.head, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+  %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i32> @llvm.vp.ssub.sat.nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i32> %vb, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i32> %v
+}
+
+declare <vscale x 8 x i32> @llvm.vp.ssub.sat.nxv8i32(<vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i1>, i32)
+
+define <vscale x 8 x i32> @vssub_vv_nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i32> %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv8i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v12, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 8 x i32> @llvm.vp.ssub.sat.nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i32> %b, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i32> %v
+}
+
+define <vscale x 8 x i32> @vssub_vv_nxv8i32_unmasked(<vscale x 8 x i32> %va, <vscale x 8 x i32> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv8i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v12
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i32> @llvm.vp.ssub.sat.nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i32> %b, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i32> %v
+}
+
+define <vscale x 8 x i32> @vssub_vx_nxv8i32(<vscale x 8 x i32> %va, i32 %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv8i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <vscale x 8 x i32> %elt.head, <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i32> @llvm.vp.ssub.sat.nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i32> %vb, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i32> %v
+}
+
+define <vscale x 8 x i32> @vssub_vx_nxv8i32_unmasked(<vscale x 8 x i32> %va, i32 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv8i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <vscale x 8 x i32> %elt.head, <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
+  %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i32> @llvm.vp.ssub.sat.nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i32> %vb, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i32> %v
+}
+
+define <vscale x 8 x i32> @vssub_vi_nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv8i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <vscale x 8 x i32> %elt.head, <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i32> @llvm.vp.ssub.sat.nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i32> %vb, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i32> %v
+}
+
+define <vscale x 8 x i32> @vssub_vi_nxv8i32_unmasked(<vscale x 8 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv8i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <vscale x 8 x i32> %elt.head, <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
+  %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i32> @llvm.vp.ssub.sat.nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i32> %vb, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i32> %v
+}
+
+declare <vscale x 16 x i32> @llvm.vp.ssub.sat.nxv16i32(<vscale x 16 x i32>, <vscale x 16 x i32>, <vscale x 16 x i1>, i32)
+
+define <vscale x 16 x i32> @vssub_vv_nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i32> %b, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv16i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 16 x i32> @llvm.vp.ssub.sat.nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i32> %b, <vscale x 16 x i1> %m, i32 %evl)
+  ret <vscale x 16 x i32> %v
+}
+
+define <vscale x 16 x i32> @vssub_vv_nxv16i32_unmasked(<vscale x 16 x i32> %va, <vscale x 16 x i32> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv16i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v16
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+  %v = call <vscale x 16 x i32> @llvm.vp.ssub.sat.nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i32> %b, <vscale x 16 x i1> %m, i32 %evl)
+  ret <vscale x 16 x i32> %v
+}
+
+define <vscale x 16 x i32> @vssub_vx_nxv16i32(<vscale x 16 x i32> %va, i32 %b, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv16i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 16 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <vscale x 16 x i32> %elt.head, <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+  %v = call <vscale x 16 x i32> @llvm.vp.ssub.sat.nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i32> %vb, <vscale x 16 x i1> %m, i32 %evl)
+  ret <vscale x 16 x i32> %v
+}
+
+define <vscale x 16 x i32> @vssub_vx_nxv16i32_unmasked(<vscale x 16 x i32> %va, i32 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv16i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 16 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <vscale x 16 x i32> %elt.head, <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+  %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+  %v = call <vscale x 16 x i32> @llvm.vp.ssub.sat.nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i32> %vb, <vscale x 16 x i1> %m, i32 %evl)
+  ret <vscale x 16 x i32> %v
+}
+
+define <vscale x 16 x i32> @vssub_vi_nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv16i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 16 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <vscale x 16 x i32> %elt.head, <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+  %v = call <vscale x 16 x i32> @llvm.vp.ssub.sat.nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i32> %vb, <vscale x 16 x i1> %m, i32 %evl)
+  ret <vscale x 16 x i32> %v
+}
+
+define <vscale x 16 x i32> @vssub_vi_nxv16i32_unmasked(<vscale x 16 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv16i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 16 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <vscale x 16 x i32> %elt.head, <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+  %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+  %v = call <vscale x 16 x i32> @llvm.vp.ssub.sat.nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i32> %vb, <vscale x 16 x i1> %m, i32 %evl)
+  ret <vscale x 16 x i32> %v
+}
+
+; Test that split-legalization works then the mask needs manual splitting.
+
+declare <vscale x 32 x i32> @llvm.vp.ssub.sat.nxv32i32(<vscale x 32 x i32>, <vscale x 32 x i32>, <vscale x 32 x i1>, i32)
+
+define <vscale x 32 x i32> @vssub_vi_nxv32i32(<vscale x 32 x i32> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv32i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v24, v0
+; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    srli a1, a2, 2
+; CHECK-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vslidedown.vx v0, v0, a1
+; CHECK-NEXT:    slli a2, a2, 1
+; CHECK-NEXT:    sub a1, a0, a2
+; CHECK-NEXT:    sltu a3, a0, a1
+; CHECK-NEXT:    addi a3, a3, -1
+; CHECK-NEXT:    and a3, a3, a1
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT:    vssub.vx v16, v16, a1, v0.t
+; CHECK-NEXT:    bltu a0, a2, .LBB118_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    mv a0, a2
+; CHECK-NEXT:  .LBB118_2:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 32 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <vscale x 32 x i32> %elt.head, <vscale x 32 x i32> poison, <vscale x 32 x i32> zeroinitializer
+  %v = call <vscale x 32 x i32> @llvm.vp.ssub.sat.nxv32i32(<vscale x 32 x i32> %va, <vscale x 32 x i32> %vb, <vscale x 32 x i1> %m, i32 %evl)
+  ret <vscale x 32 x i32> %v
+}
+
+define <vscale x 32 x i32> @vssub_vi_nxv32i32_unmasked(<vscale x 32 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv32i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 1
+; CHECK-NEXT:    sub a2, a0, a1
+; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    addi a3, a3, -1
+; CHECK-NEXT:    and a3, a3, a2
+; CHECK-NEXT:    li a2, -1
+; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT:    vssub.vx v16, v16, a2
+; CHECK-NEXT:    bltu a0, a1, .LBB119_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    mv a0, a1
+; CHECK-NEXT:  .LBB119_2:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a2
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 32 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <vscale x 32 x i32> %elt.head, <vscale x 32 x i32> poison, <vscale x 32 x i32> zeroinitializer
+  %head = insertelement <vscale x 32 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 32 x i1> %head, <vscale x 32 x i1> poison, <vscale x 32 x i32> zeroinitializer
+  %v = call <vscale x 32 x i32> @llvm.vp.ssub.sat.nxv32i32(<vscale x 32 x i32> %va, <vscale x 32 x i32> %vb, <vscale x 32 x i1> %m, i32 %evl)
+  ret <vscale x 32 x i32> %v
+}
+
+declare <vscale x 1 x i64> @llvm.vp.ssub.sat.nxv1i64(<vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i1>, i32)
+
+define <vscale x 1 x i64> @vssub_vv_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv1i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 1 x i64> @llvm.vp.ssub.sat.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %b, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i64> %v
+}
+
+define <vscale x 1 x i64> @vssub_vv_nxv1i64_unmasked(<vscale x 1 x i64> %va, <vscale x 1 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv1i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i64> @llvm.vp.ssub.sat.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %b, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i64> %v
+}
+
+define <vscale x 1 x i64> @vssub_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vssub_vx_nxv1i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
+; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vssub.vv v8, v8, v9, v0.t
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vssub_vx_nxv1i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; RV64-NEXT:    vssub.vx v8, v8, a0, v0.t
+; RV64-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i64> poison, i64 %b, i32 0
+  %vb = shufflevector <vscale x 1 x i64> %elt.head, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i64> @llvm.vp.ssub.sat.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %vb, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i64> %v
+}
+
+define <vscale x 1 x i64> @vssub_vx_nxv1i64_unmasked(<vscale x 1 x i64> %va, i64 %b, i32 zeroext %evl) {
+; RV32-LABEL: vssub_vx_nxv1i64_unmasked:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
+; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vssub.vv v8, v8, v9
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vssub_vx_nxv1i64_unmasked:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; RV64-NEXT:    vssub.vx v8, v8, a0
+; RV64-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i64> poison, i64 %b, i32 0
+  %vb = shufflevector <vscale x 1 x i64> %elt.head, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+  %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i64> @llvm.vp.ssub.sat.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %vb, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i64> %v
+}
+
+define <vscale x 1 x i64> @vssub_vi_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv1i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <vscale x 1 x i64> %elt.head, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i64> @llvm.vp.ssub.sat.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %vb, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i64> %v
+}
+
+define <vscale x 1 x i64> @vssub_vi_nxv1i64_unmasked(<vscale x 1 x i64> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv1i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <vscale x 1 x i64> %elt.head, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+  %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i64> @llvm.vp.ssub.sat.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %vb, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i64> %v
+}
+
+declare <vscale x 2 x i64> @llvm.vp.ssub.sat.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i1>, i32)
+
+define <vscale x 2 x i64> @vssub_vv_nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i64> %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv2i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v10, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 2 x i64> @llvm.vp.ssub.sat.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i64> %b, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i64> %v
+}
+
+define <vscale x 2 x i64> @vssub_vv_nxv2i64_unmasked(<vscale x 2 x i64> %va, <vscale x 2 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv2i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v10
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i64> @llvm.vp.ssub.sat.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i64> %b, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i64> %v
+}
+
+define <vscale x 2 x i64> @vssub_vx_nxv2i64(<vscale x 2 x i64> %va, i64 %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vssub_vx_nxv2i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
+; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vssub.vv v8, v8, v10, v0.t
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vssub_vx_nxv2i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; RV64-NEXT:    vssub.vx v8, v8, a0, v0.t
+; RV64-NEXT:    ret
+  %elt.head = insertelement <vscale x 2 x i64> poison, i64 %b, i32 0
+  %vb = shufflevector <vscale x 2 x i64> %elt.head, <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i64> @llvm.vp.ssub.sat.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i64> %vb, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i64> %v
+}
+
+define <vscale x 2 x i64> @vssub_vx_nxv2i64_unmasked(<vscale x 2 x i64> %va, i64 %b, i32 zeroext %evl) {
+; RV32-LABEL: vssub_vx_nxv2i64_unmasked:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
+; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vssub.vv v8, v8, v10
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vssub_vx_nxv2i64_unmasked:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; RV64-NEXT:    vssub.vx v8, v8, a0
+; RV64-NEXT:    ret
+  %elt.head = insertelement <vscale x 2 x i64> poison, i64 %b, i32 0
+  %vb = shufflevector <vscale x 2 x i64> %elt.head, <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+  %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i64> @llvm.vp.ssub.sat.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i64> %vb, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i64> %v
+}
+
+define <vscale x 2 x i64> @vssub_vi_nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv2i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 2 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <vscale x 2 x i64> %elt.head, <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i64> @llvm.vp.ssub.sat.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i64> %vb, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i64> %v
+}
+
+define <vscale x 2 x i64> @vssub_vi_nxv2i64_unmasked(<vscale x 2 x i64> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv2i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 2 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <vscale x 2 x i64> %elt.head, <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+  %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i64> @llvm.vp.ssub.sat.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i64> %vb, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i64> %v
+}
+
+declare <vscale x 4 x i64> @llvm.vp.ssub.sat.nxv4i64(<vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i1>, i32)
+
+define <vscale x 4 x i64> @vssub_vv_nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i64> %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv4i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v12, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 4 x i64> @llvm.vp.ssub.sat.nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i64> %b, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i64> %v
+}
+
+define <vscale x 4 x i64> @vssub_vv_nxv4i64_unmasked(<vscale x 4 x i64> %va, <vscale x 4 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv4i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v12
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i64> @llvm.vp.ssub.sat.nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i64> %b, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i64> %v
+}
+
+define <vscale x 4 x i64> @vssub_vx_nxv4i64(<vscale x 4 x i64> %va, i64 %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vssub_vx_nxv4i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
+; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vssub.vv v8, v8, v12, v0.t
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vssub_vx_nxv4i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; RV64-NEXT:    vssub.vx v8, v8, a0, v0.t
+; RV64-NEXT:    ret
+  %elt.head = insertelement <vscale x 4 x i64> poison, i64 %b, i32 0
+  %vb = shufflevector <vscale x 4 x i64> %elt.head, <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i64> @llvm.vp.ssub.sat.nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i64> %vb, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i64> %v
+}
+
+define <vscale x 4 x i64> @vssub_vx_nxv4i64_unmasked(<vscale x 4 x i64> %va, i64 %b, i32 zeroext %evl) {
+; RV32-LABEL: vssub_vx_nxv4i64_unmasked:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
+; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vssub.vv v8, v8, v12
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vssub_vx_nxv4i64_unmasked:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; RV64-NEXT:    vssub.vx v8, v8, a0
+; RV64-NEXT:    ret
+  %elt.head = insertelement <vscale x 4 x i64> poison, i64 %b, i32 0
+  %vb = shufflevector <vscale x 4 x i64> %elt.head, <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+  %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i64> @llvm.vp.ssub.sat.nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i64> %vb, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i64> %v
+}
+
+define <vscale x 4 x i64> @vssub_vi_nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv4i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 4 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <vscale x 4 x i64> %elt.head, <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i64> @llvm.vp.ssub.sat.nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i64> %vb, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i64> %v
+}
+
+define <vscale x 4 x i64> @vssub_vi_nxv4i64_unmasked(<vscale x 4 x i64> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv4i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 4 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <vscale x 4 x i64> %elt.head, <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+  %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i64> @llvm.vp.ssub.sat.nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i64> %vb, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i64> %v
+}
+
+declare <vscale x 8 x i64> @llvm.vp.ssub.sat.nxv8i64(<vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i1>, i32)
+
+define <vscale x 8 x i64> @vssub_vv_nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i64> %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv8i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 8 x i64> @llvm.vp.ssub.sat.nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i64> %b, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i64> %v
+}
+
+define <vscale x 8 x i64> @vssub_vv_nxv8i64_unmasked(<vscale x 8 x i64> %va, <vscale x 8 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv8i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT:    vssub.vv v8, v8, v16
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i64> @llvm.vp.ssub.sat.nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i64> %b, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i64> %v
+}
+
+define <vscale x 8 x i64> @vssub_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vssub_vx_nxv8i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
+; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vssub.vv v8, v8, v16, v0.t
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vssub_vx_nxv8i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vssub.vx v8, v8, a0, v0.t
+; RV64-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i64> poison, i64 %b, i32 0
+  %vb = shufflevector <vscale x 8 x i64> %elt.head, <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i64> @llvm.vp.ssub.sat.nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i64> %vb, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i64> %v
+}
+
+define <vscale x 8 x i64> @vssub_vx_nxv8i64_unmasked(<vscale x 8 x i64> %va, i64 %b, i32 zeroext %evl) {
+; RV32-LABEL: vssub_vx_nxv8i64_unmasked:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
+; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vssub.vv v8, v8, v16
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vssub_vx_nxv8i64_unmasked:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vssub.vx v8, v8, a0
+; RV64-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i64> poison, i64 %b, i32 0
+  %vb = shufflevector <vscale x 8 x i64> %elt.head, <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
+  %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i64> @llvm.vp.ssub.sat.nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i64> %vb, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i64> %v
+}
+
+define <vscale x 8 x i64> @vssub_vi_nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv8i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <vscale x 8 x i64> %elt.head, <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i64> @llvm.vp.ssub.sat.nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i64> %vb, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i64> %v
+}
+
+define <vscale x 8 x i64> @vssub_vi_nxv8i64_unmasked(<vscale x 8 x i64> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv8i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT:    vssub.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <vscale x 8 x i64> %elt.head, <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
+  %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i64> @llvm.vp.ssub.sat.nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i64> %vb, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i64> %v
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vssubu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vssubu-vp.ll
new file mode 100644
index 0000000..e5589ce
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vssubu-vp.ll
@@ -0,0 +1,2065 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s --check-prefixes=CHECK,RV64
+
+declare <vscale x 8 x i7> @llvm.vp.usub.sat.nxv8i7(<vscale x 8 x i7>, <vscale x 8 x i7>, <vscale x 8 x i1>, i32)
+
+define <vscale x 8 x i7> @vssubu_vx_nxv8i7(<vscale x 8 x i7> %a, i7 signext %b, <vscale x 8 x i1> %mask, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv8i7:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a2, 127
+; CHECK-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vand.vx v8, v8, a2
+; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vand.vx v9, v9, a2
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i7> poison, i7 %b, i32 0
+  %vb = shufflevector <vscale x 8 x i7> %elt.head, <vscale x 8 x i7> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i7> @llvm.vp.usub.sat.nxv8i7(<vscale x 8 x i7> %a, <vscale x 8 x i7> %vb, <vscale x 8 x i1> %mask, i32 %evl)
+  ret <vscale x 8 x i7> %v
+}
+
+declare <vscale x 1 x i8> @llvm.vp.usub.sat.nxv1i8(<vscale x 1 x i8>, <vscale x 1 x i8>, <vscale x 1 x i1>, i32)
+
+define <vscale x 1 x i8> @vssubu_vv_nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i8> %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv1i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 1 x i8> @llvm.vp.usub.sat.nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i8> %b, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i8> %v
+}
+
+define <vscale x 1 x i8> @vssubu_vv_nxv1i8_unmasked(<vscale x 1 x i8> %va, <vscale x 1 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv1i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i8> @llvm.vp.usub.sat.nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i8> %b, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i8> %v
+}
+
+define <vscale x 1 x i8> @vssubu_vx_nxv1i8(<vscale x 1 x i8> %va, i8 %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv1i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 1 x i8> %elt.head, <vscale x 1 x i8> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i8> @llvm.vp.usub.sat.nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i8> %vb, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i8> %v
+}
+
+define <vscale x 1 x i8> @vssubu_vx_nxv1i8_commute(<vscale x 1 x i8> %va, i8 %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv1i8_commute:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a2, zero, e8, mf8, ta, ma
+; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v9, v8, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 1 x i8> %elt.head, <vscale x 1 x i8> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i8> @llvm.vp.usub.sat.nxv1i8(<vscale x 1 x i8> %vb, <vscale x 1 x i8> %va, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i8> %v
+}
+
+define <vscale x 1 x i8> @vssubu_vx_nxv1i8_unmasked(<vscale x 1 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv1i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 1 x i8> %elt.head, <vscale x 1 x i8> poison, <vscale x 1 x i32> zeroinitializer
+  %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i8> @llvm.vp.usub.sat.nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i8> %vb, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i8> %v
+}
+
+define <vscale x 1 x i8> @vssubu_vi_nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv1i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <vscale x 1 x i8> %elt.head, <vscale x 1 x i8> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i8> @llvm.vp.usub.sat.nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i8> %vb, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i8> %v
+}
+
+define <vscale x 1 x i8> @vssubu_vi_nxv1i8_unmasked(<vscale x 1 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv1i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <vscale x 1 x i8> %elt.head, <vscale x 1 x i8> poison, <vscale x 1 x i32> zeroinitializer
+  %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i8> @llvm.vp.usub.sat.nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i8> %vb, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i8> %v
+}
+
+declare <vscale x 2 x i8> @llvm.vp.usub.sat.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i8>, <vscale x 2 x i1>, i32)
+
+define <vscale x 2 x i8> @vssubu_vv_nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i8> %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv2i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 2 x i8> @llvm.vp.usub.sat.nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i8> %b, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i8> %v
+}
+
+define <vscale x 2 x i8> @vssubu_vv_nxv2i8_unmasked(<vscale x 2 x i8> %va, <vscale x 2 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv2i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i8> @llvm.vp.usub.sat.nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i8> %b, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i8> %v
+}
+
+define <vscale x 2 x i8> @vssubu_vx_nxv2i8(<vscale x 2 x i8> %va, i8 %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv2i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 2 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 2 x i8> %elt.head, <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i8> @llvm.vp.usub.sat.nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i8> %vb, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i8> %v
+}
+
+define <vscale x 2 x i8> @vssubu_vx_nxv2i8_unmasked(<vscale x 2 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv2i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 2 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 2 x i8> %elt.head, <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer
+  %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i8> @llvm.vp.usub.sat.nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i8> %vb, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i8> %v
+}
+
+define <vscale x 2 x i8> @vssubu_vi_nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv2i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 2 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <vscale x 2 x i8> %elt.head, <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i8> @llvm.vp.usub.sat.nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i8> %vb, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i8> %v
+}
+
+define <vscale x 2 x i8> @vssubu_vi_nxv2i8_unmasked(<vscale x 2 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv2i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 2 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <vscale x 2 x i8> %elt.head, <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer
+  %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i8> @llvm.vp.usub.sat.nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i8> %vb, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i8> %v
+}
+
+declare <vscale x 3 x i8> @llvm.vp.usub.sat.nxv3i8(<vscale x 3 x i8>, <vscale x 3 x i8>, <vscale x 3 x i1>, i32)
+
+define <vscale x 3 x i8> @vssubu_vv_nxv3i8(<vscale x 3 x i8> %va, <vscale x 3 x i8> %b, <vscale x 3 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv3i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 3 x i8> @llvm.vp.usub.sat.nxv3i8(<vscale x 3 x i8> %va, <vscale x 3 x i8> %b, <vscale x 3 x i1> %m, i32 %evl)
+  ret <vscale x 3 x i8> %v
+}
+
+define <vscale x 3 x i8> @vssubu_vv_nxv3i8_unmasked(<vscale x 3 x i8> %va, <vscale x 3 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv3i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 3 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 3 x i1> %head, <vscale x 3 x i1> poison, <vscale x 3 x i32> zeroinitializer
+  %v = call <vscale x 3 x i8> @llvm.vp.usub.sat.nxv3i8(<vscale x 3 x i8> %va, <vscale x 3 x i8> %b, <vscale x 3 x i1> %m, i32 %evl)
+  ret <vscale x 3 x i8> %v
+}
+
+define <vscale x 3 x i8> @vssubu_vx_nxv3i8(<vscale x 3 x i8> %va, i8 %b, <vscale x 3 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv3i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 3 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 3 x i8> %elt.head, <vscale x 3 x i8> poison, <vscale x 3 x i32> zeroinitializer
+  %v = call <vscale x 3 x i8> @llvm.vp.usub.sat.nxv3i8(<vscale x 3 x i8> %va, <vscale x 3 x i8> %vb, <vscale x 3 x i1> %m, i32 %evl)
+  ret <vscale x 3 x i8> %v
+}
+
+define <vscale x 3 x i8> @vssubu_vx_nxv3i8_unmasked(<vscale x 3 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv3i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 3 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 3 x i8> %elt.head, <vscale x 3 x i8> poison, <vscale x 3 x i32> zeroinitializer
+  %head = insertelement <vscale x 3 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 3 x i1> %head, <vscale x 3 x i1> poison, <vscale x 3 x i32> zeroinitializer
+  %v = call <vscale x 3 x i8> @llvm.vp.usub.sat.nxv3i8(<vscale x 3 x i8> %va, <vscale x 3 x i8> %vb, <vscale x 3 x i1> %m, i32 %evl)
+  ret <vscale x 3 x i8> %v
+}
+
+define <vscale x 3 x i8> @vssubu_vi_nxv3i8(<vscale x 3 x i8> %va, <vscale x 3 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv3i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 3 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <vscale x 3 x i8> %elt.head, <vscale x 3 x i8> poison, <vscale x 3 x i32> zeroinitializer
+  %v = call <vscale x 3 x i8> @llvm.vp.usub.sat.nxv3i8(<vscale x 3 x i8> %va, <vscale x 3 x i8> %vb, <vscale x 3 x i1> %m, i32 %evl)
+  ret <vscale x 3 x i8> %v
+}
+
+define <vscale x 3 x i8> @vssubu_vi_nxv3i8_unmasked(<vscale x 3 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv3i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 3 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <vscale x 3 x i8> %elt.head, <vscale x 3 x i8> poison, <vscale x 3 x i32> zeroinitializer
+  %head = insertelement <vscale x 3 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 3 x i1> %head, <vscale x 3 x i1> poison, <vscale x 3 x i32> zeroinitializer
+  %v = call <vscale x 3 x i8> @llvm.vp.usub.sat.nxv3i8(<vscale x 3 x i8> %va, <vscale x 3 x i8> %vb, <vscale x 3 x i1> %m, i32 %evl)
+  ret <vscale x 3 x i8> %v
+}
+
+declare <vscale x 4 x i8> @llvm.vp.usub.sat.nxv4i8(<vscale x 4 x i8>, <vscale x 4 x i8>, <vscale x 4 x i1>, i32)
+
+define <vscale x 4 x i8> @vssubu_vv_nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i8> %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv4i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 4 x i8> @llvm.vp.usub.sat.nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i8> %b, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i8> %v
+}
+
+define <vscale x 4 x i8> @vssubu_vv_nxv4i8_unmasked(<vscale x 4 x i8> %va, <vscale x 4 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv4i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i8> @llvm.vp.usub.sat.nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i8> %b, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i8> %v
+}
+
+define <vscale x 4 x i8> @vssubu_vx_nxv4i8(<vscale x 4 x i8> %va, i8 %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv4i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 4 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 4 x i8> %elt.head, <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i8> @llvm.vp.usub.sat.nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i8> %vb, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i8> %v
+}
+
+define <vscale x 4 x i8> @vssubu_vx_nxv4i8_unmasked(<vscale x 4 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv4i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 4 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 4 x i8> %elt.head, <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer
+  %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i8> @llvm.vp.usub.sat.nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i8> %vb, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i8> %v
+}
+
+define <vscale x 4 x i8> @vssubu_vi_nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv4i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 4 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <vscale x 4 x i8> %elt.head, <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i8> @llvm.vp.usub.sat.nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i8> %vb, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i8> %v
+}
+
+define <vscale x 4 x i8> @vssubu_vi_nxv4i8_unmasked(<vscale x 4 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv4i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 4 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <vscale x 4 x i8> %elt.head, <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer
+  %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i8> @llvm.vp.usub.sat.nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i8> %vb, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i8> %v
+}
+
+declare <vscale x 8 x i8> @llvm.vp.usub.sat.nxv8i8(<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i1>, i32)
+
+define <vscale x 8 x i8> @vssubu_vv_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv8i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 8 x i8> @llvm.vp.usub.sat.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %b, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i8> %v
+}
+
+define <vscale x 8 x i8> @vssubu_vv_nxv8i8_unmasked(<vscale x 8 x i8> %va, <vscale x 8 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv8i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i8> @llvm.vp.usub.sat.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %b, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i8> %v
+}
+
+define <vscale x 8 x i8> @vssubu_vx_nxv8i8(<vscale x 8 x i8> %va, i8 %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv8i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 8 x i8> %elt.head, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i8> @llvm.vp.usub.sat.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %vb, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i8> %v
+}
+
+define <vscale x 8 x i8> @vssubu_vx_nxv8i8_unmasked(<vscale x 8 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv8i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 8 x i8> %elt.head, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
+  %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i8> @llvm.vp.usub.sat.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %vb, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i8> %v
+}
+
+define <vscale x 8 x i8> @vssubu_vi_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv8i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <vscale x 8 x i8> %elt.head, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i8> @llvm.vp.usub.sat.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %vb, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i8> %v
+}
+
+define <vscale x 8 x i8> @vssubu_vi_nxv8i8_unmasked(<vscale x 8 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv8i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <vscale x 8 x i8> %elt.head, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
+  %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i8> @llvm.vp.usub.sat.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %vb, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i8> %v
+}
+
+declare <vscale x 16 x i8> @llvm.vp.usub.sat.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i1>, i32)
+
+define <vscale x 16 x i8> @vssubu_vv_nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i8> %b, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv16i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v10, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 16 x i8> @llvm.vp.usub.sat.nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i8> %b, <vscale x 16 x i1> %m, i32 %evl)
+  ret <vscale x 16 x i8> %v
+}
+
+define <vscale x 16 x i8> @vssubu_vv_nxv16i8_unmasked(<vscale x 16 x i8> %va, <vscale x 16 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv16i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v10
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+  %v = call <vscale x 16 x i8> @llvm.vp.usub.sat.nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i8> %b, <vscale x 16 x i1> %m, i32 %evl)
+  ret <vscale x 16 x i8> %v
+}
+
+define <vscale x 16 x i8> @vssubu_vx_nxv16i8(<vscale x 16 x i8> %va, i8 %b, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv16i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 16 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 16 x i8> %elt.head, <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+  %v = call <vscale x 16 x i8> @llvm.vp.usub.sat.nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i8> %vb, <vscale x 16 x i1> %m, i32 %evl)
+  ret <vscale x 16 x i8> %v
+}
+
+define <vscale x 16 x i8> @vssubu_vx_nxv16i8_unmasked(<vscale x 16 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv16i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 16 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 16 x i8> %elt.head, <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+  %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+  %v = call <vscale x 16 x i8> @llvm.vp.usub.sat.nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i8> %vb, <vscale x 16 x i1> %m, i32 %evl)
+  ret <vscale x 16 x i8> %v
+}
+
+define <vscale x 16 x i8> @vssubu_vi_nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv16i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 16 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <vscale x 16 x i8> %elt.head, <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+  %v = call <vscale x 16 x i8> @llvm.vp.usub.sat.nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i8> %vb, <vscale x 16 x i1> %m, i32 %evl)
+  ret <vscale x 16 x i8> %v
+}
+
+define <vscale x 16 x i8> @vssubu_vi_nxv16i8_unmasked(<vscale x 16 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv16i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 16 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <vscale x 16 x i8> %elt.head, <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+  %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+  %v = call <vscale x 16 x i8> @llvm.vp.usub.sat.nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i8> %vb, <vscale x 16 x i1> %m, i32 %evl)
+  ret <vscale x 16 x i8> %v
+}
+
+declare <vscale x 32 x i8> @llvm.vp.usub.sat.nxv32i8(<vscale x 32 x i8>, <vscale x 32 x i8>, <vscale x 32 x i1>, i32)
+
+define <vscale x 32 x i8> @vssubu_vv_nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i8> %b, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv32i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v12, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 32 x i8> @llvm.vp.usub.sat.nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i8> %b, <vscale x 32 x i1> %m, i32 %evl)
+  ret <vscale x 32 x i8> %v
+}
+
+define <vscale x 32 x i8> @vssubu_vv_nxv32i8_unmasked(<vscale x 32 x i8> %va, <vscale x 32 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv32i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v12
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 32 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 32 x i1> %head, <vscale x 32 x i1> poison, <vscale x 32 x i32> zeroinitializer
+  %v = call <vscale x 32 x i8> @llvm.vp.usub.sat.nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i8> %b, <vscale x 32 x i1> %m, i32 %evl)
+  ret <vscale x 32 x i8> %v
+}
+
+define <vscale x 32 x i8> @vssubu_vx_nxv32i8(<vscale x 32 x i8> %va, i8 %b, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv32i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 32 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 32 x i8> %elt.head, <vscale x 32 x i8> poison, <vscale x 32 x i32> zeroinitializer
+  %v = call <vscale x 32 x i8> @llvm.vp.usub.sat.nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i8> %vb, <vscale x 32 x i1> %m, i32 %evl)
+  ret <vscale x 32 x i8> %v
+}
+
+define <vscale x 32 x i8> @vssubu_vx_nxv32i8_unmasked(<vscale x 32 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv32i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 32 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 32 x i8> %elt.head, <vscale x 32 x i8> poison, <vscale x 32 x i32> zeroinitializer
+  %head = insertelement <vscale x 32 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 32 x i1> %head, <vscale x 32 x i1> poison, <vscale x 32 x i32> zeroinitializer
+  %v = call <vscale x 32 x i8> @llvm.vp.usub.sat.nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i8> %vb, <vscale x 32 x i1> %m, i32 %evl)
+  ret <vscale x 32 x i8> %v
+}
+
+define <vscale x 32 x i8> @vssubu_vi_nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv32i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 32 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <vscale x 32 x i8> %elt.head, <vscale x 32 x i8> poison, <vscale x 32 x i32> zeroinitializer
+  %v = call <vscale x 32 x i8> @llvm.vp.usub.sat.nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i8> %vb, <vscale x 32 x i1> %m, i32 %evl)
+  ret <vscale x 32 x i8> %v
+}
+
+define <vscale x 32 x i8> @vssubu_vi_nxv32i8_unmasked(<vscale x 32 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv32i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 32 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <vscale x 32 x i8> %elt.head, <vscale x 32 x i8> poison, <vscale x 32 x i32> zeroinitializer
+  %head = insertelement <vscale x 32 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 32 x i1> %head, <vscale x 32 x i1> poison, <vscale x 32 x i32> zeroinitializer
+  %v = call <vscale x 32 x i8> @llvm.vp.usub.sat.nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i8> %vb, <vscale x 32 x i1> %m, i32 %evl)
+  ret <vscale x 32 x i8> %v
+}
+
+declare <vscale x 64 x i8> @llvm.vp.usub.sat.nxv64i8(<vscale x 64 x i8>, <vscale x 64 x i8>, <vscale x 64 x i1>, i32)
+
+define <vscale x 64 x i8> @vssubu_vv_nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i8> %b, <vscale x 64 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv64i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 64 x i8> @llvm.vp.usub.sat.nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i8> %b, <vscale x 64 x i1> %m, i32 %evl)
+  ret <vscale x 64 x i8> %v
+}
+
+define <vscale x 64 x i8> @vssubu_vv_nxv64i8_unmasked(<vscale x 64 x i8> %va, <vscale x 64 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv64i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v16
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 64 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 64 x i1> %head, <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer
+  %v = call <vscale x 64 x i8> @llvm.vp.usub.sat.nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i8> %b, <vscale x 64 x i1> %m, i32 %evl)
+  ret <vscale x 64 x i8> %v
+}
+
+define <vscale x 64 x i8> @vssubu_vx_nxv64i8(<vscale x 64 x i8> %va, i8 %b, <vscale x 64 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv64i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 64 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 64 x i8> %elt.head, <vscale x 64 x i8> poison, <vscale x 64 x i32> zeroinitializer
+  %v = call <vscale x 64 x i8> @llvm.vp.usub.sat.nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i8> %vb, <vscale x 64 x i1> %m, i32 %evl)
+  ret <vscale x 64 x i8> %v
+}
+
+define <vscale x 64 x i8> @vssubu_vx_nxv64i8_unmasked(<vscale x 64 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv64i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 64 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 64 x i8> %elt.head, <vscale x 64 x i8> poison, <vscale x 64 x i32> zeroinitializer
+  %head = insertelement <vscale x 64 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 64 x i1> %head, <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer
+  %v = call <vscale x 64 x i8> @llvm.vp.usub.sat.nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i8> %vb, <vscale x 64 x i1> %m, i32 %evl)
+  ret <vscale x 64 x i8> %v
+}
+
+define <vscale x 64 x i8> @vssubu_vi_nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv64i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 64 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <vscale x 64 x i8> %elt.head, <vscale x 64 x i8> poison, <vscale x 64 x i32> zeroinitializer
+  %v = call <vscale x 64 x i8> @llvm.vp.usub.sat.nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i8> %vb, <vscale x 64 x i1> %m, i32 %evl)
+  ret <vscale x 64 x i8> %v
+}
+
+define <vscale x 64 x i8> @vssubu_vi_nxv64i8_unmasked(<vscale x 64 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv64i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 64 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <vscale x 64 x i8> %elt.head, <vscale x 64 x i8> poison, <vscale x 64 x i32> zeroinitializer
+  %head = insertelement <vscale x 64 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 64 x i1> %head, <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer
+  %v = call <vscale x 64 x i8> @llvm.vp.usub.sat.nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i8> %vb, <vscale x 64 x i1> %m, i32 %evl)
+  ret <vscale x 64 x i8> %v
+}
+
+; Test that split-legalization works when the mask itself needs splitting.
+
+declare <vscale x 128 x i8> @llvm.vp.usub.sat.nxv128i8(<vscale x 128 x i8>, <vscale x 128 x i8>, <vscale x 128 x i1>, i32)
+
+define <vscale x 128 x i8> @vssubu_vi_nxv128i8(<vscale x 128 x i8> %va, <vscale x 128 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv128i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v24, v0
+; CHECK-NEXT:    vsetvli a2, zero, e8, m8, ta, ma
+; CHECK-NEXT:    vlm.v v0, (a0)
+; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    slli a2, a2, 3
+; CHECK-NEXT:    sub a0, a1, a2
+; CHECK-NEXT:    sltu a3, a1, a0
+; CHECK-NEXT:    addi a3, a3, -1
+; CHECK-NEXT:    and a3, a3, a0
+; CHECK-NEXT:    li a0, -1
+; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
+; CHECK-NEXT:    vssubu.vx v16, v16, a0, v0.t
+; CHECK-NEXT:    bltu a1, a2, .LBB50_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    mv a1, a2
+; CHECK-NEXT:  .LBB50_2:
+; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vssubu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 128 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <vscale x 128 x i8> %elt.head, <vscale x 128 x i8> poison, <vscale x 128 x i32> zeroinitializer
+  %v = call <vscale x 128 x i8> @llvm.vp.usub.sat.nxv128i8(<vscale x 128 x i8> %va, <vscale x 128 x i8> %vb, <vscale x 128 x i1> %m, i32 %evl)
+  ret <vscale x 128 x i8> %v
+}
+
+define <vscale x 128 x i8> @vssubu_vi_nxv128i8_unmasked(<vscale x 128 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv128i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    sub a2, a0, a1
+; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    addi a3, a3, -1
+; CHECK-NEXT:    and a3, a3, a2
+; CHECK-NEXT:    li a2, -1
+; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
+; CHECK-NEXT:    vssubu.vx v16, v16, a2
+; CHECK-NEXT:    bltu a0, a1, .LBB51_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    mv a0, a1
+; CHECK-NEXT:  .LBB51_2:
+; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a2
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 128 x i8> poison, i8 -1, i32 0
+  %vb = shufflevector <vscale x 128 x i8> %elt.head, <vscale x 128 x i8> poison, <vscale x 128 x i32> zeroinitializer
+  %head = insertelement <vscale x 128 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 128 x i1> %head, <vscale x 128 x i1> poison, <vscale x 128 x i32> zeroinitializer
+  %v = call <vscale x 128 x i8> @llvm.vp.usub.sat.nxv128i8(<vscale x 128 x i8> %va, <vscale x 128 x i8> %vb, <vscale x 128 x i1> %m, i32 %evl)
+  ret <vscale x 128 x i8> %v
+}
+
+declare <vscale x 1 x i16> @llvm.vp.usub.sat.nxv1i16(<vscale x 1 x i16>, <vscale x 1 x i16>, <vscale x 1 x i1>, i32)
+
+define <vscale x 1 x i16> @vssubu_vv_nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i16> %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv1i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 1 x i16> @llvm.vp.usub.sat.nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i16> %b, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i16> %v
+}
+
+define <vscale x 1 x i16> @vssubu_vv_nxv1i16_unmasked(<vscale x 1 x i16> %va, <vscale x 1 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv1i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i16> @llvm.vp.usub.sat.nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i16> %b, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i16> %v
+}
+
+define <vscale x 1 x i16> @vssubu_vx_nxv1i16(<vscale x 1 x i16> %va, i16 %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv1i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 1 x i16> %elt.head, <vscale x 1 x i16> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i16> @llvm.vp.usub.sat.nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i16> %vb, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i16> %v
+}
+
+define <vscale x 1 x i16> @vssubu_vx_nxv1i16_unmasked(<vscale x 1 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv1i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 1 x i16> %elt.head, <vscale x 1 x i16> poison, <vscale x 1 x i32> zeroinitializer
+  %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i16> @llvm.vp.usub.sat.nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i16> %vb, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i16> %v
+}
+
+define <vscale x 1 x i16> @vssubu_vi_nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv1i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <vscale x 1 x i16> %elt.head, <vscale x 1 x i16> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i16> @llvm.vp.usub.sat.nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i16> %vb, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i16> %v
+}
+
+define <vscale x 1 x i16> @vssubu_vi_nxv1i16_unmasked(<vscale x 1 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv1i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <vscale x 1 x i16> %elt.head, <vscale x 1 x i16> poison, <vscale x 1 x i32> zeroinitializer
+  %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i16> @llvm.vp.usub.sat.nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i16> %vb, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i16> %v
+}
+
+declare <vscale x 2 x i16> @llvm.vp.usub.sat.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i16>, <vscale x 2 x i1>, i32)
+
+define <vscale x 2 x i16> @vssubu_vv_nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i16> %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv2i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 2 x i16> @llvm.vp.usub.sat.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i16> %b, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i16> %v
+}
+
+define <vscale x 2 x i16> @vssubu_vv_nxv2i16_unmasked(<vscale x 2 x i16> %va, <vscale x 2 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv2i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i16> @llvm.vp.usub.sat.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i16> %b, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i16> %v
+}
+
+define <vscale x 2 x i16> @vssubu_vx_nxv2i16(<vscale x 2 x i16> %va, i16 %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv2i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 2 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 2 x i16> %elt.head, <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i16> @llvm.vp.usub.sat.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i16> %vb, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i16> %v
+}
+
+define <vscale x 2 x i16> @vssubu_vx_nxv2i16_unmasked(<vscale x 2 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv2i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 2 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 2 x i16> %elt.head, <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
+  %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i16> @llvm.vp.usub.sat.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i16> %vb, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i16> %v
+}
+
+define <vscale x 2 x i16> @vssubu_vi_nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv2i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 2 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <vscale x 2 x i16> %elt.head, <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i16> @llvm.vp.usub.sat.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i16> %vb, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i16> %v
+}
+
+define <vscale x 2 x i16> @vssubu_vi_nxv2i16_unmasked(<vscale x 2 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv2i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 2 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <vscale x 2 x i16> %elt.head, <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
+  %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i16> @llvm.vp.usub.sat.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i16> %vb, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i16> %v
+}
+
+declare <vscale x 4 x i16> @llvm.vp.usub.sat.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i16>, <vscale x 4 x i1>, i32)
+
+define <vscale x 4 x i16> @vssubu_vv_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv4i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 4 x i16> @llvm.vp.usub.sat.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %b, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i16> %v
+}
+
+define <vscale x 4 x i16> @vssubu_vv_nxv4i16_unmasked(<vscale x 4 x i16> %va, <vscale x 4 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv4i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i16> @llvm.vp.usub.sat.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %b, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i16> %v
+}
+
+define <vscale x 4 x i16> @vssubu_vx_nxv4i16(<vscale x 4 x i16> %va, i16 %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv4i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 4 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 4 x i16> %elt.head, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i16> @llvm.vp.usub.sat.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %vb, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i16> %v
+}
+
+define <vscale x 4 x i16> @vssubu_vx_nxv4i16_unmasked(<vscale x 4 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv4i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 4 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 4 x i16> %elt.head, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
+  %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i16> @llvm.vp.usub.sat.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %vb, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i16> %v
+}
+
+define <vscale x 4 x i16> @vssubu_vi_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv4i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 4 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <vscale x 4 x i16> %elt.head, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i16> @llvm.vp.usub.sat.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %vb, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i16> %v
+}
+
+define <vscale x 4 x i16> @vssubu_vi_nxv4i16_unmasked(<vscale x 4 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv4i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 4 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <vscale x 4 x i16> %elt.head, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
+  %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i16> @llvm.vp.usub.sat.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %vb, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i16> %v
+}
+
+declare <vscale x 8 x i16> @llvm.vp.usub.sat.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i1>, i32)
+
+define <vscale x 8 x i16> @vssubu_vv_nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i16> %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv8i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v10, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 8 x i16> @llvm.vp.usub.sat.nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i16> %b, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i16> %v
+}
+
+define <vscale x 8 x i16> @vssubu_vv_nxv8i16_unmasked(<vscale x 8 x i16> %va, <vscale x 8 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv8i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v10
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i16> @llvm.vp.usub.sat.nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i16> %b, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i16> %v
+}
+
+define <vscale x 8 x i16> @vssubu_vx_nxv8i16(<vscale x 8 x i16> %va, i16 %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv8i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 8 x i16> %elt.head, <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i16> @llvm.vp.usub.sat.nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i16> %vb, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i16> %v
+}
+
+define <vscale x 8 x i16> @vssubu_vx_nxv8i16_unmasked(<vscale x 8 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv8i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 8 x i16> %elt.head, <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
+  %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i16> @llvm.vp.usub.sat.nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i16> %vb, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i16> %v
+}
+
+define <vscale x 8 x i16> @vssubu_vi_nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv8i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <vscale x 8 x i16> %elt.head, <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i16> @llvm.vp.usub.sat.nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i16> %vb, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i16> %v
+}
+
+define <vscale x 8 x i16> @vssubu_vi_nxv8i16_unmasked(<vscale x 8 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv8i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <vscale x 8 x i16> %elt.head, <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
+  %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i16> @llvm.vp.usub.sat.nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i16> %vb, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i16> %v
+}
+
+declare <vscale x 16 x i16> @llvm.vp.usub.sat.nxv16i16(<vscale x 16 x i16>, <vscale x 16 x i16>, <vscale x 16 x i1>, i32)
+
+define <vscale x 16 x i16> @vssubu_vv_nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i16> %b, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv16i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v12, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 16 x i16> @llvm.vp.usub.sat.nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i16> %b, <vscale x 16 x i1> %m, i32 %evl)
+  ret <vscale x 16 x i16> %v
+}
+
+define <vscale x 16 x i16> @vssubu_vv_nxv16i16_unmasked(<vscale x 16 x i16> %va, <vscale x 16 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv16i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v12
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+  %v = call <vscale x 16 x i16> @llvm.vp.usub.sat.nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i16> %b, <vscale x 16 x i1> %m, i32 %evl)
+  ret <vscale x 16 x i16> %v
+}
+
+define <vscale x 16 x i16> @vssubu_vx_nxv16i16(<vscale x 16 x i16> %va, i16 %b, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv16i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 16 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 16 x i16> %elt.head, <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer
+  %v = call <vscale x 16 x i16> @llvm.vp.usub.sat.nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i16> %vb, <vscale x 16 x i1> %m, i32 %evl)
+  ret <vscale x 16 x i16> %v
+}
+
+define <vscale x 16 x i16> @vssubu_vx_nxv16i16_unmasked(<vscale x 16 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv16i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 16 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 16 x i16> %elt.head, <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer
+  %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+  %v = call <vscale x 16 x i16> @llvm.vp.usub.sat.nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i16> %vb, <vscale x 16 x i1> %m, i32 %evl)
+  ret <vscale x 16 x i16> %v
+}
+
+define <vscale x 16 x i16> @vssubu_vi_nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv16i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 16 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <vscale x 16 x i16> %elt.head, <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer
+  %v = call <vscale x 16 x i16> @llvm.vp.usub.sat.nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i16> %vb, <vscale x 16 x i1> %m, i32 %evl)
+  ret <vscale x 16 x i16> %v
+}
+
+define <vscale x 16 x i16> @vssubu_vi_nxv16i16_unmasked(<vscale x 16 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv16i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 16 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <vscale x 16 x i16> %elt.head, <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer
+  %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+  %v = call <vscale x 16 x i16> @llvm.vp.usub.sat.nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i16> %vb, <vscale x 16 x i1> %m, i32 %evl)
+  ret <vscale x 16 x i16> %v
+}
+
+declare <vscale x 32 x i16> @llvm.vp.usub.sat.nxv32i16(<vscale x 32 x i16>, <vscale x 32 x i16>, <vscale x 32 x i1>, i32)
+
+define <vscale x 32 x i16> @vssubu_vv_nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i16> %b, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv32i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 32 x i16> @llvm.vp.usub.sat.nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i16> %b, <vscale x 32 x i1> %m, i32 %evl)
+  ret <vscale x 32 x i16> %v
+}
+
+define <vscale x 32 x i16> @vssubu_vv_nxv32i16_unmasked(<vscale x 32 x i16> %va, <vscale x 32 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv32i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v16
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 32 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 32 x i1> %head, <vscale x 32 x i1> poison, <vscale x 32 x i32> zeroinitializer
+  %v = call <vscale x 32 x i16> @llvm.vp.usub.sat.nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i16> %b, <vscale x 32 x i1> %m, i32 %evl)
+  ret <vscale x 32 x i16> %v
+}
+
+define <vscale x 32 x i16> @vssubu_vx_nxv32i16(<vscale x 32 x i16> %va, i16 %b, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv32i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 32 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 32 x i16> %elt.head, <vscale x 32 x i16> poison, <vscale x 32 x i32> zeroinitializer
+  %v = call <vscale x 32 x i16> @llvm.vp.usub.sat.nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i16> %vb, <vscale x 32 x i1> %m, i32 %evl)
+  ret <vscale x 32 x i16> %v
+}
+
+define <vscale x 32 x i16> @vssubu_vx_nxv32i16_unmasked(<vscale x 32 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv32i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 32 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 32 x i16> %elt.head, <vscale x 32 x i16> poison, <vscale x 32 x i32> zeroinitializer
+  %head = insertelement <vscale x 32 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 32 x i1> %head, <vscale x 32 x i1> poison, <vscale x 32 x i32> zeroinitializer
+  %v = call <vscale x 32 x i16> @llvm.vp.usub.sat.nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i16> %vb, <vscale x 32 x i1> %m, i32 %evl)
+  ret <vscale x 32 x i16> %v
+}
+
+define <vscale x 32 x i16> @vssubu_vi_nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv32i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 32 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <vscale x 32 x i16> %elt.head, <vscale x 32 x i16> poison, <vscale x 32 x i32> zeroinitializer
+  %v = call <vscale x 32 x i16> @llvm.vp.usub.sat.nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i16> %vb, <vscale x 32 x i1> %m, i32 %evl)
+  ret <vscale x 32 x i16> %v
+}
+
+define <vscale x 32 x i16> @vssubu_vi_nxv32i16_unmasked(<vscale x 32 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv32i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 32 x i16> poison, i16 -1, i32 0
+  %vb = shufflevector <vscale x 32 x i16> %elt.head, <vscale x 32 x i16> poison, <vscale x 32 x i32> zeroinitializer
+  %head = insertelement <vscale x 32 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 32 x i1> %head, <vscale x 32 x i1> poison, <vscale x 32 x i32> zeroinitializer
+  %v = call <vscale x 32 x i16> @llvm.vp.usub.sat.nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i16> %vb, <vscale x 32 x i1> %m, i32 %evl)
+  ret <vscale x 32 x i16> %v
+}
+
+declare <vscale x 1 x i32> @llvm.vp.usub.sat.nxv1i32(<vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i1>, i32)
+
+define <vscale x 1 x i32> @vssubu_vv_nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i32> %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv1i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 1 x i32> @llvm.vp.usub.sat.nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i32> %b, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i32> %v
+}
+
+define <vscale x 1 x i32> @vssubu_vv_nxv1i32_unmasked(<vscale x 1 x i32> %va, <vscale x 1 x i32> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv1i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i32> @llvm.vp.usub.sat.nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i32> %b, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i32> %v
+}
+
+define <vscale x 1 x i32> @vssubu_vx_nxv1i32(<vscale x 1 x i32> %va, i32 %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv1i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <vscale x 1 x i32> %elt.head, <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i32> @llvm.vp.usub.sat.nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i32> %vb, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i32> %v
+}
+
+define <vscale x 1 x i32> @vssubu_vx_nxv1i32_unmasked(<vscale x 1 x i32> %va, i32 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv1i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <vscale x 1 x i32> %elt.head, <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer
+  %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i32> @llvm.vp.usub.sat.nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i32> %vb, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i32> %v
+}
+
+define <vscale x 1 x i32> @vssubu_vi_nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv1i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <vscale x 1 x i32> %elt.head, <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i32> @llvm.vp.usub.sat.nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i32> %vb, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i32> %v
+}
+
+define <vscale x 1 x i32> @vssubu_vi_nxv1i32_unmasked(<vscale x 1 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv1i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <vscale x 1 x i32> %elt.head, <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer
+  %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i32> @llvm.vp.usub.sat.nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i32> %vb, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i32> %v
+}
+
+declare <vscale x 2 x i32> @llvm.vp.usub.sat.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i1>, i32)
+
+define <vscale x 2 x i32> @vssubu_vv_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv2i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 2 x i32> @llvm.vp.usub.sat.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %b, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 2 x i32> @vssubu_vv_nxv2i32_unmasked(<vscale x 2 x i32> %va, <vscale x 2 x i32> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv2i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i32> @llvm.vp.usub.sat.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %b, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 2 x i32> @vssubu_vx_nxv2i32(<vscale x 2 x i32> %va, i32 %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv2i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 2 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <vscale x 2 x i32> %elt.head, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i32> @llvm.vp.usub.sat.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %vb, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 2 x i32> @vssubu_vx_nxv2i32_unmasked(<vscale x 2 x i32> %va, i32 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv2i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 2 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <vscale x 2 x i32> %elt.head, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+  %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i32> @llvm.vp.usub.sat.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %vb, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 2 x i32> @vssubu_vi_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv2i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 2 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <vscale x 2 x i32> %elt.head, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i32> @llvm.vp.usub.sat.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %vb, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 2 x i32> @vssubu_vi_nxv2i32_unmasked(<vscale x 2 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv2i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 2 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <vscale x 2 x i32> %elt.head, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+  %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i32> @llvm.vp.usub.sat.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %vb, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i32> %v
+}
+
+declare <vscale x 4 x i32> @llvm.vp.usub.sat.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, i32)
+
+define <vscale x 4 x i32> @vssubu_vv_nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i32> %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v10, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 4 x i32> @llvm.vp.usub.sat.nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i32> %b, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i32> %v
+}
+
+define <vscale x 4 x i32> @vssubu_vv_nxv4i32_unmasked(<vscale x 4 x i32> %va, <vscale x 4 x i32> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv4i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v10
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i32> @llvm.vp.usub.sat.nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i32> %b, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i32> %v
+}
+
+define <vscale x 4 x i32> @vssubu_vx_nxv4i32(<vscale x 4 x i32> %va, i32 %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 4 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <vscale x 4 x i32> %elt.head, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i32> @llvm.vp.usub.sat.nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i32> %vb, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i32> %v
+}
+
+define <vscale x 4 x i32> @vssubu_vx_nxv4i32_unmasked(<vscale x 4 x i32> %va, i32 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv4i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 4 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <vscale x 4 x i32> %elt.head, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+  %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i32> @llvm.vp.usub.sat.nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i32> %vb, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i32> %v
+}
+
+define <vscale x 4 x i32> @vssubu_vi_nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 4 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <vscale x 4 x i32> %elt.head, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i32> @llvm.vp.usub.sat.nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i32> %vb, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i32> %v
+}
+
+define <vscale x 4 x i32> @vssubu_vi_nxv4i32_unmasked(<vscale x 4 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv4i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 4 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <vscale x 4 x i32> %elt.head, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+  %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i32> @llvm.vp.usub.sat.nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i32> %vb, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i32> %v
+}
+
+declare <vscale x 8 x i32> @llvm.vp.usub.sat.nxv8i32(<vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i1>, i32)
+
+define <vscale x 8 x i32> @vssubu_vv_nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i32> %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv8i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v12, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 8 x i32> @llvm.vp.usub.sat.nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i32> %b, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i32> %v
+}
+
+define <vscale x 8 x i32> @vssubu_vv_nxv8i32_unmasked(<vscale x 8 x i32> %va, <vscale x 8 x i32> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv8i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v12
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i32> @llvm.vp.usub.sat.nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i32> %b, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i32> %v
+}
+
+define <vscale x 8 x i32> @vssubu_vx_nxv8i32(<vscale x 8 x i32> %va, i32 %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv8i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <vscale x 8 x i32> %elt.head, <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i32> @llvm.vp.usub.sat.nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i32> %vb, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i32> %v
+}
+
+define <vscale x 8 x i32> @vssubu_vx_nxv8i32_unmasked(<vscale x 8 x i32> %va, i32 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv8i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <vscale x 8 x i32> %elt.head, <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
+  %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i32> @llvm.vp.usub.sat.nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i32> %vb, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i32> %v
+}
+
+define <vscale x 8 x i32> @vssubu_vi_nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv8i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <vscale x 8 x i32> %elt.head, <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i32> @llvm.vp.usub.sat.nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i32> %vb, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i32> %v
+}
+
+define <vscale x 8 x i32> @vssubu_vi_nxv8i32_unmasked(<vscale x 8 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv8i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <vscale x 8 x i32> %elt.head, <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
+  %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i32> @llvm.vp.usub.sat.nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i32> %vb, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i32> %v
+}
+
+declare <vscale x 16 x i32> @llvm.vp.usub.sat.nxv16i32(<vscale x 16 x i32>, <vscale x 16 x i32>, <vscale x 16 x i1>, i32)
+
+define <vscale x 16 x i32> @vssubu_vv_nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i32> %b, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv16i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 16 x i32> @llvm.vp.usub.sat.nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i32> %b, <vscale x 16 x i1> %m, i32 %evl)
+  ret <vscale x 16 x i32> %v
+}
+
+define <vscale x 16 x i32> @vssubu_vv_nxv16i32_unmasked(<vscale x 16 x i32> %va, <vscale x 16 x i32> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv16i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v16
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+  %v = call <vscale x 16 x i32> @llvm.vp.usub.sat.nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i32> %b, <vscale x 16 x i1> %m, i32 %evl)
+  ret <vscale x 16 x i32> %v
+}
+
+define <vscale x 16 x i32> @vssubu_vx_nxv16i32(<vscale x 16 x i32> %va, i32 %b, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv16i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 16 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <vscale x 16 x i32> %elt.head, <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+  %v = call <vscale x 16 x i32> @llvm.vp.usub.sat.nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i32> %vb, <vscale x 16 x i1> %m, i32 %evl)
+  ret <vscale x 16 x i32> %v
+}
+
+define <vscale x 16 x i32> @vssubu_vx_nxv16i32_unmasked(<vscale x 16 x i32> %va, i32 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv16i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 16 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <vscale x 16 x i32> %elt.head, <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+  %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+  %v = call <vscale x 16 x i32> @llvm.vp.usub.sat.nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i32> %vb, <vscale x 16 x i1> %m, i32 %evl)
+  ret <vscale x 16 x i32> %v
+}
+
+define <vscale x 16 x i32> @vssubu_vi_nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv16i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 16 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <vscale x 16 x i32> %elt.head, <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+  %v = call <vscale x 16 x i32> @llvm.vp.usub.sat.nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i32> %vb, <vscale x 16 x i1> %m, i32 %evl)
+  ret <vscale x 16 x i32> %v
+}
+
+define <vscale x 16 x i32> @vssubu_vi_nxv16i32_unmasked(<vscale x 16 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv16i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 16 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <vscale x 16 x i32> %elt.head, <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+  %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+  %v = call <vscale x 16 x i32> @llvm.vp.usub.sat.nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i32> %vb, <vscale x 16 x i1> %m, i32 %evl)
+  ret <vscale x 16 x i32> %v
+}
+
+; Test that split-legalization works then the mask needs manual splitting.
+
+declare <vscale x 32 x i32> @llvm.vp.usub.sat.nxv32i32(<vscale x 32 x i32>, <vscale x 32 x i32>, <vscale x 32 x i1>, i32)
+
+define <vscale x 32 x i32> @vssubu_vi_nxv32i32(<vscale x 32 x i32> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv32i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v24, v0
+; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    srli a1, a2, 2
+; CHECK-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vslidedown.vx v0, v0, a1
+; CHECK-NEXT:    slli a2, a2, 1
+; CHECK-NEXT:    sub a1, a0, a2
+; CHECK-NEXT:    sltu a3, a0, a1
+; CHECK-NEXT:    addi a3, a3, -1
+; CHECK-NEXT:    and a3, a3, a1
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT:    vssubu.vx v16, v16, a1, v0.t
+; CHECK-NEXT:    bltu a0, a2, .LBB118_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    mv a0, a2
+; CHECK-NEXT:  .LBB118_2:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 32 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <vscale x 32 x i32> %elt.head, <vscale x 32 x i32> poison, <vscale x 32 x i32> zeroinitializer
+  %v = call <vscale x 32 x i32> @llvm.vp.usub.sat.nxv32i32(<vscale x 32 x i32> %va, <vscale x 32 x i32> %vb, <vscale x 32 x i1> %m, i32 %evl)
+  ret <vscale x 32 x i32> %v
+}
+
+define <vscale x 32 x i32> @vssubu_vi_nxv32i32_unmasked(<vscale x 32 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv32i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 1
+; CHECK-NEXT:    sub a2, a0, a1
+; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    addi a3, a3, -1
+; CHECK-NEXT:    and a3, a3, a2
+; CHECK-NEXT:    li a2, -1
+; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT:    vssubu.vx v16, v16, a2
+; CHECK-NEXT:    bltu a0, a1, .LBB119_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    mv a0, a1
+; CHECK-NEXT:  .LBB119_2:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a2
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 32 x i32> poison, i32 -1, i32 0
+  %vb = shufflevector <vscale x 32 x i32> %elt.head, <vscale x 32 x i32> poison, <vscale x 32 x i32> zeroinitializer
+  %head = insertelement <vscale x 32 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 32 x i1> %head, <vscale x 32 x i1> poison, <vscale x 32 x i32> zeroinitializer
+  %v = call <vscale x 32 x i32> @llvm.vp.usub.sat.nxv32i32(<vscale x 32 x i32> %va, <vscale x 32 x i32> %vb, <vscale x 32 x i1> %m, i32 %evl)
+  ret <vscale x 32 x i32> %v
+}
+
+declare <vscale x 1 x i64> @llvm.vp.usub.sat.nxv1i64(<vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i1>, i32)
+
+define <vscale x 1 x i64> @vssubu_vv_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv1i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 1 x i64> @llvm.vp.usub.sat.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %b, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i64> %v
+}
+
+define <vscale x 1 x i64> @vssubu_vv_nxv1i64_unmasked(<vscale x 1 x i64> %va, <vscale x 1 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv1i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i64> @llvm.vp.usub.sat.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %b, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i64> %v
+}
+
+define <vscale x 1 x i64> @vssubu_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vssubu_vx_nxv1i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
+; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vssubu.vv v8, v8, v9, v0.t
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vssubu_vx_nxv1i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; RV64-NEXT:    vssubu.vx v8, v8, a0, v0.t
+; RV64-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i64> poison, i64 %b, i32 0
+  %vb = shufflevector <vscale x 1 x i64> %elt.head, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i64> @llvm.vp.usub.sat.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %vb, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i64> %v
+}
+
+define <vscale x 1 x i64> @vssubu_vx_nxv1i64_unmasked(<vscale x 1 x i64> %va, i64 %b, i32 zeroext %evl) {
+; RV32-LABEL: vssubu_vx_nxv1i64_unmasked:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
+; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vssubu.vv v8, v8, v9
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vssubu_vx_nxv1i64_unmasked:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; RV64-NEXT:    vssubu.vx v8, v8, a0
+; RV64-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i64> poison, i64 %b, i32 0
+  %vb = shufflevector <vscale x 1 x i64> %elt.head, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+  %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i64> @llvm.vp.usub.sat.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %vb, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i64> %v
+}
+
+define <vscale x 1 x i64> @vssubu_vi_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv1i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <vscale x 1 x i64> %elt.head, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i64> @llvm.vp.usub.sat.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %vb, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i64> %v
+}
+
+define <vscale x 1 x i64> @vssubu_vi_nxv1i64_unmasked(<vscale x 1 x i64> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv1i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <vscale x 1 x i64> %elt.head, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+  %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i64> @llvm.vp.usub.sat.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %vb, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i64> %v
+}
+
+declare <vscale x 2 x i64> @llvm.vp.usub.sat.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i1>, i32)
+
+define <vscale x 2 x i64> @vssubu_vv_nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i64> %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv2i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v10, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 2 x i64> @llvm.vp.usub.sat.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i64> %b, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i64> %v
+}
+
+define <vscale x 2 x i64> @vssubu_vv_nxv2i64_unmasked(<vscale x 2 x i64> %va, <vscale x 2 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv2i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v10
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i64> @llvm.vp.usub.sat.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i64> %b, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i64> %v
+}
+
+define <vscale x 2 x i64> @vssubu_vx_nxv2i64(<vscale x 2 x i64> %va, i64 %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vssubu_vx_nxv2i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
+; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vssubu.vv v8, v8, v10, v0.t
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vssubu_vx_nxv2i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; RV64-NEXT:    vssubu.vx v8, v8, a0, v0.t
+; RV64-NEXT:    ret
+  %elt.head = insertelement <vscale x 2 x i64> poison, i64 %b, i32 0
+  %vb = shufflevector <vscale x 2 x i64> %elt.head, <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i64> @llvm.vp.usub.sat.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i64> %vb, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i64> %v
+}
+
+define <vscale x 2 x i64> @vssubu_vx_nxv2i64_unmasked(<vscale x 2 x i64> %va, i64 %b, i32 zeroext %evl) {
+; RV32-LABEL: vssubu_vx_nxv2i64_unmasked:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
+; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vssubu.vv v8, v8, v10
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vssubu_vx_nxv2i64_unmasked:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; RV64-NEXT:    vssubu.vx v8, v8, a0
+; RV64-NEXT:    ret
+  %elt.head = insertelement <vscale x 2 x i64> poison, i64 %b, i32 0
+  %vb = shufflevector <vscale x 2 x i64> %elt.head, <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+  %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i64> @llvm.vp.usub.sat.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i64> %vb, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i64> %v
+}
+
+define <vscale x 2 x i64> @vssubu_vi_nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv2i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 2 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <vscale x 2 x i64> %elt.head, <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i64> @llvm.vp.usub.sat.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i64> %vb, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i64> %v
+}
+
+define <vscale x 2 x i64> @vssubu_vi_nxv2i64_unmasked(<vscale x 2 x i64> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv2i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 2 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <vscale x 2 x i64> %elt.head, <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+  %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i64> @llvm.vp.usub.sat.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i64> %vb, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i64> %v
+}
+
+declare <vscale x 4 x i64> @llvm.vp.usub.sat.nxv4i64(<vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i1>, i32)
+
+define <vscale x 4 x i64> @vssubu_vv_nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i64> %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv4i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v12, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 4 x i64> @llvm.vp.usub.sat.nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i64> %b, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i64> %v
+}
+
+define <vscale x 4 x i64> @vssubu_vv_nxv4i64_unmasked(<vscale x 4 x i64> %va, <vscale x 4 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv4i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v12
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i64> @llvm.vp.usub.sat.nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i64> %b, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i64> %v
+}
+
+define <vscale x 4 x i64> @vssubu_vx_nxv4i64(<vscale x 4 x i64> %va, i64 %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vssubu_vx_nxv4i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
+; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vssubu.vv v8, v8, v12, v0.t
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vssubu_vx_nxv4i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; RV64-NEXT:    vssubu.vx v8, v8, a0, v0.t
+; RV64-NEXT:    ret
+  %elt.head = insertelement <vscale x 4 x i64> poison, i64 %b, i32 0
+  %vb = shufflevector <vscale x 4 x i64> %elt.head, <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i64> @llvm.vp.usub.sat.nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i64> %vb, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i64> %v
+}
+
+define <vscale x 4 x i64> @vssubu_vx_nxv4i64_unmasked(<vscale x 4 x i64> %va, i64 %b, i32 zeroext %evl) {
+; RV32-LABEL: vssubu_vx_nxv4i64_unmasked:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
+; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vssubu.vv v8, v8, v12
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vssubu_vx_nxv4i64_unmasked:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; RV64-NEXT:    vssubu.vx v8, v8, a0
+; RV64-NEXT:    ret
+  %elt.head = insertelement <vscale x 4 x i64> poison, i64 %b, i32 0
+  %vb = shufflevector <vscale x 4 x i64> %elt.head, <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+  %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i64> @llvm.vp.usub.sat.nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i64> %vb, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i64> %v
+}
+
+define <vscale x 4 x i64> @vssubu_vi_nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv4i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 4 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <vscale x 4 x i64> %elt.head, <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i64> @llvm.vp.usub.sat.nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i64> %vb, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i64> %v
+}
+
+define <vscale x 4 x i64> @vssubu_vi_nxv4i64_unmasked(<vscale x 4 x i64> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv4i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 4 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <vscale x 4 x i64> %elt.head, <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+  %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i64> @llvm.vp.usub.sat.nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i64> %vb, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i64> %v
+}
+
+declare <vscale x 8 x i64> @llvm.vp.usub.sat.nxv8i64(<vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i1>, i32)
+
+define <vscale x 8 x i64> @vssubu_vv_nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i64> %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv8i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 8 x i64> @llvm.vp.usub.sat.nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i64> %b, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i64> %v
+}
+
+define <vscale x 8 x i64> @vssubu_vv_nxv8i64_unmasked(<vscale x 8 x i64> %va, <vscale x 8 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv8i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT:    vssubu.vv v8, v8, v16
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i64> @llvm.vp.usub.sat.nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i64> %b, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i64> %v
+}
+
+define <vscale x 8 x i64> @vssubu_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vssubu_vx_nxv8i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
+; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vssubu.vv v8, v8, v16, v0.t
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vssubu_vx_nxv8i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vssubu.vx v8, v8, a0, v0.t
+; RV64-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i64> poison, i64 %b, i32 0
+  %vb = shufflevector <vscale x 8 x i64> %elt.head, <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i64> @llvm.vp.usub.sat.nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i64> %vb, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i64> %v
+}
+
+define <vscale x 8 x i64> @vssubu_vx_nxv8i64_unmasked(<vscale x 8 x i64> %va, i64 %b, i32 zeroext %evl) {
+; RV32-LABEL: vssubu_vx_nxv8i64_unmasked:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
+; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vssubu.vv v8, v8, v16
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vssubu_vx_nxv8i64_unmasked:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vssubu.vx v8, v8, a0
+; RV64-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i64> poison, i64 %b, i32 0
+  %vb = shufflevector <vscale x 8 x i64> %elt.head, <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
+  %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i64> @llvm.vp.usub.sat.nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i64> %vb, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i64> %v
+}
+
+define <vscale x 8 x i64> @vssubu_vi_nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv8i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <vscale x 8 x i64> %elt.head, <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i64> @llvm.vp.usub.sat.nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i64> %vb, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i64> %v
+}
+
+define <vscale x 8 x i64> @vssubu_vi_nxv8i64_unmasked(<vscale x 8 x i64> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv8i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT:    vssubu.vx v8, v8, a1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i64> poison, i64 -1, i32 0
+  %vb = shufflevector <vscale x 8 x i64> %elt.head, <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
+  %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+  %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i64> @llvm.vp.usub.sat.nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i64> %vb, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i64> %v
+}
diff --git a/llvm/test/CodeGen/RISCV/select.ll b/llvm/test/CodeGen/RISCV/select.ll
index e01984b..e07e520 100644
--- a/llvm/test/CodeGen/RISCV/select.ll
+++ b/llvm/test/CodeGen/RISCV/select.ll
@@ -1606,23 +1606,255 @@ define i32 @select_cst_unknown(i32 signext %a, i32 signext %b) {
 ; RV64IMXVTCONDOPS-LABEL: select_cst_unknown:
 ; RV64IMXVTCONDOPS:       # %bb.0:
 ; RV64IMXVTCONDOPS-NEXT:    slt a0, a0, a1
-; RV64IMXVTCONDOPS-NEXT:    li a1, -7
-; RV64IMXVTCONDOPS-NEXT:    vt.maskcn a1, a1, a0
-; RV64IMXVTCONDOPS-NEXT:    li a2, 5
-; RV64IMXVTCONDOPS-NEXT:    vt.maskc a0, a2, a0
-; RV64IMXVTCONDOPS-NEXT:    or a0, a0, a1
+; RV64IMXVTCONDOPS-NEXT:    li a1, -12
+; RV64IMXVTCONDOPS-NEXT:    vt.maskcn a0, a1, a0
+; RV64IMXVTCONDOPS-NEXT:    addi a0, a0, 5
 ; RV64IMXVTCONDOPS-NEXT:    ret
 ;
 ; CHECKZICOND-LABEL: select_cst_unknown:
 ; CHECKZICOND:       # %bb.0:
 ; CHECKZICOND-NEXT:    slt a0, a0, a1
-; CHECKZICOND-NEXT:    li a1, -7
-; CHECKZICOND-NEXT:    czero.nez a1, a1, a0
-; CHECKZICOND-NEXT:    li a2, 5
-; CHECKZICOND-NEXT:    czero.eqz a0, a2, a0
-; CHECKZICOND-NEXT:    or a0, a0, a1
+; CHECKZICOND-NEXT:    li a1, -12
+; CHECKZICOND-NEXT:    czero.nez a0, a1, a0
+; CHECKZICOND-NEXT:    addi a0, a0, 5
 ; CHECKZICOND-NEXT:    ret
   %cond = icmp slt i32 %a, %b
   %ret = select i1 %cond, i32 5, i32 -7
   ret i32 %ret
 }
+
+define i32 @select_cst1(i1 zeroext %cond) {
+; RV32IM-LABEL: select_cst1:
+; RV32IM:       # %bb.0:
+; RV32IM-NEXT:    mv a1, a0
+; RV32IM-NEXT:    li a0, 10
+; RV32IM-NEXT:    bnez a1, .LBB43_2
+; RV32IM-NEXT:  # %bb.1:
+; RV32IM-NEXT:    li a0, 20
+; RV32IM-NEXT:  .LBB43_2:
+; RV32IM-NEXT:    ret
+;
+; RV64IM-LABEL: select_cst1:
+; RV64IM:       # %bb.0:
+; RV64IM-NEXT:    mv a1, a0
+; RV64IM-NEXT:    li a0, 10
+; RV64IM-NEXT:    bnez a1, .LBB43_2
+; RV64IM-NEXT:  # %bb.1:
+; RV64IM-NEXT:    li a0, 20
+; RV64IM-NEXT:  .LBB43_2:
+; RV64IM-NEXT:    ret
+;
+; RV64IMXVTCONDOPS-LABEL: select_cst1:
+; RV64IMXVTCONDOPS:       # %bb.0:
+; RV64IMXVTCONDOPS-NEXT:    li a1, 10
+; RV64IMXVTCONDOPS-NEXT:    vt.maskcn a0, a1, a0
+; RV64IMXVTCONDOPS-NEXT:    addi a0, a0, 10
+; RV64IMXVTCONDOPS-NEXT:    ret
+;
+; CHECKZICOND-LABEL: select_cst1:
+; CHECKZICOND:       # %bb.0:
+; CHECKZICOND-NEXT:    li a1, 10
+; CHECKZICOND-NEXT:    czero.nez a0, a1, a0
+; CHECKZICOND-NEXT:    addi a0, a0, 10
+; CHECKZICOND-NEXT:    ret
+  %ret = select i1 %cond, i32 10, i32 20
+  ret i32 %ret
+}
+
+define i32 @select_cst2(i1 zeroext %cond) {
+; RV32IM-LABEL: select_cst2:
+; RV32IM:       # %bb.0:
+; RV32IM-NEXT:    mv a1, a0
+; RV32IM-NEXT:    li a0, 10
+; RV32IM-NEXT:    bnez a1, .LBB44_2
+; RV32IM-NEXT:  # %bb.1:
+; RV32IM-NEXT:    lui a0, 5
+; RV32IM-NEXT:    addi a0, a0, -480
+; RV32IM-NEXT:  .LBB44_2:
+; RV32IM-NEXT:    ret
+;
+; RV64IM-LABEL: select_cst2:
+; RV64IM:       # %bb.0:
+; RV64IM-NEXT:    mv a1, a0
+; RV64IM-NEXT:    li a0, 10
+; RV64IM-NEXT:    bnez a1, .LBB44_2
+; RV64IM-NEXT:  # %bb.1:
+; RV64IM-NEXT:    lui a0, 5
+; RV64IM-NEXT:    addiw a0, a0, -480
+; RV64IM-NEXT:  .LBB44_2:
+; RV64IM-NEXT:    ret
+;
+; RV64IMXVTCONDOPS-LABEL: select_cst2:
+; RV64IMXVTCONDOPS:       # %bb.0:
+; RV64IMXVTCONDOPS-NEXT:    lui a1, 5
+; RV64IMXVTCONDOPS-NEXT:    addiw a1, a1, -490
+; RV64IMXVTCONDOPS-NEXT:    vt.maskcn a0, a1, a0
+; RV64IMXVTCONDOPS-NEXT:    addi a0, a0, 10
+; RV64IMXVTCONDOPS-NEXT:    ret
+;
+; RV32IMZICOND-LABEL: select_cst2:
+; RV32IMZICOND:       # %bb.0:
+; RV32IMZICOND-NEXT:    lui a1, 5
+; RV32IMZICOND-NEXT:    addi a1, a1, -490
+; RV32IMZICOND-NEXT:    czero.nez a0, a1, a0
+; RV32IMZICOND-NEXT:    addi a0, a0, 10
+; RV32IMZICOND-NEXT:    ret
+;
+; RV64IMZICOND-LABEL: select_cst2:
+; RV64IMZICOND:       # %bb.0:
+; RV64IMZICOND-NEXT:    lui a1, 5
+; RV64IMZICOND-NEXT:    addiw a1, a1, -490
+; RV64IMZICOND-NEXT:    czero.nez a0, a1, a0
+; RV64IMZICOND-NEXT:    addi a0, a0, 10
+; RV64IMZICOND-NEXT:    ret
+  %ret = select i1 %cond, i32 10, i32 20000
+  ret i32 %ret
+}
+
+define i32 @select_cst3(i1 zeroext %cond) {
+; RV32IM-LABEL: select_cst3:
+; RV32IM:       # %bb.0:
+; RV32IM-NEXT:    bnez a0, .LBB45_2
+; RV32IM-NEXT:  # %bb.1:
+; RV32IM-NEXT:    lui a0, 5
+; RV32IM-NEXT:    addi a0, a0, -480
+; RV32IM-NEXT:    ret
+; RV32IM-NEXT:  .LBB45_2:
+; RV32IM-NEXT:    lui a0, 7
+; RV32IM-NEXT:    addi a0, a0, 1328
+; RV32IM-NEXT:    ret
+;
+; RV64IM-LABEL: select_cst3:
+; RV64IM:       # %bb.0:
+; RV64IM-NEXT:    bnez a0, .LBB45_2
+; RV64IM-NEXT:  # %bb.1:
+; RV64IM-NEXT:    lui a0, 5
+; RV64IM-NEXT:    addiw a0, a0, -480
+; RV64IM-NEXT:    ret
+; RV64IM-NEXT:  .LBB45_2:
+; RV64IM-NEXT:    lui a0, 7
+; RV64IM-NEXT:    addiw a0, a0, 1328
+; RV64IM-NEXT:    ret
+;
+; RV64IMXVTCONDOPS-LABEL: select_cst3:
+; RV64IMXVTCONDOPS:       # %bb.0:
+; RV64IMXVTCONDOPS-NEXT:    lui a1, 1048574
+; RV64IMXVTCONDOPS-NEXT:    addiw a1, a1, -1808
+; RV64IMXVTCONDOPS-NEXT:    vt.maskcn a0, a1, a0
+; RV64IMXVTCONDOPS-NEXT:    lui a1, 7
+; RV64IMXVTCONDOPS-NEXT:    addiw a1, a1, 1328
+; RV64IMXVTCONDOPS-NEXT:    add a0, a0, a1
+; RV64IMXVTCONDOPS-NEXT:    ret
+;
+; RV32IMZICOND-LABEL: select_cst3:
+; RV32IMZICOND:       # %bb.0:
+; RV32IMZICOND-NEXT:    lui a1, 1048574
+; RV32IMZICOND-NEXT:    addi a1, a1, -1808
+; RV32IMZICOND-NEXT:    czero.nez a0, a1, a0
+; RV32IMZICOND-NEXT:    lui a1, 7
+; RV32IMZICOND-NEXT:    addi a1, a1, 1328
+; RV32IMZICOND-NEXT:    add a0, a0, a1
+; RV32IMZICOND-NEXT:    ret
+;
+; RV64IMZICOND-LABEL: select_cst3:
+; RV64IMZICOND:       # %bb.0:
+; RV64IMZICOND-NEXT:    lui a1, 1048574
+; RV64IMZICOND-NEXT:    addiw a1, a1, -1808
+; RV64IMZICOND-NEXT:    czero.nez a0, a1, a0
+; RV64IMZICOND-NEXT:    lui a1, 7
+; RV64IMZICOND-NEXT:    addiw a1, a1, 1328
+; RV64IMZICOND-NEXT:    add a0, a0, a1
+; RV64IMZICOND-NEXT:    ret
+  %ret = select i1 %cond, i32 30000, i32 20000
+  ret i32 %ret
+}
+
+define i32 @select_cst4(i1 zeroext %cond) {
+; CHECK-LABEL: select_cst4:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    neg a0, a0
+; CHECK-NEXT:    xori a0, a0, 2047
+; CHECK-NEXT:    ret
+  %ret = select i1 %cond, i32 -2048, i32 2047
+  ret i32 %ret
+}
+
+define i32 @select_cst5(i1 zeroext %cond) {
+; RV32IM-LABEL: select_cst5:
+; RV32IM:       # %bb.0:
+; RV32IM-NEXT:    mv a1, a0
+; RV32IM-NEXT:    li a0, 2047
+; RV32IM-NEXT:    bnez a1, .LBB47_2
+; RV32IM-NEXT:  # %bb.1:
+; RV32IM-NEXT:    lui a0, 1
+; RV32IM-NEXT:    addi a0, a0, -2047
+; RV32IM-NEXT:  .LBB47_2:
+; RV32IM-NEXT:    ret
+;
+; RV64IM-LABEL: select_cst5:
+; RV64IM:       # %bb.0:
+; RV64IM-NEXT:    mv a1, a0
+; RV64IM-NEXT:    li a0, 2047
+; RV64IM-NEXT:    bnez a1, .LBB47_2
+; RV64IM-NEXT:  # %bb.1:
+; RV64IM-NEXT:    lui a0, 1
+; RV64IM-NEXT:    addiw a0, a0, -2047
+; RV64IM-NEXT:  .LBB47_2:
+; RV64IM-NEXT:    ret
+;
+; RV64IMXVTCONDOPS-LABEL: select_cst5:
+; RV64IMXVTCONDOPS:       # %bb.0:
+; RV64IMXVTCONDOPS-NEXT:    li a1, 2
+; RV64IMXVTCONDOPS-NEXT:    vt.maskcn a0, a1, a0
+; RV64IMXVTCONDOPS-NEXT:    addi a0, a0, 2047
+; RV64IMXVTCONDOPS-NEXT:    ret
+;
+; CHECKZICOND-LABEL: select_cst5:
+; CHECKZICOND:       # %bb.0:
+; CHECKZICOND-NEXT:    li a1, 2
+; CHECKZICOND-NEXT:    czero.nez a0, a1, a0
+; CHECKZICOND-NEXT:    addi a0, a0, 2047
+; CHECKZICOND-NEXT:    ret
+  %ret = select i1 %cond, i32 2047, i32 2049
+  ret i32 %ret
+}
+
+define i32 @select_cst6(i1 zeroext %cond) {
+; RV32IM-LABEL: select_cst6:
+; RV32IM:       # %bb.0:
+; RV32IM-NEXT:    bnez a0, .LBB48_2
+; RV32IM-NEXT:  # %bb.1:
+; RV32IM-NEXT:    li a0, 2047
+; RV32IM-NEXT:    ret
+; RV32IM-NEXT:  .LBB48_2:
+; RV32IM-NEXT:    lui a0, 1
+; RV32IM-NEXT:    addi a0, a0, -2047
+; RV32IM-NEXT:    ret
+;
+; RV64IM-LABEL: select_cst6:
+; RV64IM:       # %bb.0:
+; RV64IM-NEXT:    bnez a0, .LBB48_2
+; RV64IM-NEXT:  # %bb.1:
+; RV64IM-NEXT:    li a0, 2047
+; RV64IM-NEXT:    ret
+; RV64IM-NEXT:  .LBB48_2:
+; RV64IM-NEXT:    lui a0, 1
+; RV64IM-NEXT:    addiw a0, a0, -2047
+; RV64IM-NEXT:    ret
+;
+; RV64IMXVTCONDOPS-LABEL: select_cst6:
+; RV64IMXVTCONDOPS:       # %bb.0:
+; RV64IMXVTCONDOPS-NEXT:    li a1, 2
+; RV64IMXVTCONDOPS-NEXT:    vt.maskc a0, a1, a0
+; RV64IMXVTCONDOPS-NEXT:    addi a0, a0, 2047
+; RV64IMXVTCONDOPS-NEXT:    ret
+;
+; CHECKZICOND-LABEL: select_cst6:
+; CHECKZICOND:       # %bb.0:
+; CHECKZICOND-NEXT:    li a1, 2
+; CHECKZICOND-NEXT:    czero.eqz a0, a1, a0
+; CHECKZICOND-NEXT:    addi a0, a0, 2047
+; CHECKZICOND-NEXT:    ret
+  %ret = select i1 %cond, i32 2049, i32 2047
+  ret i32 %ret
+}
diff --git a/llvm/test/CodeGen/RISCV/sext-zext-trunc.ll b/llvm/test/CodeGen/RISCV/sext-zext-trunc.ll
index a2a953c..87f2a63 100644
--- a/llvm/test/CodeGen/RISCV/sext-zext-trunc.ll
+++ b/llvm/test/CodeGen/RISCV/sext-zext-trunc.ll
@@ -871,3 +871,64 @@ define void @zext_nneg_dominating_icmp_i32_zeroext(i16 signext %0) {
 5:
   ret void
 }
+
+; The load is used extended and non-extended in the successor basic block. The
+; signed compare will cause the non-extended value to exported out of the first
+; basic block using a sext to XLen. We need to CSE the zext nneg with the sext
+; so that we can form a sextload.
+define void @load_zext_nneg_sext_cse(ptr %p) nounwind {
+; RV32I-LABEL: load_zext_nneg_sext_cse:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lh s0, 0(a0)
+; RV32I-NEXT:    bltz s0, .LBB50_2
+; RV32I-NEXT:  # %bb.1: # %bb1
+; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    call bar_i16
+; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    tail bar_i32
+; RV32I-NEXT:  .LBB50_2: # %bb2
+; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    ret
+;
+; RV64-LABEL: load_zext_nneg_sext_cse:
+; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -16
+; RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s0, 0(sp) # 8-byte Folded Spill
+; RV64-NEXT:    lh s0, 0(a0)
+; RV64-NEXT:    bltz s0, .LBB50_2
+; RV64-NEXT:  # %bb.1: # %bb1
+; RV64-NEXT:    mv a0, s0
+; RV64-NEXT:    call bar_i16
+; RV64-NEXT:    mv a0, s0
+; RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s0, 0(sp) # 8-byte Folded Reload
+; RV64-NEXT:    addi sp, sp, 16
+; RV64-NEXT:    tail bar_i32
+; RV64-NEXT:  .LBB50_2: # %bb2
+; RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s0, 0(sp) # 8-byte Folded Reload
+; RV64-NEXT:    addi sp, sp, 16
+; RV64-NEXT:    ret
+  %load = load i16, ptr %p
+  %zext = zext nneg i16 %load to i32
+  %cmp = icmp sgt i16 %load, -1
+  br i1 %cmp, label %bb1, label %bb2
+
+bb1:
+  tail call void @bar_i16(i16 signext %load)
+  tail call void @bar_i32(i32 signext %zext)
+  br label %bb2
+
+bb2:
+  ret void
+}
+declare void @bar_i16(i16);
diff --git a/llvm/test/CodeGen/RISCV/split-offsets.ll b/llvm/test/CodeGen/RISCV/split-offsets.ll
index fc35bc4..8d065da 100644
--- a/llvm/test/CodeGen/RISCV/split-offsets.ll
+++ b/llvm/test/CodeGen/RISCV/split-offsets.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -verify-machineinstrs -riscv-enable-sink-fold < %s \
+; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \
 ; RUN:   | FileCheck %s -check-prefix=RV32I
-; RUN: llc -mtriple=riscv64 -verify-machineinstrs -riscv-enable-sink-fold < %s \
+; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \
 ; RUN:   | FileCheck %s -check-prefix=RV64I
 
 ; Check that memory accesses to array elements with large offsets have those
diff --git a/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll
index ec6e978..7fc4713 100644
--- a/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll
+++ b/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll
@@ -1,11 +1,11 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -verify-machineinstrs -riscv-enable-sink-fold < %s \
+; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefix=RV32I %s
-; RUN: llc -mtriple=riscv32 -mattr=+m -verify-machineinstrs -riscv-enable-sink-fold < %s \
+; RUN: llc -mtriple=riscv32 -mattr=+m -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefix=RV32IM %s
-; RUN: llc -mtriple=riscv64 -verify-machineinstrs -riscv-enable-sink-fold < %s \
+; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefix=RV64I %s
-; RUN: llc -mtriple=riscv64 -mattr=+m -verify-machineinstrs -riscv-enable-sink-fold < %s \
+; RUN: llc -mtriple=riscv64 -mattr=+m -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefix=RV64IM %s
 
 define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) nounwind {
diff --git a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll
index eea8e64..540883fdc 100644
--- a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll
+++ b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll
@@ -1,11 +1,11 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -verify-machineinstrs -riscv-enable-sink-fold < %s \
+; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefixes=CHECK,RV32I %s
-; RUN: llc -mtriple=riscv32 -mattr=+m -verify-machineinstrs -riscv-enable-sink-fold < %s \
+; RUN: llc -mtriple=riscv32 -mattr=+m -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefixes=CHECK,RV32IM %s
-; RUN: llc -mtriple=riscv64 -verify-machineinstrs -riscv-enable-sink-fold < %s \
+; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefixes=CHECK,RV64I %s
-; RUN: llc -mtriple=riscv64 -mattr=+m -verify-machineinstrs -riscv-enable-sink-fold < %s \
+; RUN: llc -mtriple=riscv64 -mattr=+m -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefixes=CHECK,RV64IM %s
 
 
diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_usm_storage_classes/intel-usm-addrspaces.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_usm_storage_classes/intel-usm-addrspaces.ll
new file mode 100644
index 0000000..30c1635
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_usm_storage_classes/intel-usm-addrspaces.ll
@@ -0,0 +1,84 @@
+; Modified from: https://github.com/KhronosGroup/SPIRV-LLVM-Translator/test/extensions/INTEL/SPV_INTEL_usm_storage_classes/intel_usm_addrspaces.ll
+
+; RUN: llc -O0 -mtriple=spirv32-unknown-unknown --spirv-extensions=SPV_INTEL_usm_storage_classes %s -o - | FileCheck %s --check-prefixes=CHECK-SPIRV,CHECK-SPIRV-EXT
+; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-extensions=SPV_INTEL_usm_storage_classes %s -o - -filetype=obj | spirv-val %}
+; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefixes=CHECK-SPIRV,CHECK-SPIRV-WITHOUT
+; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; CHECK-: Capability USMStorageClassesINTEL
+; CHECK-SPIRV-WITHOUT-NO: Capability USMStorageClassesINTEL
+; CHECK-SPIRV-EXT-DAG: %[[DevTy:[0-9]+]] = OpTypePointer DeviceOnlyINTEL %[[#]]
+; CHECK-SPIRV-EXT-DAG: %[[HostTy:[0-9]+]] = OpTypePointer HostOnlyINTEL %[[#]]
+; CHECK-SPIRV-DAG: %[[CrsWrkTy:[0-9]+]] = OpTypePointer CrossWorkgroup %[[#]]
+
+target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @foo_kernel() {
+entry:
+  ret void
+}
+
+; CHECK-SPIRV: %[[Ptr1:[0-9]+]] = OpLoad %[[CrsWrkTy]] %[[#]]
+; CHECK-SPIRV-EXT: %[[CastedPtr1:[0-9]+]] = OpCrossWorkgroupCastToPtrINTEL %[[DevTy]] %[[Ptr1]]
+; CHECK-SPIRV-WITHOUT-NOT: OpCrossWorkgroupCastToPtrINTEL
+; CHECK-SPIRV-EXT: OpStore %[[#]] %[[CastedPtr1]]
+define spir_func void @test1(ptr addrspace(1) %arg_glob, ptr addrspace(5) %arg_dev) {
+entry:
+  %arg_glob.addr = alloca ptr addrspace(1), align 4
+  %arg_dev.addr = alloca ptr addrspace(5), align 4
+  store ptr addrspace(1) %arg_glob, ptr %arg_glob.addr, align 4
+  store ptr addrspace(5) %arg_dev, ptr %arg_dev.addr, align 4
+  %loaded_glob = load ptr addrspace(1), ptr %arg_glob.addr, align 4
+  %casted_ptr = addrspacecast ptr addrspace(1) %loaded_glob to ptr addrspace(5)
+  store ptr addrspace(5) %casted_ptr, ptr %arg_dev.addr, align 4
+  ret void
+}
+
+; CHECK-SPIRV: %[[Ptr2:[0-9]+]] = OpLoad %[[CrsWrkTy]] %[[#]]
+; CHECK-SPIRV-EXT: %[[CastedPtr2:[0-9]+]] = OpCrossWorkgroupCastToPtrINTEL %[[HostTy]] %[[Ptr2]]
+; CHECK-SPIRV-WITHOUT-NOT: OpCrossWorkgroupCastToPtrINTEL
+; CHECK-SPIRV-EXT: OpStore %[[#]] %[[CastedPtr2]]
+define spir_func void @test2(ptr addrspace(1) %arg_glob, ptr addrspace(6) %arg_host) {
+entry:
+  %arg_glob.addr = alloca ptr addrspace(1), align 4
+  %arg_host.addr = alloca ptr addrspace(6), align 4
+  store ptr addrspace(1) %arg_glob, ptr %arg_glob.addr, align 4
+  store ptr addrspace(6) %arg_host, ptr %arg_host.addr, align 4
+  %loaded_glob = load ptr addrspace(1), ptr %arg_glob.addr, align 4
+  %casted_ptr = addrspacecast ptr addrspace(1) %loaded_glob to ptr addrspace(6)
+  store ptr addrspace(6) %casted_ptr, ptr %arg_host.addr, align 4
+  ret void
+}
+
+; CHECK-SPIRV-EXT: %[[Ptr3:[0-9]+]] = OpLoad %[[DevTy]] %[[#]]
+; CHECK-SPIRV-EXT: %[[CastedPtr3:[0-9]+]] = OpPtrCastToCrossWorkgroupINTEL %[[CrsWrkTy]] %[[Ptr3]]
+; CHECK-SPIRV-WITHOUT-NOT: OpPtrCastToCrossWorkgroupINTEL
+; CHECK-SPIRV-EXT: OpStore %[[#]] %[[CastedPtr3]]
+define spir_func void @test3(ptr addrspace(1) %arg_glob, ptr addrspace(5) %arg_dev) {
+entry:
+  %arg_glob.addr = alloca ptr addrspace(1), align 4
+  %arg_dev.addr = alloca ptr addrspace(5), align 4
+  store ptr addrspace(1) %arg_glob, ptr %arg_glob.addr, align 4
+  store ptr addrspace(5) %arg_dev, ptr %arg_dev.addr, align 4
+  %loaded_dev = load ptr addrspace(5), ptr %arg_dev.addr, align 4
+  %casted_ptr = addrspacecast ptr addrspace(5) %loaded_dev to ptr addrspace(1)
+  store ptr addrspace(1) %casted_ptr, ptr %arg_glob.addr, align 4
+  ret void
+}
+
+; CHECK-SPIRV-EXT: %[[Ptr4:[0-9]+]] = OpLoad %[[HostTy]] %[[#]]
+; CHECK-SPIRV-EXT: %[[CastedPtr4:[0-9]+]] = OpPtrCastToCrossWorkgroupINTEL %[[CrsWrkTy]] %[[Ptr4]]
+; CHECK-SPIRV-WITHOUT-NOT: OpPtrCastToCrossWorkgroupINTEL
+; CHECK-SPIRV-EXT: OpStore %[[#]] %[[CastedPtr4]]
+define spir_func void @test4(ptr addrspace(1) %arg_glob, ptr addrspace(6) %arg_host) {
+entry:
+  %arg_glob.addr = alloca ptr addrspace(1), align 4
+  %arg_host.addr = alloca ptr addrspace(6), align 4
+  store ptr addrspace(1) %arg_glob, ptr %arg_glob.addr, align 4
+  store ptr addrspace(6) %arg_host, ptr %arg_host.addr, align 4
+  %loaded_host = load ptr addrspace(6), ptr %arg_host.addr, align 4
+  %casted_ptr = addrspacecast ptr addrspace(6) %loaded_host to ptr addrspace(1)
+  store ptr addrspace(1) %casted_ptr, ptr %arg_glob.addr, align 4
+  ret void
+}
diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_subgroup_rotate/subgroup-rotate.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_subgroup_rotate/subgroup-rotate.ll
new file mode 100644
index 0000000..b1d6a09
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_subgroup_rotate/subgroup-rotate.ll
@@ -0,0 +1,357 @@
+; RUN: not llc -O0 -mtriple=spirv32-unknown-unknown %s -o %t.spvt 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR
+; RUN: llc -O0 -mtriple=spirv32-unknown-unknown --spirv-extensions=SPV_KHR_subgroup_rotate %s -o - | FileCheck %s
+; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-extensions=SPV_KHR_subgroup_rotate %s -o - -filetype=obj | spirv-val %}
+
+; CHECK-ERROR: LLVM ERROR: OpGroupNonUniformRotateKHR instruction requires the following SPIR-V extension: SPV_KHR_subgroup_rotate
+
+; CHECK: OpCapability GroupNonUniformRotateKHR
+; CHECK: OpExtension "SPV_KHR_subgroup_rotate"
+
+; CHECK-DAG: %[[TyInt8:.*]] = OpTypeInt 8 0
+; CHECK-DAG: %[[TyInt16:.*]] = OpTypeInt 16 0
+; CHECK-DAG: %[[TyInt32:.*]] = OpTypeInt 32 0
+; CHECK-DAG: %[[TyInt64:.*]] = OpTypeInt 64 0
+; CHECK-DAG: %[[TyFloat:.*]] = OpTypeFloat 32
+; CHECK-DAG: %[[TyHalf:.*]] = OpTypeFloat 16
+; CHECK-DAG: %[[TyDouble:.*]] = OpTypeFloat 64
+; CHECK-DAG: %[[ScopeSubgroup:.*]] = OpConstant %[[TyInt32]] 3
+; CHECK-DAG: %[[ConstInt2:.*]] = OpConstant %[[TyInt32]] 2
+; CHECK-DAG: %[[ConstInt4:.*]] = OpConstant %[[TyInt32]] 4
+		
+target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir"
+
+; Function Attrs: convergent noinline norecurse nounwind optnone
+define dso_local spir_kernel void @testRotateChar(ptr addrspace(1) noundef align 1 %dst) #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 {
+entry:
+  %dst.addr = alloca ptr addrspace(1), align 4
+  %v = alloca i8, align 1
+  store ptr addrspace(1) %dst, ptr %dst.addr, align 4
+  store i8 0, ptr %v, align 1
+  %value = load i8, ptr %v, align 1
+; CHECK: OpGroupNonUniformRotateKHR %[[TyInt8]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]]
+  %call = call spir_func signext i8 @_Z16sub_group_rotateci(i8 noundef signext %value, i32 noundef 2) #2
+  %data = load ptr addrspace(1), ptr %dst.addr, align 4
+  %arrayidx = getelementptr inbounds i8, ptr addrspace(1) %data, i32 0
+  store i8 %call, ptr addrspace(1) %arrayidx, align 1
+  %value_clustered = load i8, ptr %v, align 1
+; CHECK: OpGroupNonUniformRotateKHR %[[TyInt8]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]] %[[ConstInt4]]
+  %call1 = call spir_func signext i8 @_Z26sub_group_clustered_rotatecij(i8 noundef signext %value_clustered, i32 noundef 2, i32 noundef 4) #2
+  %data2 = load ptr addrspace(1), ptr %dst.addr, align 4
+  %arrayidx2 = getelementptr inbounds i8, ptr addrspace(1) %data2, i32 1
+  store i8 %call1, ptr addrspace(1) %arrayidx2, align 1
+  ret void
+}
+
+; Function Attrs: convergent nounwind
+declare spir_func signext i8 @_Z16sub_group_rotateci(i8 noundef signext, i32 noundef) #1
+
+; Function Attrs: convergent nounwind
+declare spir_func signext i8 @_Z26sub_group_clustered_rotatecij(i8 noundef signext, i32 noundef, i32 noundef) #1
+
+; Function Attrs: convergent noinline norecurse nounwind optnone
+define dso_local spir_kernel void @testRotateUChar(ptr addrspace(1) noundef align 1 %dst) #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !7 !kernel_arg_base_type !7 !kernel_arg_type_qual !6 {
+entry:
+  %dst.addr = alloca ptr addrspace(1), align 4
+  %v = alloca i8, align 1
+  store ptr addrspace(1) %dst, ptr %dst.addr, align 4
+  store i8 0, ptr %v, align 1
+  %value = load i8, ptr %v, align 1
+; CHECK: OpGroupNonUniformRotateKHR %[[TyInt8]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]]
+  %call = call spir_func zeroext i8 @_Z16sub_group_rotatehi(i8 noundef zeroext %value, i32 noundef 2) #2
+  %data = load ptr addrspace(1), ptr %dst.addr, align 4
+  %arrayidx = getelementptr inbounds i8, ptr addrspace(1) %data, i32 0
+  store i8 %call, ptr addrspace(1) %arrayidx, align 1
+  %value_clustered = load i8, ptr %v, align 1
+; CHECK: OpGroupNonUniformRotateKHR %[[TyInt8]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]] %[[ConstInt4]]
+  %call1 = call spir_func zeroext i8 @_Z26sub_group_clustered_rotatehij(i8 noundef zeroext %value_clustered, i32 noundef 2, i32 noundef 4) #2
+  %data2 = load ptr addrspace(1), ptr %dst.addr, align 4
+  %arrayidx2 = getelementptr inbounds i8, ptr addrspace(1) %data2, i32 1
+  store i8 %call1, ptr addrspace(1) %arrayidx2, align 1
+  ret void
+}
+
+; Function Attrs: convergent nounwind
+declare spir_func zeroext i8 @_Z16sub_group_rotatehi(i8 noundef zeroext, i32 noundef) #1
+
+; Function Attrs: convergent nounwind
+declare spir_func zeroext i8 @_Z26sub_group_clustered_rotatehij(i8 noundef zeroext, i32 noundef, i32 noundef) #1
+
+; Function Attrs: convergent noinline norecurse nounwind optnone
+define dso_local spir_kernel void @testRotateShort(ptr addrspace(1) noundef align 2 %dst) #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !8 !kernel_arg_base_type !8 !kernel_arg_type_qual !6 {
+entry:
+  %dst.addr = alloca ptr addrspace(1), align 4
+  %v = alloca i16, align 2
+  store ptr addrspace(1) %dst, ptr %dst.addr, align 4
+  store i16 0, ptr %v, align 2
+  %value = load i16, ptr %v, align 2
+  ; CHECK: OpGroupNonUniformRotateKHR %[[TyInt16]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]]
+  %call = call spir_func signext i16 @_Z16sub_group_rotatesi(i16 noundef signext %value, i32 noundef 2) #2
+  %data = load ptr addrspace(1), ptr %dst.addr, align 4
+  %arrayidx = getelementptr inbounds i16, ptr addrspace(1) %data, i32 0
+  store i16 %call, ptr addrspace(1) %arrayidx, align 2
+  %value_clustered = load i16, ptr %v, align 2
+  ; CHECK: OpGroupNonUniformRotateKHR %[[TyInt16]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]] %[[ConstInt4]]
+  %call1 = call spir_func signext i16 @_Z26sub_group_clustered_rotatesij(i16 noundef signext %value_clustered, i32 noundef 2, i32 noundef 4) #2
+  %data2 = load ptr addrspace(1), ptr %dst.addr, align 4
+  %arrayidx2 = getelementptr inbounds i16, ptr addrspace(1) %data2, i32 1
+  store i16 %call1, ptr addrspace(1) %arrayidx2, align 2
+  ret void
+}
+
+; Function Attrs: convergent nounwind
+declare spir_func signext i16 @_Z16sub_group_rotatesi(i16 noundef signext, i32 noundef) #1
+
+; Function Attrs: convergent nounwind
+declare spir_func signext i16 @_Z26sub_group_clustered_rotatesij(i16 noundef signext, i32 noundef, i32 noundef) #1
+
+; Function Attrs: convergent noinline norecurse nounwind optnone
+define dso_local spir_kernel void @testRotateUShort(ptr addrspace(1) noundef align 2 %dst) #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !9 !kernel_arg_base_type !9 !kernel_arg_type_qual !6 {
+entry:
+  %dst.addr = alloca ptr addrspace(1), align 4
+  %v = alloca i16, align 2
+  store ptr addrspace(1) %dst, ptr %dst.addr, align 4
+  store i16 0, ptr %v, align 2
+  %value = load i16, ptr %v, align 2
+  ; CHECK: OpGroupNonUniformRotateKHR %[[TyInt16]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]]
+  %call = call spir_func zeroext i16 @_Z16sub_group_rotateti(i16 noundef zeroext %value, i32 noundef 2) #2
+  %data = load ptr addrspace(1), ptr %dst.addr, align 4
+  %arrayidx = getelementptr inbounds i16, ptr addrspace(1) %data, i32 0
+  store i16 %call, ptr addrspace(1) %arrayidx, align 2
+  %value_clustered = load i16, ptr %v, align 2
+  ; CHECK: OpGroupNonUniformRotateKHR %[[TyInt16]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]] %[[ConstInt4]]
+  %call1 = call spir_func zeroext i16 @_Z26sub_group_clustered_rotatetij(i16 noundef zeroext %value_clustered, i32 noundef 2, i32 noundef 4) #2
+  %data2 = load ptr addrspace(1), ptr %dst.addr, align 4
+  %arrayidx2 = getelementptr inbounds i16, ptr addrspace(1) %data2, i32 1
+  store i16 %call1, ptr addrspace(1) %arrayidx2, align 2
+  ret void
+}
+
+; Function Attrs: convergent nounwind
+declare spir_func zeroext i16 @_Z16sub_group_rotateti(i16 noundef zeroext, i32 noundef) #1
+
+; Function Attrs: convergent nounwind
+declare spir_func zeroext i16 @_Z26sub_group_clustered_rotatetij(i16 noundef zeroext, i32 noundef, i32 noundef) #1
+
+; Function Attrs: convergent noinline norecurse nounwind optnone
+define dso_local spir_kernel void @testRotateInt(ptr addrspace(1) noundef align 4 %dst) #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !10 !kernel_arg_base_type !10 !kernel_arg_type_qual !6 {
+entry:
+  %dst.addr = alloca ptr addrspace(1), align 4
+  %v = alloca i32, align 4
+  store ptr addrspace(1) %dst, ptr %dst.addr, align 4
+  store i32 0, ptr %v, align 4
+  %value = load i32, ptr %v, align 4
+  ; CHECK: OpGroupNonUniformRotateKHR %[[TyInt32]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]]
+  %call = call spir_func i32 @_Z16sub_group_rotateii(i32 noundef %value, i32 noundef 2) #2
+  %data = load ptr addrspace(1), ptr %dst.addr, align 4
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %data, i32 0
+  store i32 %call, ptr addrspace(1) %arrayidx, align 4
+  %value_clustered = load i32, ptr %v, align 4
+  ; CHECK: OpGroupNonUniformRotateKHR %[[TyInt32]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]] %[[ConstInt4]]
+  %call1 = call spir_func i32 @_Z26sub_group_clustered_rotateiij(i32 noundef %value_clustered, i32 noundef 2, i32 noundef 4) #2
+  %data2 = load ptr addrspace(1), ptr %dst.addr, align 4
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %data2, i32 1
+  store i32 %call1, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+}
+
+; Function Attrs: convergent nounwind
+declare spir_func i32 @_Z16sub_group_rotateii(i32 noundef, i32 noundef) #1
+
+; Function Attrs: convergent nounwind
+declare spir_func i32 @_Z26sub_group_clustered_rotateiij(i32 noundef, i32 noundef, i32 noundef) #1
+
+; Function Attrs: convergent noinline norecurse nounwind optnone
+define dso_local spir_kernel void @testRotateUInt(ptr addrspace(1) noundef align 4 %dst) #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !11 !kernel_arg_base_type !11 !kernel_arg_type_qual !6 {
+entry:
+  %dst.addr = alloca ptr addrspace(1), align 4
+  %v = alloca i32, align 4
+  store ptr addrspace(1) %dst, ptr %dst.addr, align 4
+  store i32 0, ptr %v, align 4
+  %value = load i32, ptr %v, align 4
+  ; CHECK: OpGroupNonUniformRotateKHR %[[TyInt32]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]]
+  %call = call spir_func i32 @_Z16sub_group_rotateji(i32 noundef %value, i32 noundef 2) #2
+  %data = load ptr addrspace(1), ptr %dst.addr, align 4
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %data, i32 0
+  store i32 %call, ptr addrspace(1) %arrayidx, align 4
+  %value_clustered = load i32, ptr %v, align 4
+  ; CHECK: OpGroupNonUniformRotateKHR %[[TyInt32]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]] %[[ConstInt4]]
+  %call1 = call spir_func i32 @_Z26sub_group_clustered_rotatejij(i32 noundef %value_clustered, i32 noundef 2, i32 noundef 4) #2
+  %data2 = load ptr addrspace(1), ptr %dst.addr, align 4
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %data2, i32 1
+  store i32 %call1, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+}
+
+; Function Attrs: convergent nounwind
+declare spir_func i32 @_Z16sub_group_rotateji(i32 noundef, i32 noundef) #1
+
+; Function Attrs: convergent nounwind
+declare spir_func i32 @_Z26sub_group_clustered_rotatejij(i32 noundef, i32 noundef, i32 noundef) #1
+
+; Function Attrs: convergent noinline norecurse nounwind optnone
+define dso_local spir_kernel void @testRotateLong(ptr addrspace(1) noundef align 8 %dst) #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !12 !kernel_arg_base_type !12 !kernel_arg_type_qual !6 {
+entry:
+  %dst.addr = alloca ptr addrspace(1), align 4
+  %v = alloca i64, align 8
+  store ptr addrspace(1) %dst, ptr %dst.addr, align 4
+  store i64 0, ptr %v, align 8
+  %value = load i64, ptr %v, align 8
+  ; CHECK: OpGroupNonUniformRotateKHR %[[TyInt64]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]]
+  %call = call spir_func i64 @_Z16sub_group_rotateli(i64 noundef %value, i32 noundef 2) #2
+  %data = load ptr addrspace(1), ptr %dst.addr, align 4
+  %arrayidx = getelementptr inbounds i64, ptr addrspace(1) %data, i32 0
+  store i64 %call, ptr addrspace(1) %arrayidx, align 8
+  %value_clustered = load i64, ptr %v, align 8
+  ; CHECK: OpGroupNonUniformRotateKHR %[[TyInt64]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]] %[[ConstInt4]]
+  %call1 = call spir_func i64 @_Z26sub_group_clustered_rotatelij(i64 noundef %value_clustered, i32 noundef 2, i32 noundef 4) #2
+  %data2 = load ptr addrspace(1), ptr %dst.addr, align 4
+  %arrayidx2 = getelementptr inbounds i64, ptr addrspace(1) %data2, i32 1
+  store i64 %call1, ptr addrspace(1) %arrayidx2, align 8
+  ret void
+}
+
+; Function Attrs: convergent nounwind
+declare spir_func i64 @_Z16sub_group_rotateli(i64 noundef, i32 noundef) #1
+
+; Function Attrs: convergent nounwind
+declare spir_func i64 @_Z26sub_group_clustered_rotatelij(i64 noundef, i32 noundef, i32 noundef) #1
+
+; Function Attrs: convergent noinline norecurse nounwind optnone
+define dso_local spir_kernel void @testRotateULong(ptr addrspace(1) noundef align 8 %dst) #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !13 !kernel_arg_base_type !13 !kernel_arg_type_qual !6 {
+entry:
+  %dst.addr = alloca ptr addrspace(1), align 4
+  %v = alloca i64, align 8
+  store ptr addrspace(1) %dst, ptr %dst.addr, align 4
+  store i64 0, ptr %v, align 8
+  %value = load i64, ptr %v, align 8
+  ; CHECK: OpGroupNonUniformRotateKHR %[[TyInt64]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]]
+  %call = call spir_func i64 @_Z16sub_group_rotatemi(i64 noundef %value, i32 noundef 2) #2
+  %data = load ptr addrspace(1), ptr %dst.addr, align 4
+  %arrayidx = getelementptr inbounds i64, ptr addrspace(1) %data, i32 0
+  store i64 %call, ptr addrspace(1) %arrayidx, align 8
+  %value_clustered = load i64, ptr %v, align 8
+  ; CHECK: OpGroupNonUniformRotateKHR %[[TyInt64]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]] %[[ConstInt4]]
+  %call1 = call spir_func i64 @_Z26sub_group_clustered_rotatemij(i64 noundef %value_clustered, i32 noundef 2, i32 noundef 4) #2
+  %data2 = load ptr addrspace(1), ptr %dst.addr, align 4
+  %arrayidx2 = getelementptr inbounds i64, ptr addrspace(1) %data2, i32 1
+  store i64 %call1, ptr addrspace(1) %arrayidx2, align 8
+  ret void
+}
+
+; Function Attrs: convergent nounwind
+declare spir_func i64 @_Z16sub_group_rotatemi(i64 noundef, i32 noundef) #1
+
+; Function Attrs: convergent nounwind
+declare spir_func i64 @_Z26sub_group_clustered_rotatemij(i64 noundef, i32 noundef, i32 noundef) #1
+
+; Function Attrs: convergent noinline norecurse nounwind optnone
+define dso_local spir_kernel void @testRotateFloat(ptr addrspace(1) noundef align 4 %dst) #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !14 !kernel_arg_base_type !14 !kernel_arg_type_qual !6 {
+entry:
+  %dst.addr = alloca ptr addrspace(1), align 4
+  %v = alloca float, align 4
+  store ptr addrspace(1) %dst, ptr %dst.addr, align 4
+  store float 0.000000e+00, ptr %v, align 4
+  %value = load float, ptr %v, align 4
+  ; CHECK: OpGroupNonUniformRotateKHR %[[TyFloat]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]]
+  %call = call spir_func float @_Z16sub_group_rotatefi(float noundef %value, i32 noundef 2) #2
+  %data = load ptr addrspace(1), ptr %dst.addr, align 4
+  %arrayidx = getelementptr inbounds float, ptr addrspace(1) %data, i32 0
+  store float %call, ptr addrspace(1) %arrayidx, align 4
+  %value_clustered = load float, ptr %v, align 4
+  ; CHECK: OpGroupNonUniformRotateKHR %[[TyFloat]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]] %[[ConstInt4]]
+  %call1 = call spir_func float @_Z26sub_group_clustered_rotatefij(float noundef %value_clustered, i32 noundef 2, i32 noundef 4) #2
+  %data2 = load ptr addrspace(1), ptr %dst.addr, align 4
+  %arrayidx2 = getelementptr inbounds float, ptr addrspace(1) %data2, i32 1
+  store float %call1, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+}
+
+; Function Attrs: convergent nounwind
+declare spir_func float @_Z16sub_group_rotatefi(float noundef, i32 noundef) #1
+
+; Function Attrs: convergent nounwind
+declare spir_func float @_Z26sub_group_clustered_rotatefij(float noundef, i32 noundef, i32 noundef) #1
+
+; Function Attrs: convergent noinline norecurse nounwind optnone
+define dso_local spir_kernel void @testRotateHalf(ptr addrspace(1) noundef align 2 %dst) #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !15 !kernel_arg_base_type !15 !kernel_arg_type_qual !6 {
+entry:
+  %dst.addr = alloca ptr addrspace(1), align 4
+  %v = alloca half, align 2
+  store ptr addrspace(1) %dst, ptr %dst.addr, align 4
+  store half 0xH0000, ptr %v, align 2
+  %value = load half, ptr %v, align 2
+  ; CHECK: OpGroupNonUniformRotateKHR %[[TyHalf]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]]
+  %call = call spir_func half @_Z16sub_group_rotateDhi(half noundef %value, i32 noundef 2) #2
+  %data = load ptr addrspace(1), ptr %dst.addr, align 4
+  %arrayidx = getelementptr inbounds half, ptr addrspace(1) %data, i32 0
+  store half %call, ptr addrspace(1) %arrayidx, align 2
+  %value_clustered = load half, ptr %v, align 2
+  ; CHECK: OpGroupNonUniformRotateKHR %[[TyHalf]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]] %[[ConstInt4]]
+  %call1 = call spir_func half @_Z26sub_group_clustered_rotateDhij(half noundef %value_clustered, i32 noundef 2, i32 noundef 4) #2
+  %data2 = load ptr addrspace(1), ptr %dst.addr, align 4
+  %arrayidx2 = getelementptr inbounds half, ptr addrspace(1) %data2, i32 1
+  store half %call1, ptr addrspace(1) %arrayidx2, align 2
+  ret void
+}
+
+; Function Attrs: convergent nounwind
+declare spir_func half @_Z16sub_group_rotateDhi(half noundef, i32 noundef) #1
+
+; Function Attrs: convergent nounwind
+declare spir_func half @_Z26sub_group_clustered_rotateDhij(half noundef, i32 noundef, i32 noundef) #1
+
+; Function Attrs: convergent noinline norecurse nounwind optnone
+define dso_local spir_kernel void @testRotateDouble(ptr addrspace(1) noundef align 8 %dst) #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !16 !kernel_arg_base_type !16 !kernel_arg_type_qual !6 {
+entry:
+  %dst.addr = alloca ptr addrspace(1), align 4
+  %v = alloca double, align 8
+  store ptr addrspace(1) %dst, ptr %dst.addr, align 4
+  store double 0.000000e+00, ptr %v, align 8
+  %value = load double, ptr %v, align 8
+  ; CHECK: OpGroupNonUniformRotateKHR %[[TyDouble]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]]
+  %call = call spir_func double @_Z16sub_group_rotatedi(double noundef %value, i32 noundef 2) #2
+  %data = load ptr addrspace(1), ptr %dst.addr, align 4
+  %arrayidx = getelementptr inbounds double, ptr addrspace(1) %data, i32 0
+  store double %call, ptr addrspace(1) %arrayidx, align 8
+  %value_clustered = load double, ptr %v, align 8
+  ; CHECK: OpGroupNonUniformRotateKHR %[[TyDouble]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]] %[[ConstInt4]]
+  %call1 = call spir_func double @_Z26sub_group_clustered_rotatedij(double noundef %value_clustered, i32 noundef 2, i32 noundef 4) #2
+  %data2 = load ptr addrspace(1), ptr %dst.addr, align 4
+  %arrayidx2 = getelementptr inbounds double, ptr addrspace(1) %data2, i32 1
+  store double %call1, ptr addrspace(1) %arrayidx2, align 8
+  ret void
+}
+
+; Function Attrs: convergent nounwind
+declare spir_func double @_Z16sub_group_rotatedi(double noundef, i32 noundef) #1
+
+; Function Attrs: convergent nounwind
+declare spir_func double @_Z26sub_group_clustered_rotatedij(double noundef, i32 noundef, i32 noundef) #1
+
+attributes #0 = { convergent noinline norecurse nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="false" }
+attributes #1 = { convergent nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+attributes #2 = { convergent nounwind }
+
+!llvm.module.flags = !{!0}
+!opencl.ocl.version = !{!1}
+!opencl.spir.version = !{!1}
+!llvm.ident = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 2, i32 0}
+!2 = !{!"clang version 19.0.0"}
+!3 = !{i32 1}
+!4 = !{!"none"}
+!5 = !{!"char*"}
+!6 = !{!""}
+!7 = !{!"uchar*"}
+!8 = !{!"short*"}
+!9 = !{!"ushort*"}
+!10 = !{!"int*"}
+!11 = !{!"uint*"}
+!12 = !{!"long*"}
+!13 = !{!"ulong*"}
+!14 = !{!"float*"}
+!15 = !{!"half*"}
+!16 = !{!"double*"}
diff --git a/llvm/test/CodeGen/SPIRV/switch-no-jump-table.ll b/llvm/test/CodeGen/SPIRV/switch-no-jump-table.ll
new file mode 100644
index 0000000..c9c0f17
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/switch-no-jump-table.ll
@@ -0,0 +1,30 @@
+; The test is to check that jump tables are not generated from switch
+
+; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; CHECK: OpSwitch %[[#]] %[[Label:]]
+; CHECK-4: OpBranch %[[Label]]
+
+target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64"
+target triple = "spir64-unknown-unknown"
+
+define spir_func void @foo(i32 noundef %val) {
+entry:
+  switch i32 %val, label %sw.epilog [
+    i32 0, label %sw.bb
+    i32 1, label %sw.bb2
+    i32 2, label %sw.bb3
+    i32 3, label %sw.bb4
+  ]
+sw.bb:
+  br label %sw.epilog
+sw.bb2:
+  br label %sw.epilog
+sw.bb3:
+  br label %sw.epilog
+sw.bb4:
+  br label %sw.epilog
+sw.epilog:
+  ret void
+}
diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-conv-17.ll b/llvm/test/CodeGen/SystemZ/fp-strict-conv-17.ll
index eb0ff4b..3ff6324 100644
--- a/llvm/test/CodeGen/SystemZ/fp-strict-conv-17.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-strict-conv-17.ll
@@ -20,7 +20,7 @@ declare i128 @llvm.experimental.constrained.fptoui.i128.f64(double, metadata)
 declare i128 @llvm.experimental.constrained.fptoui.i128.f32(float, metadata)
 
 ; Test signed i128->f128.
-define fp128 @f1(i128 %i) {
+define fp128 @f1(i128 %i) #0 {
 ; CHECK-LABEL: f1:
 ; CHECK: brasl %r14, __floattitf@PLT
 ; CHECK: br %r14
@@ -31,7 +31,7 @@ define fp128 @f1(i128 %i) {
 }
 
 ; Test signed i128->f64.
-define double @f2(i128 %i) {
+define double @f2(i128 %i) #0 {
 ; CHECK-LABEL: f2:
 ; CHECK: brasl %r14, __floattidf@PLT
 ; CHECK: br %r14
@@ -42,7 +42,7 @@ define double @f2(i128 %i) {
 }
 
 ; Test signed i128->f32.
-define float @f3(i128 %i) {
+define float @f3(i128 %i) #0 {
 ; CHECK-LABEL: f3:
 ; CHECK: brasl %r14, __floattisf@PLT
 ; CHECK: br %r14
@@ -53,7 +53,7 @@ define float @f3(i128 %i) {
 }
 
 ; Test unsigned i128->f128.
-define fp128 @f4(i128 %i) {
+define fp128 @f4(i128 %i) #0 {
 ; CHECK-LABEL: f4:
 ; CHECK: brasl %r14, __floatuntitf@PLT
 ; CHECK: br %r14
@@ -64,7 +64,7 @@ define fp128 @f4(i128 %i) {
 }
 
 ; Test unsigned i128->f64.
-define double @f5(i128 %i) {
+define double @f5(i128 %i) #0 {
 ; CHECK-LABEL: f5:
 ; CHECK: brasl %r14, __floatuntidf@PLT
 ; CHECK: br %r14
@@ -75,7 +75,7 @@ define double @f5(i128 %i) {
 }
 
 ; Test unsigned i128->f32.
-define float @f6(i128 %i) {
+define float @f6(i128 %i) #0 {
 ; CHECK-LABEL: f6:
 ; CHECK: brasl %r14, __floatuntisf@PLT
 ; CHECK: br %r14
@@ -86,7 +86,7 @@ define float @f6(i128 %i) {
 }
 
 ; Test signed f128->i128.
-define i128 @f7(fp128 %f) {
+define i128 @f7(fp128 %f) #0 {
 ; CHECK-LABEL: f7:
 ; CHECK: brasl %r14, __fixtfti@PLT
 ; CHECK: br %r14
@@ -96,7 +96,7 @@ define i128 @f7(fp128 %f) {
 }
 
 ; Test signed f64->i128.
-define i128 @f8(double %f) {
+define i128 @f8(double %f) #0 {
 ; CHECK-LABEL: f8:
 ; CHECK: brasl %r14, __fixdfti@PLT
 ; CHECK: br %r14
@@ -106,7 +106,7 @@ define i128 @f8(double %f) {
 }
 
 ; Test signed f9->i128.
-define i128 @f9(float %f) {
+define i128 @f9(float %f) #0 {
 ; CHECK-LABEL: f9:
 ; CHECK: brasl %r14, __fixsfti@PLT
 ; CHECK: br %r14
@@ -116,7 +116,7 @@ define i128 @f9(float %f) {
 }
 
 ; Test unsigned f128->i128.
-define i128 @f10(fp128 %f) {
+define i128 @f10(fp128 %f) #0 {
 ; CHECK-LABEL: f10:
 ; CHECK: brasl %r14, __fixunstfti@PLT
 ; CHECK: br %r14
@@ -126,7 +126,7 @@ define i128 @f10(fp128 %f) {
 }
 
 ; Test unsigned f64->i128.
-define i128 @f11(double %f) {
+define i128 @f11(double %f) #0 {
 ; CHECK-LABEL: f11:
 ; CHECK: brasl %r14, __fixunsdfti@PLT
 ; CHECK: br %r14
@@ -136,7 +136,7 @@ define i128 @f11(double %f) {
 }
 
 ; Test unsigned f32->i128.
-define i128 @f12(float %f) {
+define i128 @f12(float %f) #0 {
 ; CHECK-LABEL: f12:
 ; CHECK: brasl %r14, __fixunssfti@PLT
 ; CHECK: br %r14
diff --git a/llvm/test/CodeGen/WebAssembly/lower-em-ehsjlj-multi-return.ll b/llvm/test/CodeGen/WebAssembly/lower-em-ehsjlj-multi-return.ll
index 4f33439..daf46c6 100644
--- a/llvm/test/CodeGen/WebAssembly/lower-em-ehsjlj-multi-return.ll
+++ b/llvm/test/CodeGen/WebAssembly/lower-em-ehsjlj-multi-return.ll
@@ -1,5 +1,5 @@
-; RUN: not --crash llc < %s -enable-emscripten-cxx-exceptions -mattr=+multivalue 2>&1 | FileCheck %s --check-prefix=EH
-; RUN: not --crash llc < %s -enable-emscripten-sjlj -mattr=+multivalue 2>&1 | FileCheck %s --check-prefix=SJLJ
+; RUN: not --crash llc < %s -enable-emscripten-cxx-exceptions -mattr=+multivalue -wasm-emit-multivalue 2>&1 | FileCheck %s --check-prefix=EH
+; RUN: not --crash llc < %s -enable-emscripten-sjlj -mattr=+multivalue 2>&1 -wasm-emit-multivalue | FileCheck %s --check-prefix=SJLJ
 
 ; Currently multivalue returning functions are not supported in Emscripten EH /
 ; SjLj. Make sure they error out.
diff --git a/llvm/test/CodeGen/WebAssembly/multivalue-dont-move-def-past-use.mir b/llvm/test/CodeGen/WebAssembly/multivalue-dont-move-def-past-use.mir
index 4b4661b..4fadbd5 100644
--- a/llvm/test/CodeGen/WebAssembly/multivalue-dont-move-def-past-use.mir
+++ b/llvm/test/CodeGen/WebAssembly/multivalue-dont-move-def-past-use.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=wasm32-unknown-unknown -mattr=+multivalue -run-pass=wasm-reg-stackify -verify-machineinstrs %s -o - | FileCheck %s
+# RUN: llc -mtriple=wasm32-unknown-unknown -mattr=+multivalue -wasm-emit-multivalue -run-pass=wasm-reg-stackify -verify-machineinstrs %s -o - | FileCheck %s
 
 --- |
   target datalayout = "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-n32:64-S128-ni:1:10:20"
diff --git a/llvm/test/CodeGen/WebAssembly/multivalue-stackify.ll b/llvm/test/CodeGen/WebAssembly/multivalue-stackify.ll
index 52a8c68..f4f93ac 100644
--- a/llvm/test/CodeGen/WebAssembly/multivalue-stackify.ll
+++ b/llvm/test/CodeGen/WebAssembly/multivalue-stackify.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; NOTE: Test functions have been generated by multivalue-stackify.py.
 
-; RUN: llc < %s -verify-machineinstrs -mattr=+multivalue | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mattr=+multivalue -wasm-emit-multivalue | FileCheck %s
 
 ; Test that the multivalue stackification works
 
diff --git a/llvm/test/CodeGen/WebAssembly/multivalue.ll b/llvm/test/CodeGen/WebAssembly/multivalue.ll
index 675009c..846691e 100644
--- a/llvm/test/CodeGen/WebAssembly/multivalue.ll
+++ b/llvm/test/CodeGen/WebAssembly/multivalue.ll
@@ -1,7 +1,8 @@
-; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -mcpu=mvp -mattr=+multivalue,+tail-call | FileCheck %s
-; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -mcpu=mvp -mattr=+reference-types,+multivalue,+tail-call | FileCheck --check-prefix REF %s
-; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mcpu=mvp -mattr=+multivalue,+tail-call | FileCheck %s --check-prefix REGS
-; RUN: llc < %s --filetype=obj -mcpu=mvp -mattr=+multivalue,+tail-call | obj2yaml | FileCheck %s --check-prefix OBJ
+; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -mcpu=mvp -mattr=+multivalue,+tail-call -wasm-emit-multivalue | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -mcpu=mvp -mattr=+reference-types,+multivalue,+tail-call -wasm-emit-multivalue | FileCheck --check-prefix REF %s
+; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mcpu=mvp -mattr=+multivalue,+tail-call -wasm-emit-multivalue | FileCheck %s --check-prefix REGS
+; RUN: llc < %s --filetype=obj -mcpu=mvp -mattr=+multivalue,+tail-call -wasm-emit-multivalue | obj2yaml | FileCheck %s --check-prefix OBJ
+; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -mcpu=mvp -mattr=+multivalue,+tail-call | FileCheck %s --check-prefix NO-MULTIVALUE
 
 ; Test that the multivalue calls, returns, function types, and block
 ; types work as expected.
@@ -19,6 +20,7 @@ declare void @use_i64(i64)
 ; CHECK-NEXT: i32.const 42{{$}}
 ; CHECK-NEXT: i64.const 42{{$}}
 ; CHECK-NEXT: end_function{{$}}
+; NO-MULTIVALUE-NOT: .functype pair_const () -> (i32, i64)
 define %pair @pair_const() {
   ret %pair { i32 42, i64 42 }
 }
diff --git a/llvm/test/CodeGen/WebAssembly/multivalue_libcall.ll b/llvm/test/CodeGen/WebAssembly/multivalue_libcall.ll
index 47c5ae7b..7bf37b5 100644
--- a/llvm/test/CodeGen/WebAssembly/multivalue_libcall.ll
+++ b/llvm/test/CodeGen/WebAssembly/multivalue_libcall.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc < %s -verify-machineinstrs -mcpu=mvp -mattr=+multivalue | FileCheck %s --check-prefix=MULTIVALUE
+; RUN: llc < %s -verify-machineinstrs -mcpu=mvp -mattr=+multivalue -wasm-emit-multivalue | FileCheck %s --check-prefix=MULTIVALUE
 ; RUN: llc < %s -verify-machineinstrs -mcpu=mvp | FileCheck %s --check-prefix=NO_MULTIVALUE
 
 ; Test libcall signatures when multivalue is enabled and disabled
diff --git a/llvm/test/CodeGen/X86/GlobalISel/calllowering-tailcall.ll b/llvm/test/CodeGen/X86/GlobalISel/calllowering-tailcall.ll
new file mode 100644
index 0000000..6a856c3
--- /dev/null
+++ b/llvm/test/CodeGen/X86/GlobalISel/calllowering-tailcall.ll
@@ -0,0 +1,24 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=x86_64-linux-gnu -global-isel < %s | FileCheck %s --check-prefix=X64
+; RUN: llc -mtriple=i686-linux-gnu -global-isel < %s | FileCheck %s --check-prefix=X86
+
+declare ptr @foo()
+
+define ptr @aligned_tailcall() nounwind {
+; X64-LABEL: aligned_tailcall:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    pushq %rax
+; X64-NEXT:    callq foo
+; X64-NEXT:    popq %rcx
+; X64-NEXT:    retq
+;
+; X86-LABEL: aligned_tailcall:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    calll foo
+; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    retl
+entry:
+  %call = tail call align 8 ptr @foo()
+  ret ptr %call
+}
diff --git a/llvm/test/CodeGen/X86/tailcall-cgp-dup.ll b/llvm/test/CodeGen/X86/tailcall-cgp-dup.ll
index 401ed9f7..8a9ee60 100644
--- a/llvm/test/CodeGen/X86/tailcall-cgp-dup.ll
+++ b/llvm/test/CodeGen/X86/tailcall-cgp-dup.ll
@@ -339,7 +339,7 @@ return:
 
 define ptr @strcpy_illegal_tailc(ptr %dest, i64 %sz, ptr readonly returned %src) nounwind {
 ; CHECK-LABEL: strcpy_illegal_tailc:
-; CHECK:       ## %bb.0:
+; CHECK:       ## %bb.0: ## %entry
 ; CHECK-NEXT:    pushq %rbx
 ; CHECK-NEXT:    movq %rdx, %rbx
 ; CHECK-NEXT:    testq %rsi, %rsi
@@ -351,6 +351,7 @@ define ptr @strcpy_illegal_tailc(ptr %dest, i64 %sz, ptr readonly returned %src)
 ; CHECK-NEXT:    movq %rbx, %rax
 ; CHECK-NEXT:    popq %rbx
 ; CHECK-NEXT:    retq
+entry:
   %cmp = icmp eq i64 %sz, 0
   br i1 %cmp, label %return, label %if.then
 
@@ -362,8 +363,63 @@ return:
   ret ptr %src
 }
 
+@i = global i32 0, align 4
+
+define i32 @undef_tailc() nounwind {
+; CHECK-LABEL: undef_tailc:
+; CHECK:       ## %bb.0: ## %entry
+; CHECK-NEXT:    cmpl $0, _i(%rip)
+; CHECK-NEXT:    jne _qux ## TAILCALL
+; CHECK-NEXT:  ## %bb.1: ## %return
+; CHECK-NEXT:    retq
+entry:
+  %val = load i32, ptr @i, align 4
+  %cmp = icmp eq i32 %val, 0
+  br i1 %cmp, label %return, label %if.then
+
+if.then:
+  %rv_unused = tail call i32 @qux()
+  br label %return
+
+return:
+  ret i32 undef
+}
+
+define i32 @undef_and_known_tailc() nounwind {
+; CHECK-LABEL: undef_and_known_tailc:
+; CHECK:       ## %bb.0: ## %entry
+; CHECK-NEXT:    movl _i(%rip), %eax
+; CHECK-NEXT:    cmpl $5, %eax
+; CHECK-NEXT:    je _qux ## TAILCALL
+; CHECK-NEXT:  ## %bb.1: ## %entry
+; CHECK-NEXT:    cmpl $2, %eax
+; CHECK-NEXT:    je _quux ## TAILCALL
+; CHECK-NEXT:  ## %bb.2: ## %return
+; CHECK-NEXT:    retq
+entry:
+  %val = load i32, ptr @i, align 4
+  switch i32 %val, label %return [
+    i32 2, label %case_2
+    i32 5, label %case_5
+  ]
+
+case_2:
+  %rv_unused = tail call i32 @quux()
+  br label %return
+
+case_5:
+  %rv = tail call i32 @qux()
+  br label %return
+
+return:
+  %phi = phi i32 [ undef, %case_2 ], [ %rv, %case_5 ], [ undef, %entry ]
+  ret i32 %phi
+}
+
 declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1)
 declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1)
 declare noalias ptr @malloc(i64)
 declare ptr @strcpy(ptr noalias returned writeonly, ptr noalias nocapture readonly)
 declare ptr @baz(ptr, ptr)
+declare i32 @qux()
+declare i32 @quux()
diff --git a/llvm/test/ExecutionEngine/JITLink/Generic/sectcreate.test b/llvm/test/ExecutionEngine/JITLink/Generic/sectcreate.test
index 33ad551..ec71011 100644
--- a/llvm/test/ExecutionEngine/JITLink/Generic/sectcreate.test
+++ b/llvm/test/ExecutionEngine/JITLink/Generic/sectcreate.test
@@ -5,4 +5,7 @@
 #
 # Use -sectcreate to create a section from a data file.
 
+# Jitlink does not support ARM64 COFF files.
+# UNSUPPORTED: target=aarch64-pc-windows-{{.*}}
+
 # jitlink-check: *{4}foo = 0x2a2a5a5a
 \ No newline at end of file
diff --git a/llvm/test/MC/AMDGPU/gfx11-promotions.s b/llvm/test/MC/AMDGPU/gfx11-promotions.s
index 0bd9026..67e7bea 100644
--- a/llvm/test/MC/AMDGPU/gfx11-promotions.s
+++ b/llvm/test/MC/AMDGPU/gfx11-promotions.s
@@ -337,17 +337,17 @@ v_dot2_f32_f16_e64 v0, v1, v2, v3
 //===----------------------------------------------------------------------===//
 
 v_dot2_f32_f16 v0, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x00,0x00,0x13,0xcc,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+// GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x00,0x40,0x13,0xcc,0xe9,0x04,0x0e,0x1c,0x01,0x77,0x39,0x05]
 
 v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x00,0x00,0x13,0xcc,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+// GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x00,0x40,0x13,0xcc,0xe9,0x04,0x0e,0x1c,0x01,0x77,0x39,0x05]
 
 //===----------------------------------------------------------------------===//
 // VOP3P.DPP16.
 //===----------------------------------------------------------------------===//
 
 v_dot2_f32_f16 v0, v1, v2, v3 quad_perm:[1,2,3,0]
-// GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0x00,0x00,0x13,0xcc,0xfa,0x04,0x0e,0x04,0x01,0x39,0x00,0xff]
+// GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0x00,0x40,0x13,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x39,0x00,0xff]
 
 v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 quad_perm:[1,2,3,0]
-// GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0x00,0x00,0x13,0xcc,0xfa,0x04,0x0e,0x04,0x01,0x39,0x00,0xff]
+// GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0x00,0x40,0x13,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x39,0x00,0xff]
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3p_dpp16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3p_dpp16.s
index 2cfb8ab..3ff4ed2 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3p_dpp16.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3p_dpp16.s
@@ -2,10 +2,10 @@
 // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s
 
 v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[0,0,0] neg_hi:[0,0,0] quad_perm:[2,2,3,1] bound_ctrl:0 fi:1
-// GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 quad_perm:[2,2,3,1] row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0x00,0x00,0x13,0xcc,0xfa,0x04,0x0e,0x04,0x01,0x7a,0x04,0xff]
+// GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 quad_perm:[2,2,3,1] row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0x00,0x40,0x13,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x7a,0x04,0xff]
 
 v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[1,1,0] neg_hi:[1,0,1] quad_perm:[3,2,1,0] bank_mask:0xe
-// GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 neg_lo:[1,1,0] neg_hi:[1,0,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xe ; encoding: [0x00,0x05,0x13,0xcc,0xfa,0x04,0x0e,0x64,0x01,0x1b,0x00,0xfe]
+// GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 neg_lo:[1,1,0] neg_hi:[1,0,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xe ; encoding: [0x00,0x45,0x13,0xcc,0xfa,0x04,0x0e,0x7c,0x01,0x1b,0x00,0xfe]
 
 v_fma_mix_f32 v0, v1, v2, v3 op_sel:[0,0,0] row_ror:7 bank_mask:0x1 bound_ctrl:0
 // GFX11: v_fma_mix_f32_e64_dpp v0, v1, v2, v3 row_ror:7 row_mask:0xf bank_mask:0x1 ; encoding: [0x00,0x00,0x20,0xcc,0xfa,0x04,0x0e,0x04,0x01,0x27,0x01,0xf1]
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3p_dpp8.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3p_dpp8.s
index 2656ba0..3fb993d 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3p_dpp8.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3p_dpp8.s
@@ -15,4 +15,4 @@ v_fma_mixlo_f16 v0, abs(v1), -v2, abs(v3) op_sel:[1,0,0] op_sel_hi:[1,0,0] dpp8:
 // GFX11: encoding: [0x00,0x0d,0x21,0xcc,0xe9,0x04,0x0e,0x4c,0x01,0x92,0x44,0x92]
 
 v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 neg_lo:[0,1,1] neg_hi:[1,0,1] dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: encoding: [0x00,0x05,0x13,0xcc,0xe9,0x04,0x0e,0xc4,0x01,0x77,0x39,0x05]
+// GFX11: encoding: [0x00,0x45,0x13,0xcc,0xe9,0x04,0x0e,0xdc,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp16.s
index 75bd169..a636068 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp16.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp16.s
@@ -2,10 +2,10 @@
 // RUN: llvm-mc -arch=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s
 
 v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[0,0,0] neg_hi:[0,0,0] quad_perm:[2,2,3,1] bound_ctrl:0 fi:1
-// GFX12: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 quad_perm:[2,2,3,1] row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0x00,0x00,0x13,0xcc,0xfa,0x04,0x0e,0x04,0x01,0x7a,0x04,0xff]
+// GFX12: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 quad_perm:[2,2,3,1] row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0x00,0x40,0x13,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x7a,0x04,0xff]
 
 v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[1,1,0] neg_hi:[1,0,1] quad_perm:[3,2,1,0] bank_mask:0xe
-// GFX12: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 neg_lo:[1,1,0] neg_hi:[1,0,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xe ; encoding: [0x00,0x05,0x13,0xcc,0xfa,0x04,0x0e,0x64,0x01,0x1b,0x00,0xfe]
+// GFX12: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 neg_lo:[1,1,0] neg_hi:[1,0,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xe ; encoding: [0x00,0x45,0x13,0xcc,0xfa,0x04,0x0e,0x7c,0x01,0x1b,0x00,0xfe]
 
 v_fma_mix_f32 v0, v1, v2, v3 op_sel:[0,0,0] row_ror:7 bank_mask:0x1 bound_ctrl:0
 // GFX12: v_fma_mix_f32_e64_dpp v0, v1, v2, v3 row_ror:7 row_mask:0xf bank_mask:0x1 ; encoding: [0x00,0x00,0x20,0xcc,0xfa,0x04,0x0e,0x04,0x01,0x27,0x01,0xf1]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp8.s
index 14cf169..2993393 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp8.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp8.s
@@ -15,7 +15,7 @@ v_fma_mixlo_f16 v0, abs(v1), -v2, abs(v3) op_sel:[1,0,0] op_sel_hi:[1,0,0] dpp8:
 // GFX12: encoding: [0x00,0x0d,0x21,0xcc,0xe9,0x04,0x0e,0x4c,0x01,0x92,0x44,0x92]
 
 v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 neg_lo:[0,1,1] neg_hi:[1,0,1] dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: encoding: [0x00,0x05,0x13,0xcc,0xe9,0x04,0x0e,0xc4,0x01,0x77,0x39,0x05]
+// GFX12: encoding: [0x00,0x45,0x13,0xcc,0xe9,0x04,0x0e,0xdc,0x01,0x77,0x39,0x05]
 
 v_dot4_f32_fp8_bf8 v0, v1, v2, v3 dpp8:[0,1,2,3,4,5,6,7]
 // GFX12: v_dot4_f32_fp8_bf8_e64_dpp v0, v1, v2, v3 dpp8:[0,1,2,3,4,5,6,7] ; encoding: [0x00,0x40,0x24,0xcc,0xe9,0x04,0x0e,0x1c,0x01,0x88,0xc6,0xfa]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3p_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3p_dpp16.txt
index 6b23036..ceca6d9 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3p_dpp16.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3p_dpp16.txt
@@ -1,11 +1,11 @@
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX11 %s
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-WavefrontSize32,+WavefrontSize64 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX11 %s
 
-# GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 neg_lo:[1,1,0] neg_hi:[1,0,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xe ; encoding: [0x00,0x05,0x13,0xcc,0xfa,0x04,0x0e,0x64,0x01,0x1b,0x00,0xfe]
-0x00,0x05,0x13,0xcc,0xfa,0x04,0x0e,0x64,0x01,0x1b,0x00,0xfe
+# GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 neg_lo:[1,1,0] neg_hi:[1,0,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xe ; encoding: [0x00,0x45,0x13,0xcc,0xfa,0x04,0x0e,0x7c,0x01,0x1b,0x00,0xfe]
+0x00,0x45,0x13,0xcc,0xfa,0x04,0x0e,0x7c,0x01,0x1b,0x00,0xfe
 
-# GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 quad_perm:[2,2,3,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; encoding: [0x00,0x00,0x13,0xcc,0xfa,0x04,0x0e,0x04,0x01,0x7a,0x0c,0xff]
-0x00,0x00,0x13,0xcc,0xfa,0x04,0x0e,0x04,0x01,0x7a,0x0c,0xff
+# GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 quad_perm:[2,2,3,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; encoding: [0x00,0x40,0x13,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x7a,0x0c,0xff]
+0x00,0x40,0x13,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x7a,0x0c,0xff
 
 # GFX11: v_fma_mix_f32_e64_dpp v0, v1, v2, v3 row_ror:7 row_mask:0xf bank_mask:0x1 bound_ctrl:1 ; encoding: [0x00,0x00,0x20,0xcc,0xfa,0x04,0x0e,0x04,0x01,0x27,0x09,0xf1]
 0x00,0x00,0x20,0xcc,0xfa,0x04,0x0e,0x04,0x01,0x27,0x09,0xf1
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3p_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3p_dpp8.txt
index 89c9b54..57c9617 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3p_dpp8.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3p_dpp8.txt
@@ -1,8 +1,8 @@
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX11 %s
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-WavefrontSize32,+WavefrontSize64 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX11 %s
 
-# GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 neg_lo:[0,1,1] neg_hi:[1,0,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x00,0x05,0x13,0xcc,0xe9,0x04,0x0e,0xc4,0x01,0x77,0x39,0x05]
-0x00,0x05,0x13,0xcc,0xe9,0x04,0x0e,0xc4,0x01,0x77,0x39,0x05
+# GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 neg_lo:[0,1,1] neg_hi:[1,0,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x00,0x45,0x13,0xcc,0xe9,0x04,0x0e,0xdc,0x01,0x77,0x39,0x05]
+0x00,0x45,0x13,0xcc,0xe9,0x04,0x0e,0xdc,0x01,0x77,0x39,0x05
 
 # GFX11: v_fma_mix_f32_e64_dpp v0, v1, v2, v3 clamp dpp8:[2,2,2,2,4,4,4,4] fi:1 ; encoding: [0x00,0x80,0x20,0xcc,0xea,0x04,0x0e,0x04,0x01,0x92,0x44,0x92]
 0x00,0x80,0x20,0xcc,0xea,0x04,0x0e,0x04,0x01,0x92,0x44,0x92
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3p_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3p_dpp16.txt
index 52fd053..10f4384 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3p_dpp16.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3p_dpp16.txt
@@ -1,11 +1,11 @@
 # RUN: llvm-mc -arch=amdgcn -mcpu=gfx1200 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX12 %s
 # RUN: llvm-mc -arch=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX12 %s
 
-# GFX12: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 neg_lo:[1,1,0] neg_hi:[1,0,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xe ; encoding: [0x00,0x05,0x13,0xcc,0xfa,0x04,0x0e,0x64,0x01,0x1b,0x00,0xfe]
-0x00,0x05,0x13,0xcc,0xfa,0x04,0x0e,0x64,0x01,0x1b,0x00,0xfe
+# GFX12: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 neg_lo:[1,1,0] neg_hi:[1,0,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xe ; encoding: [0x00,0x45,0x13,0xcc,0xfa,0x04,0x0e,0x7c,0x01,0x1b,0x00,0xfe]
+0x00,0x45,0x13,0xcc,0xfa,0x04,0x0e,0x7c,0x01,0x1b,0x00,0xfe
 
-# GFX12: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 quad_perm:[2,2,3,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; encoding: [0x00,0x00,0x13,0xcc,0xfa,0x04,0x0e,0x04,0x01,0x7a,0x0c,0xff]
-0x00,0x00,0x13,0xcc,0xfa,0x04,0x0e,0x04,0x01,0x7a,0x0c,0xff
+# GFX12: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 quad_perm:[2,2,3,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; encoding: [0x00,0x40,0x13,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x7a,0x0c,0xff]
+0x00,0x40,0x13,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x7a,0x0c,0xff
 
 # GFX12: v_fma_mix_f32_e64_dpp v0, v1, v2, v3 row_ror:7 row_mask:0xf bank_mask:0x1 bound_ctrl:1 ; encoding: [0x00,0x00,0x20,0xcc,0xfa,0x04,0x0e,0x04,0x01,0x27,0x09,0xf1]
 0x00,0x00,0x20,0xcc,0xfa,0x04,0x0e,0x04,0x01,0x27,0x09,0xf1
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3p_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3p_dpp8.txt
index 688212e..2fb9c23 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3p_dpp8.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3p_dpp8.txt
@@ -1,8 +1,8 @@
 # RUN: llvm-mc -arch=amdgcn -mcpu=gfx1200 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX12 %s
 # RUN: llvm-mc -arch=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX12 %s
 
-# GFX12: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 neg_lo:[0,1,1] neg_hi:[1,0,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x00,0x05,0x13,0xcc,0xe9,0x04,0x0e,0xc4,0x01,0x77,0x39,0x05]
-0x00,0x05,0x13,0xcc,0xe9,0x04,0x0e,0xc4,0x01,0x77,0x39,0x05
+# GFX12: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 neg_lo:[0,1,1] neg_hi:[1,0,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x00,0x45,0x13,0xcc,0xe9,0x04,0x0e,0xdc,0x01,0x77,0x39,0x05]
+0x00,0x45,0x13,0xcc,0xe9,0x04,0x0e,0xdc,0x01,0x77,0x39,0x05
 
 # GFX12: v_fma_mix_f32_e64_dpp v0, v1, v2, v3 clamp dpp8:[2,2,2,2,4,4,4,4] fi:1 ; encoding: [0x00,0x80,0x20,0xcc,0xea,0x04,0x0e,0x04,0x01,0x92,0x44,0x92]
 0x00,0x80,0x20,0xcc,0xea,0x04,0x0e,0x04,0x01,0x92,0x44,0x92
diff --git a/llvm/test/Other/new-pm-defaults.ll b/llvm/test/Other/new-pm-defaults.ll
index 51fb93d..285077f 100644
--- a/llvm/test/Other/new-pm-defaults.ll
+++ b/llvm/test/Other/new-pm-defaults.ll
@@ -72,10 +72,6 @@
 ; RUN:     | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-O3,%llvmcheckext,CHECK-EP-OPTIMIZER-LAST,CHECK-O23SZ
 
 ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \
-; RUN:     -passes='default<O3>' -enable-jump-table-to-switch -S  %s 2>&1 \
-; RUN:     | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-O3,CHECK-JUMP-TABLE-TO-SWITCH,CHECK-O23SZ,%llvmcheckext
-
-; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \
 ; RUN:     -passes='default<O3>' -enable-matrix -S  %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-O3,CHECK-O23SZ,%llvmcheckext,CHECK-MATRIX
 
@@ -155,7 +151,7 @@
 ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis
 ; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass
 ; CHECK-O23SZ-NEXT: Invalidating analysis: LazyValueAnalysis
-; CHECK-JUMP-TABLE-TO-SWITCH-NEXT: Running pass: JumpTableToSwitchPass
+; CHECK-O23SZ-NEXT: Running pass: JumpTableToSwitchPass
 ; CHECK-O-NEXT: Running pass: SimplifyCFGPass
 ; CHECK-O-NEXT: Running pass: InstCombinePass
 ; CHECK-O23SZ-NEXT: Running pass: AggressiveInstCombinePass
diff --git a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll
index 064362e..29a4d79 100644
--- a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll
@@ -90,6 +90,7 @@
 ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis
 ; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass
 ; CHECK-O23SZ-NEXT: Invalidating analysis: LazyValueAnalysis
+; CHECK-O23SZ-NEXT: Running pass: JumpTableToSwitchPass
 ; CHECK-O-NEXT: Running pass: SimplifyCFGPass
 ; CHECK-O-NEXT: Running pass: InstCombinePass
 ; CHECK-O23SZ-NEXT: Running pass: AggressiveInstCombinePass
diff --git a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
index 19a4486..bf06782 100644
--- a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
@@ -78,6 +78,7 @@
 ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis
 ; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass
 ; CHECK-O23SZ-NEXT: Invalidating analysis: LazyValueAnalysis
+; CHECK-O23SZ-NEXT: Running pass: JumpTableToSwitchPass
 ; CHECK-O-NEXT: Running pass: SimplifyCFGPass
 ; CHECK-O-NEXT: Running pass: InstCombinePass
 ; CHECK-O23SZ-NEXT: Running pass: AggressiveInstCombinePass
diff --git a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
index ac80a31..0cc6112 100644
--- a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
@@ -86,6 +86,7 @@
 ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis
 ; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass
 ; CHECK-O23SZ-NEXT: Invalidating analysis: LazyValueAnalysis
+; CHECK-O23SZ-NEXT: Running pass: JumpTableToSwitchPass
 ; CHECK-O-NEXT: Running pass: SimplifyCFGPass
 ; CHECK-O-NEXT: Running pass: InstCombinePass
 ; CHECK-O23SZ-NEXT: Running pass: AggressiveInstCombinePass
diff --git a/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll
index 6486639..0e58397 100644
--- a/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll
@@ -121,6 +121,7 @@
 ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis
 ; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass
 ; CHECK-O23SZ-NEXT: Invalidating analysis: LazyValueAnalysis
+; CHECK-O23SZ-NEXT: Running pass: JumpTableToSwitchPass
 ; CHECK-O-NEXT: Running pass: SimplifyCFGPass
 ; CHECK-O-NEXT: Running pass: InstCombinePass
 ; CHECK-O23SZ-NEXT: Running pass: AggressiveInstCombinePass
diff --git a/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll
index 09f9f0f..68c2e58 100644
--- a/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll
@@ -118,6 +118,7 @@
 ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis
 ; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass
 ; CHECK-O23SZ-NEXT: Invalidating analysis: LazyValueAnalysis
+; CHECK-O23SZ-NEXT: Running pass: JumpTableToSwitchPass
 ; CHECK-O-NEXT: Running pass: SimplifyCFGPass
 ; CHECK-O-NEXT: Running pass: InstCombinePass
 ; CHECK-O-NEXT: Running analysis: BlockFrequencyAnalysis on foo
diff --git a/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll
index 47bdbfd..8311a00 100644
--- a/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll
@@ -90,6 +90,7 @@
 ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis
 ; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass
 ; CHECK-O23SZ-NEXT: Invalidating analysis: LazyValueAnalysis
+; CHECK-O23SZ-NEXT: Running pass: JumpTableToSwitchPass
 ; CHECK-O-NEXT: Running pass: SimplifyCFGPass
 ; CHECK-O-NEXT: Running pass: InstCombinePass
 ; CHECK-O23SZ-NEXT: Running pass: AggressiveInstCombinePass
diff --git a/llvm/test/TableGen/HwModeEncodeDecode2.td b/llvm/test/TableGen/HwModeEncodeDecode2.td
new file mode 100644
index 0000000..5159501
--- /dev/null
+++ b/llvm/test/TableGen/HwModeEncodeDecode2.td
@@ -0,0 +1,119 @@
+// RUN: llvm-tblgen -gen-disassembler -I %p/../../include %s | \
+// RUN:     FileCheck %s --check-prefix=DECODER
+// RUN: llvm-tblgen -gen-disassembler --suppress-per-hwmode-duplicates -I \
+// RUN:     %p/../../include %s | FileCheck %s --check-prefix=DECODER-SUPPRESS
+
+// Test duplicate table suppression for per-HwMode decoders.
+
+include "llvm/Target/Target.td"
+
+def archInstrInfo : InstrInfo { }
+
+def arch : Target {
+    let InstructionSet = archInstrInfo;
+}
+
+def  Myi32  : Operand<i32> {
+  let DecoderMethod = "DecodeMyi32";
+}
+
+def HasA : Predicate<"Subtarget->hasA()">;
+def HasB : Predicate<"Subtarget->hasB()">;
+
+def ModeA : HwMode<"+a", [HasA]>;
+def ModeB : HwMode<"+b", [HasB]>;
+
+
+def fooTypeEncA : InstructionEncoding {
+  let Size = 4;
+  field bits<32> SoftFail = 0;
+  bits<32> Inst;
+  bits<8> factor;
+  let Inst{7...0} = factor;
+  let Inst{3...2} = 0b11;
+  let Inst{1...0} = 0b00;
+}
+
+def fooTypeEncB : InstructionEncoding {
+  let Size = 4;
+  field bits<32> SoftFail = 0;
+  bits<32> Inst;
+  bits<8> factor;
+  let Inst{15...8} = factor;
+  let Inst{1...0} = 0b11;
+}
+
+let OutOperandList = (outs) in {
+  def foo : Instruction {
+    let InOperandList = (ins i32imm:$factor);
+    let EncodingInfos = EncodingByHwMode<
+      [ModeA, ModeB], [fooTypeEncA, fooTypeEncB]
+    >;
+    let AsmString = "foo  $factor";
+  }
+
+  // Encoding not overridden, same namespace:
+  // In the default case, this instruction is duplicated into both ModeA and
+  // ModeB decoder tables.
+  // In the suppressed case, this instruction appears in a single decoder table.
+  def bar: Instruction {
+    let InOperandList = (ins i32imm:$factor);
+    let Size = 4;
+    bits<32> Inst;
+    bits<32> SoftFail;
+    bits<8> factor;
+    let Inst{31...24} = factor;
+    let Inst{1...0} = 0b10;
+    let AsmString = "bar  $factor";
+  }
+
+  def baz : Instruction {
+    let InOperandList = (ins i32imm:$factor);
+    bits<32> Inst;
+    let EncodingInfos = EncodingByHwMode<
+      [ModeB], [fooTypeEncA]
+    >;
+    let AsmString = "foo  $factor";
+  }
+
+  // Encoding not overridden, different namespace:
+  // In the default case, this instruction is duplicated into two Alt decoder
+  // tables (ModeA and ModeB).
+  // In the suppressed case, this instruction appears in a single decoder table.
+  def unrelated: Instruction {
+    let DecoderNamespace = "Alt";
+    let InOperandList = (ins i32imm:$factor);
+    let Size = 4;
+    bits<32> Inst;
+    bits<32> SoftFail;
+    bits<8> factor;
+    let Inst{31...24} = factor;
+    let Inst{1...0} = 0b10;
+    let AsmString = "unrelated  $factor";
+  }
+}
+
+// DECODER-LABEL: DecoderTableAlt_ModeA32[] =
+// DECODER-DAG: Opcode: unrelated
+// DECODER-LABEL: DecoderTableAlt_ModeB32[] =
+// DECODER-DAG: Opcode: unrelated
+// DECODER-LABEL: DecoderTable_ModeA32[] =
+// DECODER-DAG: Opcode: fooTypeEncA:foo
+// DECODER-DAG: Opcode: bar
+// DECODER-LABEL: DecoderTable_ModeB32[] =
+// DECODER-DAG: Opcode: fooTypeEncB:foo
+// DECODER-DAG: Opcode: fooTypeEncA:baz
+// DECODER-DAG: Opcode: bar
+
+
+// DECODER-SUPPRESS-LABEL: DecoderTableAlt_AllModes32[] =
+// DECODER-SUPPRESS-DAG: Opcode: unrelated
+// DECODER-SUPPRESS-LABEL: DecoderTable_AllModes32[] =
+// DECODER-SUPPRESS-DAG: Opcode: bar
+// DECODER-SUPPRESS-LABEL: DecoderTable_ModeA32[] =
+// DECODER-SUPPRESS-DAG: Opcode: fooTypeEncA:foo
+// DECODER-SUPPRESS-NOT: Opcode: bar
+// DECODER-SUPPRESS-LABEL: DecoderTable_ModeB32[] =
+// DECODER-SUPPRESS-DAG: Opcode: fooTypeEncB:foo
+// DECODER-SUPPRESS-DAG: Opcode: fooTypeEncA:baz
+// DECODER-SUPPRESS-NOT: Opcode: bar
diff --git a/llvm/test/Transforms/ConstraintElimination/minmax.ll b/llvm/test/Transforms/ConstraintElimination/minmax.ll
index ab3e9f3..029b650 100644
--- a/llvm/test/Transforms/ConstraintElimination/minmax.ll
+++ b/llvm/test/Transforms/ConstraintElimination/minmax.ll
@@ -611,8 +611,7 @@ define i64 @pr82271(i32 %a, i32 %b){
 ; CHECK-NEXT:    [[SA:%.*]] = sext i32 [[A]] to i64
 ; CHECK-NEXT:    [[SB:%.*]] = sext i32 [[B]] to i64
 ; CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[SA]], 1
-; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[SB]], i64 [[ADD]])
-; CHECK-NEXT:    ret i64 [[SMAX]]
+; CHECK-NEXT:    ret i64 [[SB]]
 ; CHECK:       else:
 ; CHECK-NEXT:    ret i64 0
 ;
@@ -641,8 +640,7 @@ define i64 @pr82271_sext_zext_nneg(i32 %a, i32 %b){
 ; CHECK-NEXT:    [[SA:%.*]] = sext i32 [[A]] to i64
 ; CHECK-NEXT:    [[SB:%.*]] = zext nneg i32 [[B]] to i64
 ; CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[SA]], 1
-; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[SB]], i64 [[ADD]])
-; CHECK-NEXT:    ret i64 [[SMAX]]
+; CHECK-NEXT:    ret i64 [[SB]]
 ; CHECK:       else:
 ; CHECK-NEXT:    ret i64 0
 ;
@@ -671,8 +669,7 @@ define i64 @pr82271_zext_nneg(i32 %a, i32 %b){
 ; CHECK-NEXT:    [[SA:%.*]] = zext nneg i32 [[A]] to i64
 ; CHECK-NEXT:    [[SB:%.*]] = zext nneg i32 [[B]] to i64
 ; CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[SA]], 1
-; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[SB]], i64 [[ADD]])
-; CHECK-NEXT:    ret i64 [[SMAX]]
+; CHECK-NEXT:    ret i64 [[SB]]
 ; CHECK:       else:
 ; CHECK-NEXT:    ret i64 0
 ;
diff --git a/llvm/test/Transforms/ConstraintElimination/sext.ll b/llvm/test/Transforms/ConstraintElimination/sext.ll
index ed8dd50..5a8a37d 100644
--- a/llvm/test/Transforms/ConstraintElimination/sext.ll
+++ b/llvm/test/Transforms/ConstraintElimination/sext.ll
@@ -11,8 +11,7 @@ define i1 @cmp_sext(i32 %a, i32 %b){
 ; CHECK-NEXT:    [[SA:%.*]] = sext i32 [[A]] to i64
 ; CHECK-NEXT:    [[SB:%.*]] = sext i32 [[B]] to i64
 ; CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[SA]], 1
-; CHECK-NEXT:    [[CMP2:%.*]] = icmp sge i64 [[SB]], [[ADD]]
-; CHECK-NEXT:    ret i1 [[CMP2]]
+; CHECK-NEXT:    ret i1 true
 ; CHECK:       else:
 ; CHECK-NEXT:    ret i1 false
 ;
@@ -31,33 +30,32 @@ else:
   ret i1 false
 }
 
-define i1 @cmp_sext_positive_increment(i32 %a, i32 %b, i64 %c){
-; CHECK-LABEL: define i1 @cmp_sext_positive_increment(
-; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]], i64 [[C:%.*]]) {
+define i1 @cmp_sext_add(i32 %a, i32 %b){
+; CHECK-LABEL: define i1 @cmp_sext_add(
+; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[POS:%.*]] = icmp sgt i64 [[C]], 0
-; CHECK-NEXT:    call void @llvm.assume(i1 [[POS]])
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[A]], [[B]]
 ; CHECK-NEXT:    br i1 [[CMP]], label [[THEN:%.*]], label [[ELSE:%.*]]
 ; CHECK:       then:
-; CHECK-NEXT:    [[SA:%.*]] = sext i32 [[A]] to i64
-; CHECK-NEXT:    [[SB:%.*]] = sext i32 [[B]] to i64
-; CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[SA]], [[C]]
-; CHECK-NEXT:    [[CMP2:%.*]] = icmp sge i64 [[SB]], [[ADD]]
-; CHECK-NEXT:    ret i1 [[CMP2]]
+; CHECK-NEXT:    [[A1:%.*]] = add nsw i32 [[A]], 1
+; CHECK-NEXT:    [[B1:%.*]] = add nsw i32 [[B]], 1
+; CHECK-NEXT:    [[SA:%.*]] = sext i32 [[A1]] to i64
+; CHECK-NEXT:    [[SB:%.*]] = sext i32 [[B1]] to i64
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[SA]], 1
+; CHECK-NEXT:    ret i1 true
 ; CHECK:       else:
 ; CHECK-NEXT:    ret i1 false
 ;
 entry:
-  %pos = icmp sgt i64 %c, 0
-  call void @llvm.assume(i1 %pos)
   %cmp = icmp slt i32 %a, %b
   br i1 %cmp, label %then, label %else
 
 then:
-  %sa = sext i32 %a to i64
-  %sb = sext i32 %b to i64
-  %add = add nsw i64 %sa, %c
+  %a1 = add nsw i32 %a, 1
+  %b1 = add nsw i32 %b, 1
+  %sa = sext i32 %a1 to i64
+  %sb = sext i32 %b1 to i64
+  %add = add nsw i64 %sa, 1
   %cmp2 = icmp sge i64 %sb, %add
   ret i1 %cmp2
 
@@ -65,30 +63,33 @@ else:
   ret i1 false
 }
 
-define i1 @cmp_sext_sgt(i32 %a, i32 %b){
-; CHECK-LABEL: define i1 @cmp_sext_sgt(
-; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) {
+define i1 @cmp_sext_dynamic_increment(i32 %a, i32 %b, i64 %c){
+; CHECK-LABEL: define i1 @cmp_sext_dynamic_increment(
+; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]], i64 [[C:%.*]]) {
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[POS:%.*]] = icmp slt i64 [[C]], 2
+; CHECK-NEXT:    call void @llvm.assume(i1 [[POS]])
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[A]], [[B]]
 ; CHECK-NEXT:    br i1 [[CMP]], label [[THEN:%.*]], label [[ELSE:%.*]]
 ; CHECK:       then:
 ; CHECK-NEXT:    [[SA:%.*]] = sext i32 [[A]] to i64
 ; CHECK-NEXT:    [[SB:%.*]] = sext i32 [[B]] to i64
-; CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[SA]], 1
-; CHECK-NEXT:    [[CMP2:%.*]] = icmp sgt i64 [[SB]], [[ADD]]
-; CHECK-NEXT:    ret i1 [[CMP2]]
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[SA]], [[C]]
+; CHECK-NEXT:    ret i1 true
 ; CHECK:       else:
 ; CHECK-NEXT:    ret i1 false
 ;
 entry:
+  %pos = icmp slt i64 %c, 2
+  call void @llvm.assume(i1 %pos)
   %cmp = icmp slt i32 %a, %b
   br i1 %cmp, label %then, label %else
 
 then:
   %sa = sext i32 %a to i64
   %sb = sext i32 %b to i64
-  %add = add nsw i64 %sa, 1
-  %cmp2 = icmp sgt i64 %sb, %add
+  %add = add nsw i64 %sa, %c
+  %cmp2 = icmp sge i64 %sb, %add
   ret i1 %cmp2
 
 else:
@@ -105,8 +106,7 @@ define i1 @cmp_zext_nneg(i32 %a, i32 %b){
 ; CHECK-NEXT:    [[SA:%.*]] = zext nneg i32 [[A]] to i64
 ; CHECK-NEXT:    [[SB:%.*]] = zext nneg i32 [[B]] to i64
 ; CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[SA]], 1
-; CHECK-NEXT:    [[CMP2:%.*]] = icmp sge i64 [[SB]], [[ADD]]
-; CHECK-NEXT:    ret i1 [[CMP2]]
+; CHECK-NEXT:    ret i1 true
 ; CHECK:       else:
 ; CHECK-NEXT:    ret i1 false
 ;
@@ -216,3 +216,33 @@ then:
 else:
   ret i1 false
 }
+
+define i1 @cmp_sext_sgt(i32 %a, i32 %b){
+; CHECK-LABEL: define i1 @cmp_sext_sgt(
+; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[A]], [[B]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[THEN:%.*]], label [[ELSE:%.*]]
+; CHECK:       then:
+; CHECK-NEXT:    [[SA:%.*]] = sext i32 [[A]] to i64
+; CHECK-NEXT:    [[SB:%.*]] = sext i32 [[B]] to i64
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[SA]], 1
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp sgt i64 [[SB]], [[ADD]]
+; CHECK-NEXT:    ret i1 [[CMP2]]
+; CHECK:       else:
+; CHECK-NEXT:    ret i1 false
+;
+entry:
+  %cmp = icmp slt i32 %a, %b
+  br i1 %cmp, label %then, label %else
+
+then:
+  %sa = sext i32 %a to i64
+  %sb = sext i32 %b to i64
+  %add = add nsw i64 %sa, 1
+  %cmp2 = icmp sgt i64 %sb, %add
+  ret i1 %cmp2
+
+else:
+  ret i1 false
+}
diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/min-max.ll b/llvm/test/Transforms/CorrelatedValuePropagation/min-max.ll
index 705b6e9..c9ee233 100644
--- a/llvm/test/Transforms/CorrelatedValuePropagation/min-max.ll
+++ b/llvm/test/Transforms/CorrelatedValuePropagation/min-max.ll
@@ -71,7 +71,6 @@ define i8 @test6(i8 %x) {
 ; CHECK-LABEL: @test6(
 ; CHECK-NEXT:    [[LIM:%.*]] = icmp uge i8 [[X:%.*]], 42
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[LIM]])
-; CHECK-NEXT:    [[R:%.*]] = call i8 @llvm.umin.i8(i8 [[X]], i8 42)
 ; CHECK-NEXT:    ret i8 42
 ;
   %lim = icmp uge i8 %x, 42
@@ -119,7 +118,6 @@ define i8 @test10(i8 %x) {
 ; CHECK-LABEL: @test10(
 ; CHECK-NEXT:    [[LIM:%.*]] = icmp ule i8 [[X:%.*]], 42
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[LIM]])
-; CHECK-NEXT:    [[R:%.*]] = call i8 @llvm.umax.i8(i8 [[X]], i8 42)
 ; CHECK-NEXT:    ret i8 42
 ;
   %lim = icmp ule i8 %x, 42
@@ -167,7 +165,6 @@ define i8 @test14(i8 %x) {
 ; CHECK-LABEL: @test14(
 ; CHECK-NEXT:    [[LIM:%.*]] = icmp sge i8 [[X:%.*]], 42
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[LIM]])
-; CHECK-NEXT:    [[R:%.*]] = call i8 @llvm.smin.i8(i8 [[X]], i8 42)
 ; CHECK-NEXT:    ret i8 42
 ;
   %lim = icmp sge i8 %x, 42
@@ -179,8 +176,8 @@ define i8 @test15(i8 %x) {
 ; CHECK-LABEL: @test15(
 ; CHECK-NEXT:    [[LIM:%.*]] = icmp sge i8 [[X:%.*]], 41
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[LIM]])
-; CHECK-NEXT:    [[R:%.*]] = call i8 @llvm.smin.i8(i8 [[X]], i8 42)
-; CHECK-NEXT:    ret i8 [[R]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 @llvm.umin.i8(i8 [[X]], i8 42)
+; CHECK-NEXT:    ret i8 [[TMP1]]
 ;
   %lim = icmp sge i8 %x, 41
   call void @llvm.assume(i1 %lim)
@@ -192,8 +189,8 @@ define i8 @test16(i8 %x) {
 ; CHECK-LABEL: @test16(
 ; CHECK-NEXT:    [[LIM:%.*]] = icmp sge i8 [[X:%.*]], 41
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[LIM]])
-; CHECK-NEXT:    [[R:%.*]] = call i8 @llvm.smax.i8(i8 [[X]], i8 42)
-; CHECK-NEXT:    ret i8 [[R]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 @llvm.umax.i8(i8 [[X]], i8 42)
+; CHECK-NEXT:    ret i8 [[TMP1]]
 ;
   %lim = icmp sge i8 %x, 41
   call void @llvm.assume(i1 %lim)
@@ -215,7 +212,6 @@ define i8 @test18(i8 %x) {
 ; CHECK-LABEL: @test18(
 ; CHECK-NEXT:    [[LIM:%.*]] = icmp sle i8 [[X:%.*]], 42
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[LIM]])
-; CHECK-NEXT:    [[R:%.*]] = call i8 @llvm.smax.i8(i8 [[X]], i8 42)
 ; CHECK-NEXT:    ret i8 42
 ;
   %lim = icmp sle i8 %x, 42
@@ -235,3 +231,127 @@ define i8 @test19(i8 %x) {
   %r = call i8 @llvm.smax(i8 %x, i8 42)
   ret i8 %r
 }
+
+declare void @body(i32)
+
+define void @test_bidirectional() {
+; CHECK-LABEL: @test_bidirectional(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVAR:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    call void @body(i32 65535)
+; CHECK-NEXT:    [[INC]] = add nsw i32 [[INDVAR]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[INDVAR]], 65535
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %indvar = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %smax = call i32 @llvm.smax.i32(i32 %indvar, i32 65535)
+  call void @body(i32 %smax)
+  %inc = add nsw i32 %indvar, 1
+  %cmp = icmp slt i32 %indvar, 65535
+  br i1 %cmp, label %for.body, label %exit
+
+exit:
+  ret void
+}
+
+define i64 @test_at_use(i1 %cond, i64 %x) {
+; CHECK-LABEL: @test_at_use(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[COND:%.*]], label [[BB1:%.*]], label [[IF_END:%.*]]
+; CHECK:       bb1:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i64 [[X:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END]]
+; CHECK:       if.then:
+; CHECK-NEXT:    ret i64 0
+; CHECK:       if.end:
+; CHECK-NEXT:    [[PHI:%.*]] = phi i64 [ [[X]], [[BB1]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    ret i64 [[PHI]]
+;
+entry:
+  br i1 %cond, label %bb1, label %if.end
+
+bb1:
+  %val = call i64 @llvm.smax.i64(i64 %x, i64 -1)
+  %cmp = icmp slt i64 %x, 0
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  ret i64 0
+
+if.end:
+  %phi = phi i64 [%val, %bb1], [0, %entry]
+  ret i64 %phi
+}
+
+define i8 @test_smax_to_umax_nneg(i8 %a, i8 %b) {
+; CHECK-LABEL: @test_smax_to_umax_nneg(
+; CHECK-NEXT:    [[NNEG_A:%.*]] = and i8 [[A:%.*]], 127
+; CHECK-NEXT:    [[NNEG_B:%.*]] = and i8 [[B:%.*]], 127
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 @llvm.umax.i8(i8 [[NNEG_A]], i8 [[NNEG_B]])
+; CHECK-NEXT:    ret i8 [[TMP1]]
+;
+  %nneg_a = and i8 %a, 127
+  %nneg_b = and i8 %b, 127
+  %ret = call i8 @llvm.smax.i8(i8 %nneg_a, i8 %nneg_b)
+  ret i8 %ret
+}
+
+define i8 @test_smax_to_umax_neg(i8 %a, i8 %b) {
+; CHECK-LABEL: @test_smax_to_umax_neg(
+; CHECK-NEXT:    [[NEG_A:%.*]] = or i8 [[A:%.*]], -128
+; CHECK-NEXT:    [[NEG_B:%.*]] = or i8 [[B:%.*]], -128
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 @llvm.umax.i8(i8 [[NEG_A]], i8 [[NEG_B]])
+; CHECK-NEXT:    ret i8 [[TMP1]]
+;
+  %neg_a = or i8 %a, 128
+  %neg_b = or i8 %b, 128
+  %ret = call i8 @llvm.smax.i8(i8 %neg_a, i8 %neg_b)
+  ret i8 %ret
+}
+
+define i8 @test_smin_to_umin_nneg(i8 %a, i8 %b) {
+; CHECK-LABEL: @test_smin_to_umin_nneg(
+; CHECK-NEXT:    [[NNEG_A:%.*]] = and i8 [[A:%.*]], 127
+; CHECK-NEXT:    [[NNEG_B:%.*]] = and i8 [[B:%.*]], 127
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 @llvm.umin.i8(i8 [[NNEG_A]], i8 [[NNEG_B]])
+; CHECK-NEXT:    ret i8 [[TMP1]]
+;
+  %nneg_a = and i8 %a, 127
+  %nneg_b = and i8 %b, 127
+  %ret = call i8 @llvm.smin.i8(i8 %nneg_a, i8 %nneg_b)
+  ret i8 %ret
+}
+
+define i8 @test_smin_to_umin_neg(i8 %a, i8 %b) {
+; CHECK-LABEL: @test_smin_to_umin_neg(
+; CHECK-NEXT:    [[NEG_A:%.*]] = or i8 [[A:%.*]], -128
+; CHECK-NEXT:    [[NEG_B:%.*]] = or i8 [[B:%.*]], -128
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 @llvm.umin.i8(i8 [[NEG_A]], i8 [[NEG_B]])
+; CHECK-NEXT:    ret i8 [[TMP1]]
+;
+  %neg_a = or i8 %a, 128
+  %neg_b = or i8 %b, 128
+  %ret = call i8 @llvm.smin.i8(i8 %neg_a, i8 %neg_b)
+  ret i8 %ret
+}
+
+define i8 @test_umax_nneg(i8 %a, i8 %b) {
+; CHECK-LABEL: @test_umax_nneg(
+; CHECK-NEXT:    [[NNEG_A:%.*]] = and i8 [[A:%.*]], 127
+; CHECK-NEXT:    [[NNEG_B:%.*]] = and i8 [[B:%.*]], 127
+; CHECK-NEXT:    [[RET:%.*]] = call i8 @llvm.umax.i8(i8 [[NNEG_A]], i8 [[NNEG_B]])
+; CHECK-NEXT:    ret i8 [[RET]]
+;
+  %nneg_a = and i8 %a, 127
+  %nneg_b = and i8 %b, 127
+  %ret = call i8 @llvm.umax.i8(i8 %nneg_a, i8 %nneg_b)
+  ret i8 %ret
+}
diff --git a/llvm/test/Transforms/HotColdSplit/assumption-cache-invalidation.ll b/llvm/test/Transforms/HotColdSplit/assumption-cache-invalidation.ll
index 2154fb5..8bc7114 100644
--- a/llvm/test/Transforms/HotColdSplit/assumption-cache-invalidation.ll
+++ b/llvm/test/Transforms/HotColdSplit/assumption-cache-invalidation.ll
@@ -13,13 +13,13 @@ target triple = "aarch64"
 ; CHECK-NOT: @llvm.assume
 ; CHECK: }
 ; CHECK: declare {{.*}}@llvm.assume
-; CHECK: define {{.*}}@f.cold.1()
-; CHECK-LABEL: newFuncRoot:
-; CHECK: }
-; CHECK: define {{.*}}@f.cold.2(i64 %load1)
+; CHECK: define {{.*}}@f.cold.1(i64 %load1)
 ; CHECK-LABEL: newFuncRoot:
 ; CHECK: %cmp1 = icmp eq i64 %load1, 0
 ; CHECK-NOT: call void @llvm.assume
+; CHECK: define {{.*}}@f.cold.2()
+; CHECK-LABEL: newFuncRoot:
+; CHECK: }
 
 define void @f() {
 entry:
diff --git a/llvm/test/Transforms/HotColdSplit/eh-pads.ll b/llvm/test/Transforms/HotColdSplit/eh-pads.ll
index 415c7e4..ad7baf9 100644
--- a/llvm/test/Transforms/HotColdSplit/eh-pads.ll
+++ b/llvm/test/Transforms/HotColdSplit/eh-pads.ll
@@ -84,13 +84,16 @@ cold4:
 ; CHECK: sink
 
 ; CHECK-LABEL: define {{.*}}@bar.cold.1(
+; CHECK: sideeffect(i32 0)
+
+; CHECK-LABEL: define {{.*}}@bar.cold.2(
 ; CHECK: sideeffect(i32 1)
 
 ; CHECK-LABEL: define {{.*}}@baz.cold.1(
-; CHECK: sideeffect(i32 1)
+; CHECK: sideeffect(i32 0)
 
 ; CHECK-LABEL: define {{.*}}@baz.cold.2(
-; CHECK: sideeffect(i32 0)
+; CHECK: sideeffect(i32 1)
 
 declare void @sideeffect(i32)
 
diff --git a/llvm/test/Transforms/HotColdSplit/outline-disjoint-diamonds.ll b/llvm/test/Transforms/HotColdSplit/outline-disjoint-diamonds.ll
index 65f8aad..0c05598 100644
--- a/llvm/test/Transforms/HotColdSplit/outline-disjoint-diamonds.ll
+++ b/llvm/test/Transforms/HotColdSplit/outline-disjoint-diamonds.ll
@@ -1,10 +1,10 @@
 ; RUN: opt -S -passes=hotcoldsplit -hotcoldsplit-threshold=-1 < %s 2>&1 | FileCheck %s
 
 ; CHECK-LABEL: define {{.*}}@fun
-; CHECK: call {{.*}}@fun.cold.2(
-; CHECK-NEXT: ret void
 ; CHECK: call {{.*}}@fun.cold.1(
 ; CHECK-NEXT: ret void
+; CHECK: call {{.*}}@fun.cold.2(
+; CHECK-NEXT: ret void
 define void @fun() {
 entry:
   br i1 undef, label %A.then, label %A.else
@@ -49,9 +49,10 @@ B.cleanup:
 }
 
 ; CHECK-LABEL: define {{.*}}@fun.cold.1(
-; CHECK: %B.cleanup.dest.slot.0 = phi i32 [ 1, %B.then5 ], [ 0, %B.end ]
+; CHECK: %A.cleanup.dest.slot.0 = phi i32 [ 1, %A.then5 ], [ 0, %A.end ]
 ; CHECK-NEXT: unreachable
 
 ; CHECK-LABEL: define {{.*}}@fun.cold.2(
-; CHECK: %A.cleanup.dest.slot.0 = phi i32 [ 1, %A.then5 ], [ 0, %A.end ]
+; CHECK: %B.cleanup.dest.slot.0 = phi i32 [ 1, %B.then5 ], [ 0, %B.end ]
 ; CHECK-NEXT: unreachable
+
diff --git a/llvm/test/Transforms/HotColdSplit/outline-inner-region.ll b/llvm/test/Transforms/HotColdSplit/outline-inner-region.ll
new file mode 100644
index 0000000..73398bf
--- /dev/null
+++ b/llvm/test/Transforms/HotColdSplit/outline-inner-region.ll
@@ -0,0 +1,49 @@
+; RUN: opt -S -passes=hotcoldsplit -hotcoldsplit-max-params=1 < %s | FileCheck %s
+
+target datalayout = "E-m:a-p:32:32-i64:64-n32"
+target triple = "powerpc64-ibm-aix7.2.0.0"
+
+define void @foo(i32 %cond) {
+; CHECK-LABEL: define {{.*}}@foo(
+; CHECK:       if.then:
+; CHECK:       br i1 {{.*}}, label %if.then1, label %codeRepl
+; CHECK-LABEL: codeRepl:
+; CHECK-NEXT:  call void @foo.cold.1
+;
+entry:
+  %cond.addr = alloca i32
+  store i32 %cond, ptr %cond.addr
+  %0 = load i32, ptr %cond.addr
+  %tobool = icmp ne i32 %0, 0
+  br i1 %tobool, label %if.then, label %if.end2
+
+if.then:                                          ; preds = %entry
+  %1 = load i32, ptr %cond.addr
+  call void @sink(i32 %0)
+  %cmp = icmp sgt i32 %1, 10
+  br i1 %cmp, label %if.then1, label %if.else
+
+if.then1:                                         ; preds = %if.then
+  call void @sideeffect(i32 2)
+  br label %if.end
+
+if.else:                                          ; preds = %if.then
+  call void @sink(i32 0)
+  call void @sideeffect(i32 0)
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then1
+  br label %if.end2
+
+if.end2:                                          ; preds = %entry
+  call void @sideeffect(i32 1)
+  ret void
+}
+
+; CHECK-LABEL: define {{.*}}@foo.cold.1
+; CHECK:       call {{.*}}@sink
+; CHECK-NEXT:  call {{.*}}@sideeffect
+
+declare void @sideeffect(i32)
+
+declare void @sink(i32) cold
diff --git a/llvm/test/Transforms/HotColdSplit/outline-outer-region.ll b/llvm/test/Transforms/HotColdSplit/outline-outer-region.ll
new file mode 100644
index 0000000..4a3c9698
--- /dev/null
+++ b/llvm/test/Transforms/HotColdSplit/outline-outer-region.ll
@@ -0,0 +1,52 @@
+; RUN: opt -S -passes=hotcoldsplit -hotcoldsplit-threshold=2 < %s | FileCheck %s
+
+target datalayout = "E-m:a-p:32:32-i64:64-n32"
+target triple = "powerpc64-ibm-aix7.2.0.0"
+
+define void @foo(i32 %cond, i32 %s0, i32 %s1) {
+; CHECK-LABEL: define {{.*}}@foo(
+; CHECK:       br i1 {{.*}}, label %codeRepl, label %if.end2
+; CHECK-LABEL: codeRepl:
+; CHECK-NEXT: call void @foo.cold.1
+; CHECK-LABEL: if.end2:
+; CHECK: call void @sideeffect
+;
+entry:
+  %cond.addr = alloca i32
+  store i32 %cond, ptr %cond.addr
+  %0 = load i32, ptr %cond.addr
+  %tobool = icmp ne i32 %0, 0
+  br i1 %tobool, label %if.then, label %if.end2
+
+if.then:                                          ; preds = %entry
+  %1 = load i32, ptr %cond.addr
+  %cmp = icmp sgt i32 %1, 10
+  br i1 %cmp, label %if.then1, label %if.else
+
+if.then1:                                         ; preds = %if.then
+  call void @sideeffect(i32 0)
+  br label %if.end
+
+if.else:                                          ; preds = %if.then
+  call void @sink(i32 %s0)
+  call void @sideeffect(i32 1)
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then1
+  call void @sink(i32 %0)
+  ret void
+
+if.end2:                                          ; preds = %entry
+  call void @sideeffect(i32 %s1)
+  ret void
+}
+
+; CHECK-LABEL: define {{.*}}@foo.cold.1
+; CHECK: call {{.*}}@sink
+; CHECK: call {{.*}}@sideeffect
+; CHECK: call {{.*}}@sideeffect
+; CHECK: call {{.*}}@sink
+
+declare void @sideeffect(i32)
+
+declare void @sink(i32) cold
diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/multiple-uses-of-val.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/multiple-uses-of-val.ll
new file mode 100644
index 0000000..717bd09
--- /dev/null
+++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/multiple-uses-of-val.ll
@@ -0,0 +1,69 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -S -passes=infer-address-spaces --verify-each %s | FileCheck %s
+
+; Inst can use a value multiple time. When we're inserting an addrspacecast to flat,
+; it's important all the identical uses use an indentical replacement, especially
+; for PHIs.
+
+define amdgpu_kernel void @test_phi() {
+; CHECK-LABEL: @test_phi(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[LOADED_PTR:%.*]] = load ptr, ptr addrspace(4) null, align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr [[LOADED_PTR]] to ptr addrspace(1)
+; CHECK-NEXT:    br label [[BB0:%.*]]
+; CHECK:       bb0:
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i64, ptr addrspace(1) [[TMP0]], i64 3
+; CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[GEP]] to ptr
+; CHECK-NEXT:    switch i32 0, label [[END:%.*]] [
+; CHECK-NEXT:      i32 1, label [[END]]
+; CHECK-NEXT:      i32 4, label [[END]]
+; CHECK-NEXT:      i32 5, label [[BB1:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       bb1:
+; CHECK-NEXT:    [[TMP2:%.*]] = load double, ptr addrspace(1) [[GEP]], align 16
+; CHECK-NEXT:    br label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    [[RETVAL_SROA_0_0_I569_PH:%.*]] = phi ptr [ null, [[BB1]] ], [ [[TMP1]], [[BB0]] ], [ [[TMP1]], [[BB0]] ], [ [[TMP1]], [[BB0]] ]
+; CHECK-NEXT:    ret void
+;
+entry:
+  %loaded.ptr = load ptr, ptr addrspace(4) null, align 8
+  br label %bb0
+
+bb0:
+  %gep = getelementptr i64, ptr %loaded.ptr, i64 3
+  switch i32 0, label %end [
+  i32 1, label %end
+  i32 4, label %end
+  i32 5, label %bb1
+  ]
+
+bb1:
+  %0 = load double, ptr %gep, align 16
+  br label %end
+
+end:
+  %retval.sroa.0.0.i569.ph = phi ptr [ null, %bb1 ], [ %gep, %bb0 ], [ %gep, %bb0 ], [ %gep, %bb0 ]
+  ret void
+}
+
+declare void @uses_ptrs(ptr, ptr, ptr)
+
+; We shouldn't treat PHIs differently, even other users should have the same treatment.
+; All occurences of %gep are replaced with an identical value.
+define amdgpu_kernel void @test_other() {
+; CHECK-LABEL: @test_other(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[LOADED_PTR:%.*]] = load ptr, ptr addrspace(4) null, align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr [[LOADED_PTR]] to ptr addrspace(1)
+; CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i64, ptr [[TMP1]], i64 3
+; CHECK-NEXT:    call void @uses_ptrs(ptr [[GEP]], ptr [[GEP]], ptr [[GEP]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %loaded.ptr = load ptr, ptr addrspace(4) null, align 8
+  %gep = getelementptr i64, ptr %loaded.ptr, i64 3
+  call void @uses_ptrs(ptr %gep, ptr %gep, ptr %gep)
+  ret void
+}
diff --git a/llvm/test/Transforms/Inline/inline-sign-return-address.ll b/llvm/test/Transforms/Inline/inline-sign-return-address.ll
new file mode 100644
index 0000000..c4d85fa
--- /dev/null
+++ b/llvm/test/Transforms/Inline/inline-sign-return-address.ll
@@ -0,0 +1,104 @@
+; Check the inliner doesn't inline a function with different sign return address schemes.
+; RUN: opt < %s -passes=inline -S | FileCheck %s
+
+define internal void @foo_all() #0 {
+  ret void
+}
+
+define internal void @foo_nonleaf() #1 {
+  ret void
+}
+
+define internal void @foo_none() #2 {
+  ret void
+}
+
+define internal void @foo_lr() #3 {
+  ret void
+}
+
+define internal void @foo_bkey() #4 {
+  ret void
+}
+
+define dso_local void @bar_all() #0 {
+; CHECK-LABEL: bar_all
+; CHECK-NOT:     call void @foo_all()
+; CHECK-NEXT:    call void @foo_nonleaf()
+; CHECK-NEXT:    call void @foo_none()
+; CHECK-NEXT:    call void @foo_lr()
+; CHECK-NEXT:    call void @foo_bkey()
+  call void @foo_all()
+  call void @foo_nonleaf()
+  call void @foo_none()
+  call void @foo_lr()
+  call void @foo_bkey()
+  ret void
+}
+
+define dso_local void @bar_nonleaf() #1 {
+; CHECK-LABEL: bar_nonleaf
+; CHECK-NEXT:    call void @foo_all()
+; CHECK-NOT:     call void @foo_nonleaf()
+; CHECK-NEXT:    call void @foo_none()
+; CHECK-NEXT:    call void @foo_lr()
+; CHECK-NEXT:    call void @foo_bkey()
+  call void @foo_all()
+  call void @foo_nonleaf()
+  call void @foo_none()
+  call void @foo_lr()
+  call void @foo_bkey()
+  ret void
+}
+
+define dso_local void @bar_none() #2 {
+; CHECK-LABEL: bar_none
+; CHECK-NEXT:    call void @foo_all()
+; CHECK-NEXT:    call void @foo_nonleaf()
+; CHECK-NOT:     call void @foo_none()
+; CHECK-NEXT:    call void @foo_lr()
+; CHECK-NEXT:    call void @foo_bkey()
+  call void @foo_all()
+  call void @foo_nonleaf()
+  call void @foo_none()
+  call void @foo_lr()
+  call void @foo_bkey()
+  ret void
+}
+
+define dso_local void @bar_lr() #3 {
+; CHECK-LABEL: bar_lr
+; CHECK-NEXT:    call void @foo_all()
+; CHECK-NEXT:    call void @foo_nonleaf()
+; CHECK-NEXT:    call void @foo_none()
+; CHECK-NOT:     call void @foo_lr()
+; CHECK-NEXT:    call void @foo_bkey()
+  call void @foo_all()
+  call void @foo_nonleaf()
+  call void @foo_none()
+  call void @foo_lr()
+  call void @foo_bkey()
+  ret void
+}
+
+define dso_local void @bar_bkey() #4 {
+; CHECK-LABEL: bar_bkey
+; CHECK-NEXT:    call void @foo_all()
+; CHECK-NEXT:    call void @foo_nonleaf()
+; CHECK-NEXT:    call void @foo_none()
+; CHECK-NEXT:    call void @foo_lr()
+; CHECK-NOT:     call void @foo_bkey()
+  call void @foo_all()
+  call void @foo_nonleaf()
+  call void @foo_none()
+  call void @foo_lr()
+  call void @foo_bkey()
+  ret void
+}
+
+
+attributes #0 = { "branch-protection-pauth-lr"="false" "sign-return-address"="all" }
+attributes #1 = { "branch-protection-pauth-lr"="false" "sign-return-address"="non-leaf" }
+attributes #2 = { "branch-protection-pauth-lr"="false" "sign-return-address"="none" }
+attributes #3 = { "branch-protection-pauth-lr"="true" "sign-return-address"="non-leaf" }
+attributes #4 = { "branch-protection-pauth-lr"="true" "sign-return-address"="non-leaf" "sign-return-address-key"="b_key" }
+\ No newline at end of file
diff --git a/llvm/test/Transforms/InstCombine/fpextend.ll b/llvm/test/Transforms/InstCombine/fpextend.ll
index a41f2a4..19f512d 100644
--- a/llvm/test/Transforms/InstCombine/fpextend.ll
+++ b/llvm/test/Transforms/InstCombine/fpextend.ll
@@ -437,3 +437,14 @@ define half @bf16_to_f32_to_f16(bfloat %a) nounwind {
   %z = fptrunc float %y to half
   ret half %z
 }
+
+define bfloat @bf16_frem(bfloat %x) {
+; CHECK-LABEL: @bf16_frem(
+; CHECK-NEXT:    [[FREM:%.*]] = frem bfloat [[X:%.*]], 0xR40C9
+; CHECK-NEXT:    ret bfloat [[FREM]]
+;
+  %t1 = fpext bfloat %x to float
+  %t2 = frem float %t1, 6.281250e+00
+  %t3 = fptrunc float %t2 to bfloat
+  ret bfloat %t3
+}
diff --git a/llvm/test/Transforms/InstCombine/not.ll b/llvm/test/Transforms/InstCombine/not.ll
index 3b0e5b4..f277d13 100644
--- a/llvm/test/Transforms/InstCombine/not.ll
+++ b/llvm/test/Transforms/InstCombine/not.ll
@@ -769,3 +769,92 @@ entry:
   %cmp = icmp sle i32 %select, %not.c
   ret i1 %cmp
 }
+
+define i32 @test_sext(i32 %a, i32 %b){
+; CHECK-LABEL: @test_sext(
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ne i32 [[A:%.*]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = sext i1 [[TMP1]] to i32
+; CHECK-NEXT:    [[NOT:%.*]] = sub i32 [[TMP2]], [[B:%.*]]
+; CHECK-NEXT:    ret i32 [[NOT]]
+;
+  %cmp = icmp eq i32 %a, 0
+  %sext = sext i1 %cmp to i32
+  %add = add i32 %b, %sext
+  %not = xor i32 %add, -1
+  ret i32 %not
+}
+
+define <2 x i32> @test_sext_vec(<2 x i32> %a, <2 x i32> %b){
+; CHECK-LABEL: @test_sext_vec(
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ne <2 x i32> [[A:%.*]], zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i32>
+; CHECK-NEXT:    [[NOT:%.*]] = sub <2 x i32> [[TMP2]], [[B:%.*]]
+; CHECK-NEXT:    ret <2 x i32> [[NOT]]
+;
+  %cmp = icmp eq <2 x i32> %a, zeroinitializer
+  %sext = sext <2 x i1> %cmp to <2 x i32>
+  %add = add <2 x i32> %b, %sext
+  %not = xor <2 x i32> %add, <i32 -1, i32 -1>
+  ret <2 x i32> %not
+}
+
+define i64 @test_zext_nneg(i32 %c1, i64 %c2, i64 %c3){
+; CHECK-LABEL: @test_zext_nneg(
+; CHECK-NEXT:    [[DOTNEG:%.*]] = add i64 [[C2:%.*]], -4
+; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[C1:%.*]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], [[C3:%.*]]
+; CHECK-NEXT:    [[SUB:%.*]] = add i64 [[DOTNEG]], [[TMP2]]
+; CHECK-NEXT:    ret i64 [[SUB]]
+;
+  %not = xor i32 %c1, -1
+  %conv = zext nneg i32 %not to i64
+  %add1 = add i64 %c2, -5
+  %add2 = add i64 %conv, %c3
+  %sub = sub i64 %add1, %add2
+  ret i64 %sub
+}
+
+define i8 @test_trunc(i8 %a){
+; CHECK-LABEL: @test_trunc(
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ne i8 [[A:%.*]], 0
+; CHECK-NEXT:    [[NOT:%.*]] = sext i1 [[TMP1]] to i8
+; CHECK-NEXT:    ret i8 [[NOT]]
+;
+  %zext = zext i8 %a to i32
+  %sub = add nsw i32 %zext, -1
+  %shr = ashr i32 %sub, 31
+  %conv = trunc i32 %shr to i8
+  %not = xor i8 %conv, -1
+  ret i8 %not
+}
+
+define <2 x i8> @test_trunc_vec(<2 x i8> %a){
+; CHECK-LABEL: @test_trunc_vec(
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ne <2 x i8> [[A:%.*]], zeroinitializer
+; CHECK-NEXT:    [[NOT:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i8>
+; CHECK-NEXT:    ret <2 x i8> [[NOT]]
+;
+  %zext = zext <2 x i8> %a to <2 x i32>
+  %sub = add nsw <2 x i32> %zext, <i32 -1, i32 -1>
+  %shr = ashr <2 x i32> %sub, <i32 31, i32 31>
+  %conv = trunc <2 x i32> %shr to <2 x i8>
+  %not = xor <2 x i8> %conv, <i8 -1, i8 -1>
+  ret <2 x i8> %not
+}
+
+; Negative tests
+
+define i32 @test_zext(i32 %a, i32 %b){
+; CHECK-LABEL: @test_zext(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT:    [[SEXT:%.*]] = zext i1 [[CMP]] to i32
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[SEXT]], [[B:%.*]]
+; CHECK-NEXT:    [[NOT:%.*]] = xor i32 [[ADD]], -1
+; CHECK-NEXT:    ret i32 [[NOT]]
+;
+  %cmp = icmp eq i32 %a, 0
+  %sext = zext i1 %cmp to i32
+  %add = add i32 %b, %sext
+  %not = xor i32 %add, -1
+  ret i32 %not
+}
diff --git a/llvm/test/Transforms/LoopStrengthReduce/RISCV/term-fold-crash.ll b/llvm/test/Transforms/LoopStrengthReduce/RISCV/term-fold-crash.ll
new file mode 100644
index 0000000..8ca7f00
--- /dev/null
+++ b/llvm/test/Transforms/LoopStrengthReduce/RISCV/term-fold-crash.ll
@@ -0,0 +1,43 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S -passes=loop-reduce -mtriple=riscv64-unknown-linux-gnu < %s | FileCheck %s
+
+define void @test(ptr %p, i8 %arg, i32 %start) {
+; CHECK-LABEL: define void @test(
+; CHECK-SAME: ptr [[P:%.*]], i8 [[ARG:%.*]], i32 [[START:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[ARG]] to i32
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[CONV]], 1
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[START]], [[SHR]]
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[TMP0]], 1
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[ADD810:%.*]] = phi i32 [ [[START]], [[ENTRY:%.*]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[IDXPROM2:%.*]] = zext i32 [[ADD810]] to i64
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr i8, ptr [[P]], i64 [[IDXPROM2]]
+; CHECK-NEXT:    [[V:%.*]] = load i8, ptr [[ARRAYIDX3]], align 1
+; CHECK-NEXT:    [[ADD]] = add i32 [[ADD810]], 1
+; CHECK-NEXT:    [[LSR_FOLD_TERM_COND_REPLACED_TERM_COND:%.*]] = icmp eq i32 [[ADD]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[LSR_FOLD_TERM_COND_REPLACED_TERM_COND]], label [[EXIT:%.*]], label [[FOR_BODY]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %conv = zext i8 %arg to i32
+  %shr = lshr i32 %conv, 1
+  %wide.trip.count = zext nneg i32 %shr to i64
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %add810 = phi i32 [ %start, %entry ], [ %add, %for.body ]
+  %idxprom2 = zext i32 %add810 to i64
+  %arrayidx3 = getelementptr i8, ptr %p, i64 %idxprom2
+  %v = load i8, ptr %arrayidx3, align 1
+  %add = add i32 %add810, 1
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv, %wide.trip.count
+  br i1 %exitcond.not, label %exit, label %for.body
+
+exit:
+  ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll b/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll
index 44ace37..3e895edc 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -S < %s -passes=loop-vectorize -mtriple aarch64-linux-gnu -mattr=+sve 2>&1 | FileCheck %s
 
-define void @clamped_tc_8(ptr nocapture %dst, i32 %n, i64 %val){
+define void @clamped_tc_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range(1,16) {
 ; CHECK-LABEL: define void @clamped_tc_8(
 ; CHECK-SAME: ptr nocapture [[DST:%.*]], i32 [[N:%.*]], i64 [[VAL:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  entry:
@@ -18,20 +18,15 @@ define void @clamped_tc_8(ptr nocapture %dst, i32 %n, i64 %val){
 ; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 [[N_VEC]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 8
-; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 8
-; CHECK-NEXT:    [[TMP9:%.*]] = sub i64 8, [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp ugt i64 8, [[TMP8]]
-; CHECK-NEXT:    [[TMP11:%.*]] = select i1 [[TMP10]], i64 [[TMP9]], i64 0
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 8)
-; CHECK-NEXT:    [[TMP12:%.*]] = call <vscale x 8 x i64> @llvm.experimental.stepvector.nxv8i64()
-; CHECK-NEXT:    [[TMP13:%.*]] = add <vscale x 8 x i64> [[TMP12]], zeroinitializer
-; CHECK-NEXT:    [[TMP14:%.*]] = mul <vscale x 8 x i64> [[TMP13]], shufflevector (<vscale x 8 x i64> insertelement (<vscale x 8 x i64> poison, i64 1, i64 0), <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer)
-; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 8 x i64> zeroinitializer, [[TMP14]]
-; CHECK-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP15]], 8
-; CHECK-NEXT:    [[TMP17:%.*]] = mul i64 1, [[TMP16]]
-; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[TMP17]], i64 0
+; CHECK-NEXT:    [[TMP7:%.*]] = call <vscale x 8 x i64> @llvm.experimental.stepvector.nxv8i64()
+; CHECK-NEXT:    [[TMP8:%.*]] = add <vscale x 8 x i64> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = mul <vscale x 8 x i64> [[TMP8]], shufflevector (<vscale x 8 x i64> insertelement (<vscale x 8 x i64> poison, i64 1, i64 0), <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer)
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 8 x i64> zeroinitializer, [[TMP9]]
+; CHECK-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 8
+; CHECK-NEXT:    [[TMP12:%.*]] = mul i64 1, [[TMP11]]
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[TMP12]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[DOTSPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[VAL]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
@@ -40,17 +35,17 @@ define void @clamped_tc_8(ptr nocapture %dst, i32 %n, i64 %val){
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 8 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP18:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP18]]
-; CHECK-NEXT:    [[TMP19:%.*]] = shl nuw nsw <vscale x 8 x i64> [[VEC_IND]], shufflevector (<vscale x 8 x i64> insertelement (<vscale x 8 x i64> poison, i64 3, i64 0), <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP20:%.*]] = lshr <vscale x 8 x i64> [[BROADCAST_SPLAT]], [[TMP19]]
-; CHECK-NEXT:    [[TMP21:%.*]] = trunc <vscale x 8 x i64> [[TMP20]] to <vscale x 8 x i8>
-; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
-; CHECK-NEXT:    call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP21]], ptr [[TMP22]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP13]]
+; CHECK-NEXT:    [[TMP14:%.*]] = shl nuw nsw <vscale x 8 x i64> [[VEC_IND]], shufflevector (<vscale x 8 x i64> insertelement (<vscale x 8 x i64> poison, i64 3, i64 0), <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP15:%.*]] = lshr <vscale x 8 x i64> [[BROADCAST_SPLAT]], [[TMP14]]
+; CHECK-NEXT:    [[TMP16:%.*]] = trunc <vscale x 8 x i64> [[TMP15]] to <vscale x 8 x i8>
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
+; CHECK-NEXT:    call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP16]], ptr [[TMP17]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]])
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP11]])
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 8)
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 8 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; CHECK-NEXT:    [[TMP23:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP18:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
 ; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
@@ -61,8 +56,8 @@ define void @clamped_tc_8(ptr nocapture %dst, i32 %n, i64 %val){
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[P_OUT_TAIL_09:%.*]] = phi ptr [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP24:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 3
-; CHECK-NEXT:    [[SHR3:%.*]] = lshr i64 [[VAL]], [[TMP24]]
+; CHECK-NEXT:    [[TMP19:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 3
+; CHECK-NEXT:    [[SHR3:%.*]] = lshr i64 [[VAL]], [[TMP19]]
 ; CHECK-NEXT:    [[CONV4:%.*]] = trunc i64 [[SHR3]] to i8
 ; CHECK-NEXT:    store i8 [[CONV4]], ptr [[P_OUT_TAIL_09]], align 1
 ; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[P_OUT_TAIL_09]], i64 1
@@ -91,7 +86,7 @@ for.cond.cleanup:                                 ; preds = %for.body
   ret void
 }
 
-define void @clamped_tc_max_8(ptr nocapture %dst, i32 %n, i64 %val){
+define void @clamped_tc_max_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range(1,16) {
 ; CHECK-LABEL: define void @clamped_tc_max_8(
 ; CHECK-SAME: ptr nocapture [[DST:%.*]], i32 [[N:%.*]], i64 [[VAL:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
@@ -115,20 +110,15 @@ define void @clamped_tc_max_8(ptr nocapture %dst, i32 %n, i64 %val){
 ; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 [[N_VEC]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 8
-; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 8
-; CHECK-NEXT:    [[TMP9:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp ugt i64 [[WIDE_TRIP_COUNT]], [[TMP8]]
-; CHECK-NEXT:    [[TMP11:%.*]] = select i1 [[TMP10]], i64 [[TMP9]], i64 0
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
-; CHECK-NEXT:    [[TMP12:%.*]] = call <vscale x 8 x i64> @llvm.experimental.stepvector.nxv8i64()
-; CHECK-NEXT:    [[TMP13:%.*]] = add <vscale x 8 x i64> [[TMP12]], zeroinitializer
-; CHECK-NEXT:    [[TMP14:%.*]] = mul <vscale x 8 x i64> [[TMP13]], shufflevector (<vscale x 8 x i64> insertelement (<vscale x 8 x i64> poison, i64 1, i64 0), <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer)
-; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 8 x i64> zeroinitializer, [[TMP14]]
-; CHECK-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP15]], 8
-; CHECK-NEXT:    [[TMP17:%.*]] = mul i64 1, [[TMP16]]
-; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[TMP17]], i64 0
+; CHECK-NEXT:    [[TMP7:%.*]] = call <vscale x 8 x i64> @llvm.experimental.stepvector.nxv8i64()
+; CHECK-NEXT:    [[TMP8:%.*]] = add <vscale x 8 x i64> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = mul <vscale x 8 x i64> [[TMP8]], shufflevector (<vscale x 8 x i64> insertelement (<vscale x 8 x i64> poison, i64 1, i64 0), <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer)
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 8 x i64> zeroinitializer, [[TMP9]]
+; CHECK-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 8
+; CHECK-NEXT:    [[TMP12:%.*]] = mul i64 1, [[TMP11]]
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[TMP12]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[DOTSPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[VAL]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
@@ -137,17 +127,17 @@ define void @clamped_tc_max_8(ptr nocapture %dst, i32 %n, i64 %val){
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 8 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP18:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP18]]
-; CHECK-NEXT:    [[TMP19:%.*]] = shl nuw nsw <vscale x 8 x i64> [[VEC_IND]], shufflevector (<vscale x 8 x i64> insertelement (<vscale x 8 x i64> poison, i64 3, i64 0), <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP20:%.*]] = lshr <vscale x 8 x i64> [[BROADCAST_SPLAT]], [[TMP19]]
-; CHECK-NEXT:    [[TMP21:%.*]] = trunc <vscale x 8 x i64> [[TMP20]] to <vscale x 8 x i8>
-; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
-; CHECK-NEXT:    call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP21]], ptr [[TMP22]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP13]]
+; CHECK-NEXT:    [[TMP14:%.*]] = shl nuw nsw <vscale x 8 x i64> [[VEC_IND]], shufflevector (<vscale x 8 x i64> insertelement (<vscale x 8 x i64> poison, i64 3, i64 0), <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP15:%.*]] = lshr <vscale x 8 x i64> [[BROADCAST_SPLAT]], [[TMP14]]
+; CHECK-NEXT:    [[TMP16:%.*]] = trunc <vscale x 8 x i64> [[TMP15]] to <vscale x 8 x i8>
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
+; CHECK-NEXT:    call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP16]], ptr [[TMP17]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]])
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP11]])
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 8 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; CHECK-NEXT:    [[TMP23:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP18:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
 ; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -158,8 +148,8 @@ define void @clamped_tc_max_8(ptr nocapture %dst, i32 %n, i64 %val){
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[P_OUT_TAIL_09:%.*]] = phi ptr [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP24:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 3
-; CHECK-NEXT:    [[SHR3:%.*]] = lshr i64 [[VAL]], [[TMP24]]
+; CHECK-NEXT:    [[TMP19:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 3
+; CHECK-NEXT:    [[SHR3:%.*]] = lshr i64 [[VAL]], [[TMP19]]
 ; CHECK-NEXT:    [[CONV4:%.*]] = trunc i64 [[SHR3]] to i8
 ; CHECK-NEXT:    store i8 [[CONV4]], ptr [[P_OUT_TAIL_09]], align 1
 ; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[P_OUT_TAIL_09]], i64 1
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
index 0c5394c..7ccbc98 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
@@ -3,6 +3,116 @@
 
 target triple = "riscv64"
 
+define void @trip1_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture noundef readonly %src) #0 {
+; CHECK-LABEL: @trip1_i8(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I_08:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[I_08]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[MUL:%.*]] = shl i8 [[TMP0]], 1
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[I_08]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[MUL]], [[TMP1]]
+; CHECK-NEXT:    store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_08]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 1
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.08 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %arrayidx = getelementptr inbounds i8, ptr %src, i64 %i.08
+  %0 = load i8, ptr %arrayidx, align 1
+  %mul = shl i8 %0, 1
+  %arrayidx1 = getelementptr inbounds i8, ptr %dst, i64 %i.08
+  %1 = load i8, ptr %arrayidx1, align 1
+  %add = add i8 %mul, %1
+  store i8 %add, ptr %arrayidx1, align 1
+  %inc = add nuw nsw i64 %i.08, 1
+  %exitcond.not = icmp eq i64 %inc, 1
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+define void @trip3_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture noundef readonly %src) #0 {
+; CHECK-LABEL: @trip3_i8(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-NEXT:    [[TMP4:%.*]] = sub i64 [[TMP3]], 1
+; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 3, [[TMP4]]
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 16
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP7]], i64 3)
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP9]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
+; CHECK-NEXT:    [[TMP10:%.*]] = shl <vscale x 16 x i8> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 1, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP12]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
+; CHECK-NEXT:    [[TMP13:%.*]] = add <vscale x 16 x i8> [[TMP10]], [[WIDE_MASKED_LOAD1]]
+; CHECK-NEXT:    call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP13]], ptr [[TMP12]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
+; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I_08:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[I_08]]
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[MUL:%.*]] = shl i8 [[TMP14]], 1
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]]
+; CHECK-NEXT:    [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[MUL]], [[TMP15]]
+; CHECK-NEXT:    store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_08]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 3
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.08 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %arrayidx = getelementptr inbounds i8, ptr %src, i64 %i.08
+  %0 = load i8, ptr %arrayidx, align 1
+  %mul = shl i8 %0, 1
+  %arrayidx1 = getelementptr inbounds i8, ptr %dst, i64 %i.08
+  %1 = load i8, ptr %arrayidx1, align 1
+  %add = add i8 %mul, %1
+  store i8 %add, ptr %arrayidx1, align 1
+  %inc = add nuw nsw i64 %i.08, 1
+  %exitcond.not = icmp eq i64 %inc, 3
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
 define void @trip5_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture noundef readonly %src) #0 {
 ; CHECK-LABEL: @trip5_i8(
 ; CHECK-NEXT:  entry:
@@ -33,7 +143,7 @@ define void @trip5_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK-NEXT:    [[TMP13:%.*]] = add <vscale x 16 x i8> [[TMP10]], [[WIDE_MASKED_LOAD1]]
 ; CHECK-NEXT:    call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP13]], ptr [[TMP12]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
-; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -50,7 +160,7 @@ define void @trip5_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK-NEXT:    store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_08]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 5
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -74,4 +184,258 @@ for.end:                                          ; preds = %for.body
   ret void
 }
 
-attributes #0 = { "target-features"="+v,+d" }
+define void @trip8_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture noundef readonly %src) #0 {
+; CHECK-LABEL: @trip8_i8(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-NEXT:    [[TMP4:%.*]] = sub i64 [[TMP3]], 1
+; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 8, [[TMP4]]
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 8
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP7]], i64 8)
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr [[TMP9]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i8> poison)
+; CHECK-NEXT:    [[TMP10:%.*]] = shl <vscale x 8 x i8> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 8 x i8> insertelement (<vscale x 8 x i8> poison, i8 1, i64 0), <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr [[TMP12]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i8> poison)
+; CHECK-NEXT:    [[TMP13:%.*]] = add <vscale x 8 x i8> [[TMP10]], [[WIDE_MASKED_LOAD1]]
+; CHECK-NEXT:    call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP13]], ptr [[TMP12]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
+; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I_08:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[I_08]]
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[MUL:%.*]] = shl i8 [[TMP14]], 1
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]]
+; CHECK-NEXT:    [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[MUL]], [[TMP15]]
+; CHECK-NEXT:    store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_08]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 8
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.08 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %arrayidx = getelementptr inbounds i8, ptr %src, i64 %i.08
+  %0 = load i8, ptr %arrayidx, align 1
+  %mul = shl i8 %0, 1
+  %arrayidx1 = getelementptr inbounds i8, ptr %dst, i64 %i.08
+  %1 = load i8, ptr %arrayidx1, align 1
+  %add = add i8 %mul, %1
+  store i8 %add, ptr %arrayidx1, align 1
+  %inc = add nuw nsw i64 %i.08, 1
+  %exitcond.not = icmp eq i64 %inc, 8
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+define void @trip16_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture noundef readonly %src) #0 {
+; CHECK-LABEL: @trip16_i8(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = shl <16 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP6:%.*]] = add <16 x i8> [[TMP3]], [[WIDE_LOAD1]]
+; CHECK-NEXT:    store <16 x i8> [[TMP6]], ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I_08:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[I_08]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[MUL:%.*]] = shl i8 [[TMP7]], 1
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]]
+; CHECK-NEXT:    [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[MUL]], [[TMP8]]
+; CHECK-NEXT:    store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_08]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 16
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.08 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %arrayidx = getelementptr inbounds i8, ptr %src, i64 %i.08
+  %0 = load i8, ptr %arrayidx, align 1
+  %mul = shl i8 %0, 1
+  %arrayidx1 = getelementptr inbounds i8, ptr %dst, i64 %i.08
+  %1 = load i8, ptr %arrayidx1, align 1
+  %add = add i8 %mul, %1
+  store i8 %add, ptr %arrayidx1, align 1
+  %inc = add nuw nsw i64 %i.08, 1
+  %exitcond.not = icmp eq i64 %inc, 16
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+
+define void @trip32_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture noundef readonly %src) #0 {
+; CHECK-LABEL: @trip32_i8(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[TMP2]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = shl <32 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <32 x i8>, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP6:%.*]] = add <32 x i8> [[TMP3]], [[WIDE_LOAD1]]
+; CHECK-NEXT:    store <32 x i8> [[TMP6]], ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
+; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 32, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I_08:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[I_08]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[MUL:%.*]] = shl i8 [[TMP7]], 1
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]]
+; CHECK-NEXT:    [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[MUL]], [[TMP8]]
+; CHECK-NEXT:    store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_08]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 32
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.08 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %arrayidx = getelementptr inbounds i8, ptr %src, i64 %i.08
+  %0 = load i8, ptr %arrayidx, align 1
+  %mul = shl i8 %0, 1
+  %arrayidx1 = getelementptr inbounds i8, ptr %dst, i64 %i.08
+  %1 = load i8, ptr %arrayidx1, align 1
+  %add = add i8 %mul, %1
+  store i8 %add, ptr %arrayidx1, align 1
+  %inc = add nuw nsw i64 %i.08, 1
+  %exitcond.not = icmp eq i64 %inc, 32
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+define void @trip24_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture noundef readonly %src) #0 {
+; CHECK-LABEL: @trip24_i8(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = shl <8 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP6:%.*]] = add <8 x i8> [[TMP3]], [[WIDE_LOAD1]]
+; CHECK-NEXT:    store <8 x i8> [[TMP6]], ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24
+; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 24, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I_08:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[I_08]]
+; CHECK-NEXT:    [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[MUL:%.*]] = shl i8 [[TMP8]], 1
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]]
+; CHECK-NEXT:    [[TMP9:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[MUL]], [[TMP9]]
+; CHECK-NEXT:    store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_08]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 24
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.08 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %arrayidx = getelementptr inbounds i8, ptr %src, i64 %i.08
+  %0 = load i8, ptr %arrayidx, align 1
+  %mul = shl i8 %0, 1
+  %arrayidx1 = getelementptr inbounds i8, ptr %dst, i64 %i.08
+  %1 = load i8, ptr %arrayidx1, align 1
+  %add = add i8 %mul, %1
+  store i8 %add, ptr %arrayidx1, align 1
+  %inc = add nuw nsw i64 %i.08, 1
+  %exitcond.not = icmp eq i64 %inc, 24
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+attributes #0 = { "target-features"="+v,+d" vscale_range(2, 1024) }
+
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr72969.ll b/llvm/test/Transforms/LoopVectorize/X86/pr72969.ll
new file mode 100644
index 0000000..f982695
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr72969.ll
@@ -0,0 +1,26 @@
+; REQUIRES: asserts
+; RUN: not --crash opt -mtriple=x86_64 -mattr=-avx,-avx2,-avx512f,+sse,-sse2,-sse3,-sse4.2 -passes=loop-vectorize -S < %s
+; RUN: not --crash opt -mtriple=x86_64 -mattr=-avx,-avx2,-avx512f,+sse,-sse2,-sse3,-sse4.2 -passes=loop-vectorize -force-vector-width=4 -S < %s
+
+@h = global i64 0
+
+define void @test(ptr %p) {
+entry:
+  br label %for.body
+
+for.body:
+  %idx.ext.merge = phi i64 [ 1, %entry ], [ %idx, %for.body ]
+  %inc.merge = phi i16 [ 1, %entry ], [ %inc, %for.body ]
+  %idx.merge = phi i64 [ 0, %entry ], [ %idx.ext.merge, %for.body ]
+  %add = shl i64 %idx.merge, 1
+  %arrayidx = getelementptr i64, ptr %p, i64 %add
+  store i64 0, ptr %arrayidx
+  %inc = add i16 %inc.merge, 1
+  %idx = zext i16 %inc to i64
+  %gep = getelementptr i64, ptr %p, i64 %idx
+  %cmp = icmp ugt ptr %gep, @h
+  br i1 %cmp, label %exit, label %for.body
+
+exit:
+  ret void
+}
diff --git a/llvm/test/Transforms/OpenMP/deduplication_soundness.ll b/llvm/test/Transforms/OpenMP/deduplication_soundness.ll
new file mode 100644
index 0000000..9dd3219
--- /dev/null
+++ b/llvm/test/Transforms/OpenMP/deduplication_soundness.ll
@@ -0,0 +1,59 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function main --scrub-attributes --filter "@omp_get_thread_limit|@use" --version 4
+; RUN: opt -passes=openmp-opt-cgscc -S < %s | FileCheck %s
+
+declare void @use(i32 noundef)
+declare i32 @omp_get_thread_limit()
+declare void @__kmpc_set_thread_limit(ptr, i32, i32)
+declare i32 @__kmpc_global_thread_num(ptr)
+declare noalias ptr @__kmpc_omp_task_alloc(ptr, i32, i32, i64, i64, ptr)
+declare void @__kmpc_omp_task_complete_if0(ptr, i32, ptr)
+declare void @__kmpc_omp_task_begin_if0(ptr, i32, ptr)
+
+%struct.ident_t = type { i32, i32, i32, i32, ptr }
+
+@0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
+@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @0 }, align 8
+
+define i32 @main() local_unnamed_addr {
+; CHECK-LABEL: define i32 @main() local_unnamed_addr {
+; CHECK:    [[CALL_I_I_I:%.*]] = call i32 @omp_get_thread_limit()
+; CHECK:    call void @use(i32 noundef [[CALL_I_I_I]])
+; CHECK:    [[CALL_I_I_I2:%.*]] = call i32 @omp_get_thread_limit()
+; CHECK:    call void @use(i32 noundef [[CALL_I_I_I2]])
+;
+entry:
+  %0 = call i32 @__kmpc_global_thread_num(ptr nonnull @1)
+  %1 = call ptr @__kmpc_omp_task_alloc(ptr nonnull @1, i32 %0, i32 1, i64 40, i64 0, ptr nonnull @.omp_task_entry.)
+  call void @__kmpc_omp_task_begin_if0(ptr nonnull @1, i32 %0, ptr %1)
+  call void @__kmpc_set_thread_limit(ptr nonnull @1, i32 %0, i32 4)
+  %call.i.i.i = call i32 @omp_get_thread_limit()
+  call void @use(i32 noundef %call.i.i.i)
+  call void @__kmpc_omp_task_complete_if0(ptr nonnull @1, i32 %0, ptr %1)
+  %2 = call ptr @__kmpc_omp_task_alloc(ptr nonnull @1, i32 %0, i32 1, i64 40, i64 0, ptr nonnull @.omp_task_entry..2)
+  call void @__kmpc_omp_task_begin_if0(ptr nonnull @1, i32 %0, ptr %2)
+  call void @__kmpc_set_thread_limit(ptr nonnull @1, i32 %0, i32 3)
+  %call.i.i.i2 = call i32 @omp_get_thread_limit()
+  call void @use(i32 noundef %call.i.i.i2)
+  call void @__kmpc_omp_task_complete_if0(ptr nonnull @1, i32 %0, ptr %2)
+  ret i32 0
+}
+
+define internal noundef i32 @.omp_task_entry.(i32 noundef %0, ptr noalias nocapture noundef readonly %1) {
+entry:
+  tail call void @__kmpc_set_thread_limit(ptr nonnull @1, i32 %0, i32 4)
+  %call.i.i = tail call i32 @omp_get_thread_limit()
+  tail call void @use(i32 noundef %call.i.i)
+  ret i32 0
+}
+
+define internal noundef i32 @.omp_task_entry..2(i32 noundef %0, ptr noalias nocapture noundef readonly %1) {
+entry:
+  tail call void @__kmpc_set_thread_limit(ptr nonnull @1, i32 %0, i32 3)
+  %call.i.i = tail call i32 @omp_get_thread_limit()
+  tail call void @use(i32 noundef %call.i.i)
+  ret i32 0
+}
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 7, !"openmp", i32 51}
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll
index 8f76b2e..44542f3 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll
@@ -76,10 +76,10 @@ define void @dist_vec(ptr nocapture noundef readonly %pA, ptr nocapture noundef
 ; CHECK-NEXT:    [[PB_ADDR_0_LCSSA:%.*]] = phi ptr [ [[PB]], [[ENTRY]] ], [ [[SCEVGEP311]], [[WHILE_END_LOOPEXIT]] ]
 ; CHECK-NEXT:    [[PA_ADDR_0_LCSSA:%.*]] = phi ptr [ [[PA]], [[ENTRY]] ], [ [[SCEVGEP]], [[WHILE_END_LOOPEXIT]] ]
 ; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP4FT_0_LCSSA]], <2 x i64> [[TMP4TF_0_LCSSA]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <2 x i64> [[TMP4TT_0_LCSSA]], <2 x i64> [[TMP4FF_0_LCSSA]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <2 x i64> [[TMP4FF_0_LCSSA]], <2 x i64> [[TMP4TT_0_LCSSA]], <2 x i32> <i32 0, i32 2>
 ; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x i64> [[TMP4FT_0_LCSSA]], <2 x i64> [[TMP4TF_0_LCSSA]], <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <2 x i64> [[TMP4TT_0_LCSSA]], <2 x i64> [[TMP4FF_0_LCSSA]], <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <2 x i64> [[TMP4FF_0_LCSSA]], <2 x i64> [[TMP4TT_0_LCSSA]], <2 x i32> <i32 1, i32 3>
 ; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[TMP16:%.*]] = add <4 x i64> [[TMP12]], [[TMP15]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = trunc <4 x i64> [[TMP16]] to <4 x i32>
@@ -107,12 +107,12 @@ define void @dist_vec(ptr nocapture noundef readonly %pA, ptr nocapture noundef
 ; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <2 x i32> [[TMP22]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP24:%.*]] = icmp eq <2 x i32> [[TMP23]], zeroinitializer
 ; CHECK-NEXT:    [[TMP25:%.*]] = icmp ne <2 x i32> [[TMP23]], zeroinitializer
-; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <2 x i1> [[TMP24]], <2 x i1> [[TMP25]], <4 x i32> <i32 0, i32 3, i32 3, i32 0>
+; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <2 x i1> [[TMP24]], <2 x i1> [[TMP25]], <4 x i32> <i32 0, i32 3, i32 0, i32 3>
 ; CHECK-NEXT:    [[TMP27:%.*]] = insertelement <2 x i32> poison, i32 [[AND95]], i32 0
 ; CHECK-NEXT:    [[TMP28:%.*]] = shufflevector <2 x i32> [[TMP27]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP29:%.*]] = icmp ne <2 x i32> [[TMP28]], zeroinitializer
 ; CHECK-NEXT:    [[TMP30:%.*]] = icmp eq <2 x i32> [[TMP28]], zeroinitializer
-; CHECK-NEXT:    [[TMP31:%.*]] = shufflevector <2 x i1> [[TMP29]], <2 x i1> [[TMP30]], <4 x i32> <i32 0, i32 3, i32 0, i32 3>
+; CHECK-NEXT:    [[TMP31:%.*]] = shufflevector <2 x i1> [[TMP29]], <2 x i1> [[TMP30]], <4 x i32> <i32 0, i32 3, i32 3, i32 0>
 ; CHECK-NEXT:    [[TMP32:%.*]] = select <4 x i1> [[TMP26]], <4 x i1> [[TMP31]], <4 x i1> zeroinitializer
 ; CHECK-NEXT:    [[TMP33:%.*]] = zext <4 x i1> [[TMP32]] to <4 x i32>
 ; CHECK-NEXT:    [[TMP34]] = add <4 x i32> [[TMP21]], [[TMP33]]
@@ -152,12 +152,12 @@ define void @dist_vec(ptr nocapture noundef readonly %pA, ptr nocapture noundef
 ; CHECK-NEXT:    [[TMP40:%.*]] = shufflevector <2 x i32> [[TMP39]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP41:%.*]] = icmp eq <2 x i32> [[TMP40]], zeroinitializer
 ; CHECK-NEXT:    [[TMP42:%.*]] = icmp ne <2 x i32> [[TMP40]], zeroinitializer
-; CHECK-NEXT:    [[TMP43:%.*]] = shufflevector <2 x i1> [[TMP41]], <2 x i1> [[TMP42]], <4 x i32> <i32 0, i32 3, i32 3, i32 0>
+; CHECK-NEXT:    [[TMP43:%.*]] = shufflevector <2 x i1> [[TMP41]], <2 x i1> [[TMP42]], <4 x i32> <i32 0, i32 3, i32 0, i32 3>
 ; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <2 x i32> poison, i32 [[AND134]], i32 0
 ; CHECK-NEXT:    [[TMP45:%.*]] = shufflevector <2 x i32> [[TMP44]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP46:%.*]] = icmp ne <2 x i32> [[TMP45]], zeroinitializer
 ; CHECK-NEXT:    [[TMP47:%.*]] = icmp eq <2 x i32> [[TMP45]], zeroinitializer
-; CHECK-NEXT:    [[TMP48:%.*]] = shufflevector <2 x i1> [[TMP46]], <2 x i1> [[TMP47]], <4 x i32> <i32 0, i32 3, i32 0, i32 3>
+; CHECK-NEXT:    [[TMP48:%.*]] = shufflevector <2 x i1> [[TMP46]], <2 x i1> [[TMP47]], <4 x i32> <i32 0, i32 3, i32 3, i32 0>
 ; CHECK-NEXT:    [[TMP49:%.*]] = select <4 x i1> [[TMP43]], <4 x i1> [[TMP48]], <4 x i1> zeroinitializer
 ; CHECK-NEXT:    [[TMP50:%.*]] = zext <4 x i1> [[TMP49]] to <4 x i32>
 ; CHECK-NEXT:    [[TMP51]] = add <4 x i32> [[TMP38]], [[TMP50]]
@@ -166,9 +166,9 @@ define void @dist_vec(ptr nocapture noundef readonly %pA, ptr nocapture noundef
 ; CHECK-NEXT:    br i1 [[CMP130_NOT]], label [[WHILE_END166]], label [[WHILE_BODY132]]
 ; CHECK:       while.end166:
 ; CHECK-NEXT:    [[TMP52:%.*]] = phi <4 x i32> [ [[TMP35]], [[WHILE_END122]] ], [ [[TMP51]], [[WHILE_BODY132]] ]
-; CHECK-NEXT:    [[TMP53:%.*]] = extractelement <4 x i32> [[TMP52]], i32 2
+; CHECK-NEXT:    [[TMP53:%.*]] = extractelement <4 x i32> [[TMP52]], i32 3
 ; CHECK-NEXT:    store i32 [[TMP53]], ptr [[CTT:%.*]], align 4
-; CHECK-NEXT:    [[TMP54:%.*]] = extractelement <4 x i32> [[TMP52]], i32 3
+; CHECK-NEXT:    [[TMP54:%.*]] = extractelement <4 x i32> [[TMP52]], i32 2
 ; CHECK-NEXT:    store i32 [[TMP54]], ptr [[CFF:%.*]], align 4
 ; CHECK-NEXT:    [[TMP55:%.*]] = extractelement <4 x i32> [[TMP52]], i32 1
 ; CHECK-NEXT:    store i32 [[TMP55]], ptr [[CTF:%.*]], align 4
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/reorder-fmuladd-crash.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/reorder-fmuladd-crash.ll
index 0a68996..dc05967 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/reorder-fmuladd-crash.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/reorder-fmuladd-crash.ll
@@ -6,7 +6,7 @@ define i32 @foo(i32 %v1, double %v2) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> <i32 poison, i32 undef>, i32 [[V1:%.*]], i32 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <2 x i32> [[TMP0]] to <2 x double>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
 ; CHECK-NEXT:    br label [[FOR_COND15_PREHEADER:%.*]]
 ; CHECK:       for.cond15.preheader:
 ; CHECK-NEXT:    br label [[IF_END:%.*]]
@@ -26,14 +26,15 @@ define i32 @foo(i32 %v1, double %v2) {
 ; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
 ; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x double>, ptr [[ARRAYIDX43]], align 8
 ; CHECK-NEXT:    [[TMP7:%.*]] = fmul <4 x double> [[TMP6]], [[TMP5]]
-; CHECK-NEXT:    [[TMP8:%.*]] = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> undef, <4 x double> [[TMP2]], <4 x double> [[TMP7]])
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> poison, <4 x i32> <i32 3, i32 1, i32 2, i32 0>
+; CHECK-NEXT:    [[TMP9:%.*]] = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> undef, <4 x double> [[TMP2]], <4 x double> [[TMP8]])
 ; CHECK-NEXT:    br label [[SW_EPILOG:%.*]]
 ; CHECK:       sw.bb195:
 ; CHECK-NEXT:    br label [[SW_EPILOG]]
 ; CHECK:       do.body:
 ; CHECK-NEXT:    unreachable
 ; CHECK:       sw.epilog:
-; CHECK-NEXT:    [[TMP9:%.*]] = phi <4 x double> [ undef, [[SW_BB195]] ], [ [[TMP8]], [[SW_BB]] ]
+; CHECK-NEXT:    [[TMP10:%.*]] = phi <4 x double> [ undef, [[SW_BB195]] ], [ [[TMP9]], [[SW_BB]] ]
 ; CHECK-NEXT:    ret i32 undef
 ; CHECK:       if.end.1:
 ; CHECK-NEXT:    br label [[FOR_COND15_1:%.*]]
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll
index 28af0de..95aa40f 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll
@@ -20,17 +20,17 @@ define void @s116_modified(ptr %a) {
 ; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 1
 ; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr inbounds float, ptr [[A]], i64 3
 ; CHECK-NEXT:    [[LD0:%.*]] = load float, ptr [[A]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr [[GEP1]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x float>, ptr [[GEP3]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x float> poison, float [[LD0]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 poison>
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 5, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> [[TMP8]], <4 x i32> <i32 0, i32 poison, i32 2, i32 4>
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x float> [[TMP10]], <4 x float> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP12:%.*]] = fmul fast <4 x float> [[TMP9]], [[TMP11]]
-; CHECK-NEXT:    store <4 x float> [[TMP12]], ptr [[A]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[GEP1]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr [[GEP3]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> poison, float [[LD0]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 poison>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <4 x i32> <i32 0, i32 5, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 poison, i32 2, i32 4>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP10:%.*]] = fmul fast <4 x float> [[TMP7]], [[TMP9]]
+; CHECK-NEXT:    store <4 x float> [[TMP10]], ptr [[A]], align 4
 ; CHECK-NEXT:    ret void
 ;
   %gep1 = getelementptr inbounds float, ptr %a, i64 1
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll
index 5707e14..89ea15d 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll
@@ -143,16 +143,17 @@ define void @gather_2(ptr %mat1, float %0, float %1) {
 ; CHECK-SAME: ptr [[MAT1:%.*]], float [[TMP0:%.*]], float [[TMP1:%.*]]) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float 0.000000e+00, float 0.000000e+00)
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[TMP0]], i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP6:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x float> zeroinitializer)
-; CHECK-NEXT:    [[TMP7:%.*]] = fmul float [[TMP2]], 0.000000e+00
+; CHECK-NEXT:    [[TMP3:%.*]] = call float @llvm.fmuladd.f32(float [[TMP1]], float [[TMP0]], float 0.000000e+00)
+; CHECK-NEXT:    [[TMP4:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float 0.000000e+00)
+; CHECK-NEXT:    [[TMP5:%.*]] = fmul float [[TMP2]], 0.000000e+00
+; CHECK-NEXT:    [[TMP6:%.*]] = fmul float [[TMP3]], 0.000000e+00
+; CHECK-NEXT:    [[TMP7:%.*]] = fmul float [[TMP4]], 0.000000e+00
 ; CHECK-NEXT:    [[ARRAYIDX163:%.*]] = getelementptr [4 x [4 x float]], ptr [[MAT1]], i64 0, i64 1
 ; CHECK-NEXT:    [[ARRAYIDX2_I_I_I278:%.*]] = getelementptr [4 x [4 x float]], ptr [[MAT1]], i64 0, i64 1, i64 1
-; CHECK-NEXT:    store float [[TMP7]], ptr [[ARRAYIDX163]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = fmul <2 x float> [[TMP6]], zeroinitializer
-; CHECK-NEXT:    store <2 x float> [[TMP8]], ptr [[ARRAYIDX2_I_I_I278]], align 4
+; CHECK-NEXT:    [[ARRAYIDX5_I_I_I280:%.*]] = getelementptr [4 x [4 x float]], ptr [[MAT1]], i64 0, i64 1, i64 2
+; CHECK-NEXT:    store float [[TMP5]], ptr [[ARRAYIDX163]], align 4
+; CHECK-NEXT:    store float [[TMP6]], ptr [[ARRAYIDX2_I_I_I278]], align 4
+; CHECK-NEXT:    store float [[TMP7]], ptr [[ARRAYIDX5_I_I_I280]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -183,19 +184,18 @@ define i32 @reorder_indices_1(float %0) {
 ; CHECK-NEXT:    [[TMP4:%.*]] = fneg float [[TMP3]]
 ; CHECK-NEXT:    [[NEG11_I:%.*]] = fmul float [[TMP4]], [[TMP0]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = call float @llvm.fmuladd.f32(float [[TMP1]], float 0.000000e+00, float [[NEG11_I]])
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP2]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> <i32 1, i32 poison>
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[TMP1]], i32 1
 ; CHECK-NEXT:    [[TMP8:%.*]] = fneg <2 x float> [[TMP7]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0
 ; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP11:%.*]] = fmul <2 x float> [[TMP8]], [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP13:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP2]], <2 x float> zeroinitializer, <2 x float> [[TMP12]])
-; CHECK-NEXT:    [[TMP14:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP10]], <2 x float> [[TMP13]], <2 x float> zeroinitializer)
-; CHECK-NEXT:    [[TMP15:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP5]], float 0.000000e+00)
-; CHECK-NEXT:    [[TMP16:%.*]] = fmul <2 x float> [[TMP14]], zeroinitializer
-; CHECK-NEXT:    [[MUL6_I_I_I:%.*]] = fmul float [[TMP15]], 0.000000e+00
-; CHECK-NEXT:    store <2 x float> [[TMP16]], ptr [[NOR1]], align 4
+; CHECK-NEXT:    [[TMP12:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP2]], <2 x float> zeroinitializer, <2 x float> [[TMP11]])
+; CHECK-NEXT:    [[TMP13:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP10]], <2 x float> [[TMP12]], <2 x float> zeroinitializer)
+; CHECK-NEXT:    [[TMP14:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP5]], float 0.000000e+00)
+; CHECK-NEXT:    [[TMP15:%.*]] = fmul <2 x float> [[TMP13]], zeroinitializer
+; CHECK-NEXT:    [[MUL6_I_I_I:%.*]] = fmul float [[TMP14]], 0.000000e+00
+; CHECK-NEXT:    store <2 x float> [[TMP15]], ptr [[NOR1]], align 4
 ; CHECK-NEXT:    store float [[MUL6_I_I_I]], ptr [[ARRAYIDX2_I265]], align 4
 ; CHECK-NEXT:    ret i32 0
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll
index 9c7e8f6..cb24a9c 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll
@@ -68,10 +68,10 @@ define void @pr35497() local_unnamed_addr #0 {
 ; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> poison, <2 x i32> <i32 1, i32 0>
 ; SSE-NEXT:    [[TMP5:%.*]] = add nuw nsw <2 x i64> [[TMP4]], zeroinitializer
 ; SSE-NEXT:    store <2 x i64> [[TMP5]], ptr undef, align 1
-; SSE-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP5]], i64 [[ADD]], i32 0
-; SSE-NEXT:    [[TMP7:%.*]] = shl <2 x i64> [[TMP6]], <i64 2, i64 2>
-; SSE-NEXT:    [[TMP8:%.*]] = and <2 x i64> [[TMP7]], <i64 20, i64 20>
-; SSE-NEXT:    [[TMP9:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> poison, <2 x i32> <i32 1, i32 0>
+; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> poison, <2 x i32> <i32 1, i32 poison>
+; SSE-NEXT:    [[TMP7:%.*]] = insertelement <2 x i64> [[TMP6]], i64 [[ADD]], i32 1
+; SSE-NEXT:    [[TMP8:%.*]] = shl <2 x i64> [[TMP7]], <i64 2, i64 2>
+; SSE-NEXT:    [[TMP9:%.*]] = and <2 x i64> [[TMP8]], <i64 20, i64 20>
 ; SSE-NEXT:    [[TMP10:%.*]] = lshr <2 x i64> [[TMP5]], <i64 6, i64 6>
 ; SSE-NEXT:    [[TMP11:%.*]] = add nuw nsw <2 x i64> [[TMP9]], [[TMP10]]
 ; SSE-NEXT:    store <2 x i64> [[TMP11]], ptr [[ARRAYIDX2_2]], align 1
@@ -88,10 +88,10 @@ define void @pr35497() local_unnamed_addr #0 {
 ; AVX-NEXT:    [[TMP3:%.*]] = and <2 x i64> [[TMP2]], <i64 20, i64 20>
 ; AVX-NEXT:    [[TMP4:%.*]] = add nuw nsw <2 x i64> [[TMP3]], zeroinitializer
 ; AVX-NEXT:    store <2 x i64> [[TMP4]], ptr undef, align 1
-; AVX-NEXT:    [[TMP5:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[ADD]], i32 0
-; AVX-NEXT:    [[TMP6:%.*]] = shl <2 x i64> [[TMP5]], <i64 2, i64 2>
-; AVX-NEXT:    [[TMP7:%.*]] = and <2 x i64> [[TMP6]], <i64 20, i64 20>
-; AVX-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP7]], <2 x i64> poison, <2 x i32> <i32 1, i32 0>
+; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> poison, <2 x i32> <i32 1, i32 poison>
+; AVX-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP5]], i64 [[ADD]], i32 1
+; AVX-NEXT:    [[TMP7:%.*]] = shl <2 x i64> [[TMP6]], <i64 2, i64 2>
+; AVX-NEXT:    [[TMP8:%.*]] = and <2 x i64> [[TMP7]], <i64 20, i64 20>
 ; AVX-NEXT:    [[TMP9:%.*]] = lshr <2 x i64> [[TMP4]], <i64 6, i64 6>
 ; AVX-NEXT:    [[TMP10:%.*]] = add nuw nsw <2 x i64> [[TMP8]], [[TMP9]]
 ; AVX-NEXT:    store <2 x i64> [[TMP10]], ptr [[ARRAYIDX2_2]], align 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll
index c051d90..ec90ca9 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll
@@ -18,9 +18,9 @@
 define i32 @reduce_and4(i32 %acc, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3, <4 x i32> %v4) {
 ; SSE2-LABEL: @reduce_and4(
 ; SSE2-NEXT:  entry:
-; SSE2-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
+; SSE2-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SSE2-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP0]])
-; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SSE2-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP2]])
 ; SSE2-NEXT:    [[OP_RDX:%.*]] = and i32 [[TMP1]], [[TMP3]]
 ; SSE2-NEXT:    [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]]
@@ -28,9 +28,9 @@ define i32 @reduce_and4(i32 %acc, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3, <
 ;
 ; SSE42-LABEL: @reduce_and4(
 ; SSE42-NEXT:  entry:
-; SSE42-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
+; SSE42-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SSE42-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP0]])
-; SSE42-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
+; SSE42-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SSE42-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP2]])
 ; SSE42-NEXT:    [[OP_RDX:%.*]] = and i32 [[TMP1]], [[TMP3]]
 ; SSE42-NEXT:    [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]]
@@ -92,18 +92,18 @@ entry:
 
 define i32 @reduce_and4_transpose(i32 %acc, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3, <4 x i32> %v4) {
 ; SSE2-LABEL: @reduce_and4_transpose(
-; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SSE2-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP1]])
-; SSE2-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+; SSE2-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SSE2-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP3]])
 ; SSE2-NEXT:    [[OP_RDX:%.*]] = and i32 [[TMP2]], [[TMP4]]
 ; SSE2-NEXT:    [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]]
 ; SSE2-NEXT:    ret i32 [[OP_RDX1]]
 ;
 ; SSE42-LABEL: @reduce_and4_transpose(
-; SSE42-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+; SSE42-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SSE42-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP1]])
-; SSE42-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+; SSE42-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SSE42-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP3]])
 ; SSE42-NEXT:    [[OP_RDX:%.*]] = and i32 [[TMP2]], [[TMP4]]
 ; SSE42-NEXT:    [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-clustered-node.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-clustered-node.ll
index b553346..1a6ff23 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-clustered-node.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-clustered-node.ll
@@ -17,13 +17,12 @@ define i1 @test(ptr %arg, ptr %i233, i64 %i241, ptr %i235, ptr %i237, ptr %i227)
 ; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <4 x i32> <i32 2, i32 0, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x ptr> [[TMP5]], ptr [[I245]], i32 2
 ; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x ptr> [[TMP6]], ptr [[I248]], i32 3
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x ptr> [[TMP7]], <4 x ptr> poison, <8 x i32> <i32 2, i32 0, i32 1, i32 3, i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <8 x ptr> [[TMP1]], <8 x ptr> <ptr poison, ptr poison, ptr null, ptr null, ptr null, ptr null, ptr null, ptr null>, <8 x i32> <i32 0, i32 1, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x ptr> [[TMP7]], <4 x ptr> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <8 x ptr> [[TMP1]], <8 x ptr> <ptr poison, ptr null, ptr poison, ptr null, ptr null, ptr null, ptr null, ptr null>, <8 x i32> <i32 1, i32 9, i32 0, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp ult <8 x ptr> [[TMP8]], [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <8 x i1> [[TMP10]], <8 x i1> poison, <8 x i32> <i32 1, i32 2, i32 0, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP12:%.*]] = or <8 x i1> [[TMP4]], [[TMP11]]
-; CHECK-NEXT:    [[TMP13:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP12]])
-; CHECK-NEXT:    [[OP_RDX:%.*]] = and i1 [[TMP13]], false
+; CHECK-NEXT:    [[TMP11:%.*]] = or <8 x i1> [[TMP4]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP11]])
+; CHECK-NEXT:    [[OP_RDX:%.*]] = and i1 [[TMP12]], false
 ; CHECK-NEXT:    ret i1 [[OP_RDX]]
 ;
 bb:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll
index f65f619..cd7ad21 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll
@@ -8,12 +8,11 @@ define void @test(ptr noalias %0, ptr %p) {
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr float, <8 x ptr> [[TMP3]], <8 x i64> <i64 15, i64 4, i64 5, i64 0, i64 2, i64 6, i64 7, i64 8>
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP0:%.*]], i64 2
 ; CHECK-NEXT:    [[TMP6:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP4]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> poison)
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <8 x float> [[TMP6]], <8 x float> poison, <16 x i32> <i32 4, i32 3, i32 0, i32 1, i32 2, i32 0, i32 1, i32 2, i32 0, i32 2, i32 5, i32 6, i32 7, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <8 x float> [[TMP6]], <8 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 0, i32 1, i32 2, i32 0, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <8 x float> [[TMP6]], <8 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <16 x float> [[TMP8]], <16 x float> <float poison, float poison, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <16 x float> [[TMP8]], <16 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison, float 0.000000e+00, float poison, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 1, i32 24, i32 0, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; CHECK-NEXT:    [[TMP10:%.*]] = fadd reassoc nsz arcp contract afn <16 x float> [[TMP7]], [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <16 x float> [[TMP10]], <16 x float> poison, <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 9, i32 0, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    store <16 x float> [[TMP11]], ptr [[TMP5]], align 4
+; CHECK-NEXT:    store <16 x float> [[TMP10]], ptr [[TMP5]], align 4
 ; CHECK-NEXT:    ret void
 ;
   %2 = getelementptr inbounds float, ptr %p, i64 2
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-vf-to-resize.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-vf-to-resize.ll
index af606fc..d3c9784 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-vf-to-resize.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-vf-to-resize.ll
@@ -6,7 +6,7 @@ define void @main(ptr %0) {
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[TMP0:%.*]], align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = fsub <2 x double> zeroinitializer, [[TMP2]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x double> zeroinitializer, [[TMP2]]
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP4]], <4 x i32> <i32 1, i32 2, i32 1, i32 2>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP4]], <4 x i32> <i32 1, i32 2, i32 2, i32 1>
 ; CHECK-NEXT:    [[TMP6:%.*]] = fmul <4 x double> [[TMP5]], zeroinitializer
 ; CHECK-NEXT:    [[TMP7:%.*]] = call <4 x double> @llvm.fabs.v4f64(<4 x double> [[TMP6]])
 ; CHECK-NEXT:    [[TMP8:%.*]] = fcmp oeq <4 x double> [[TMP7]], zeroinitializer
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll
index c79e9b9..fb2b653 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll
@@ -12,10 +12,10 @@ define void @test() {
 ; CHECK-NEXT:    [[TMP1:%.*]] = fsub <2 x float> zeroinitializer, [[TMP0]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[ARRAYIDX10_I_I86]], align 4
 ; CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr undef, align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x float> [[TMP0]], <2 x float> <float 0.000000e+00, float poison>, <2 x i32> <i32 2, i32 1>
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x float> poison, float [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[TMP2]], i32 1
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> <float poison, float 0.000000e+00>, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x float> [[TMP0]], <2 x float> <float poison, float 0.000000e+00>, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> <float 0.000000e+00, float poison>, <2 x i32> <i32 2, i32 0>
 ; CHECK-NEXT:    [[TMP8:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP6]], <2 x float> [[TMP7]])
 ; CHECK-NEXT:    br i1 false, label [[BB2:%.*]], label [[BB3:%.*]]
 ; CHECK:       bb2:
@@ -23,12 +23,11 @@ define void @test() {
 ; CHECK-NEXT:    br label [[BB3]]
 ; CHECK:       bb3:
 ; CHECK-NEXT:    [[TMP10:%.*]] = phi <2 x float> [ [[TMP9]], [[BB2]] ], [ zeroinitializer, [[BB1]] ]
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP12:%.*]] = fadd <2 x float> [[TMP1]], [[TMP11]]
-; CHECK-NEXT:    [[TMP13:%.*]] = fadd <2 x float> [[TMP12]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = fadd <2 x float> [[TMP1]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = fadd <2 x float> [[TMP11]], zeroinitializer
+; CHECK-NEXT:    [[TMP13:%.*]] = fsub <2 x float> [[TMP12]], zeroinitializer
 ; CHECK-NEXT:    [[TMP14:%.*]] = fsub <2 x float> [[TMP13]], zeroinitializer
-; CHECK-NEXT:    [[TMP15:%.*]] = fsub <2 x float> [[TMP14]], zeroinitializer
-; CHECK-NEXT:    store <2 x float> [[TMP15]], ptr [[ARRAYIDX21_I]], align 16
+; CHECK-NEXT:    store <2 x float> [[TMP14]], ptr [[ARRAYIDX21_I]], align 16
 ; CHECK-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder2.ll b/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder2.ll
index 8d1d257..9e3ba05 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder2.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder2.ll
@@ -9,10 +9,10 @@ define void @foo(ptr %this, ptr %p, i32 %add7) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> <i32 poison, i32 undef>, i32 [[ADD7:%.*]], i32 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = sdiv <2 x i32> [[TMP0]], <i32 2, i32 2>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> <i32 1, i32 1, i32 0, i32 0>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
 ; CHECK-NEXT:    switch i32 undef, label [[SW_EPILOG:%.*]] [
-; CHECK-NEXT:    i32 0, label [[SW_BB:%.*]]
-; CHECK-NEXT:    i32 2, label [[SW_BB]]
+; CHECK-NEXT:      i32 0, label [[SW_BB:%.*]]
+; CHECK-NEXT:      i32 2, label [[SW_BB]]
 ; CHECK-NEXT:    ]
 ; CHECK:       sw.bb:
 ; CHECK-NEXT:    [[TMP3:%.*]] = xor <2 x i32> [[TMP1]], <i32 -1, i32 -1>
@@ -21,10 +21,11 @@ define void @foo(ptr %this, ptr %p, i32 %add7) {
 ; CHECK-NEXT:    br label [[SW_EPILOG]]
 ; CHECK:       sw.epilog:
 ; CHECK-NEXT:    [[TMP6:%.*]] = phi <2 x i32> [ undef, [[ENTRY:%.*]] ], [ [[TMP5]], [[SW_BB]] ]
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <4 x i32> <i32 1, i32 1, i32 0, i32 0>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP8:%.*]] = sub <4 x i32> undef, [[TMP2]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = add <4 x i32> [[TMP8]], [[TMP7]]
-; CHECK-NEXT:    store <4 x i32> [[TMP9]], ptr [[P:%.*]], align 4
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP9]], <4 x i32> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NEXT:    store <4 x i32> [[TMP10]], ptr [[P:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll
index 9584a66..46cca9b 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll
@@ -182,19 +182,18 @@ define i32 @reorder_indices_1(float %0) {
 ; CHECK-NEXT:    [[TMP4:%.*]] = fneg float [[TMP3]]
 ; CHECK-NEXT:    [[NEG11_I:%.*]] = fmul float [[TMP4]], [[TMP0]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = call float @llvm.fmuladd.f32(float [[TMP1]], float 0.000000e+00, float [[NEG11_I]])
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP2]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> <i32 1, i32 poison>
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[TMP1]], i32 1
 ; CHECK-NEXT:    [[TMP8:%.*]] = fneg <2 x float> [[TMP7]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0
 ; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP11:%.*]] = fmul <2 x float> [[TMP8]], [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP13:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP2]], <2 x float> zeroinitializer, <2 x float> [[TMP12]])
-; CHECK-NEXT:    [[TMP14:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP10]], <2 x float> [[TMP13]], <2 x float> zeroinitializer)
-; CHECK-NEXT:    [[TMP15:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP5]], float 0.000000e+00)
-; CHECK-NEXT:    [[TMP16:%.*]] = fmul <2 x float> [[TMP14]], zeroinitializer
-; CHECK-NEXT:    [[MUL6_I_I_I:%.*]] = fmul float [[TMP15]], 0.000000e+00
-; CHECK-NEXT:    store <2 x float> [[TMP16]], ptr [[NOR1]], align 4
+; CHECK-NEXT:    [[TMP12:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP2]], <2 x float> zeroinitializer, <2 x float> [[TMP11]])
+; CHECK-NEXT:    [[TMP13:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP10]], <2 x float> [[TMP12]], <2 x float> zeroinitializer)
+; CHECK-NEXT:    [[TMP14:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP5]], float 0.000000e+00)
+; CHECK-NEXT:    [[TMP15:%.*]] = fmul <2 x float> [[TMP13]], zeroinitializer
+; CHECK-NEXT:    [[MUL6_I_I_I:%.*]] = fmul float [[TMP14]], 0.000000e+00
+; CHECK-NEXT:    store <2 x float> [[TMP15]], ptr [[NOR1]], align 4
 ; CHECK-NEXT:    store float [[MUL6_I_I_I]], ptr [[ARRAYIDX2_I265]], align 4
 ; CHECK-NEXT:    ret i32 0
 ;
diff --git a/llvm/test/Transforms/SpeculativeExecution/PR46267.ll b/llvm/test/Transforms/SpeculativeExecution/PR46267.ll
index c27b492..d940ee6 100644
--- a/llvm/test/Transforms/SpeculativeExecution/PR46267.ll
+++ b/llvm/test/Transforms/SpeculativeExecution/PR46267.ll
@@ -41,12 +41,16 @@ land.rhs:                                         ; preds = %entry
 ; CHECK-NEXT: call void @llvm.dbg.declare(metadata ptr %y
 ; CHECK-NEXT: %a0 = load i32, ptr undef, align 1
 ; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 %a0
+; CHECK-NEXT: call void @llvm.dbg.label
   call void @llvm.dbg.label(metadata !11), !dbg !10
   %y = alloca i32, align 4
   call void @llvm.dbg.declare(metadata ptr %y, metadata !14, metadata !DIExpression()), !dbg !10
 
   %a0 = load i32, ptr undef, align 1
   call void @llvm.dbg.value(metadata i32 %a0, metadata !9, metadata !DIExpression()), !dbg !10
+  ;; RemoveDIs: Check a label that is attached to a hoisted instruction
+  ;; gets left behind (match intrinsic-style debug info behaviour).
+  call void @llvm.dbg.label(metadata !15), !dbg !10
 
   %a2 = add i32 %i, 0
   call void @llvm.dbg.value(metadata i32 %a2, metadata !13, metadata !DIExpression()), !dbg !10
@@ -82,3 +86,4 @@ attributes #1 = { nounwind readnone speculatable willreturn }
 !12 = !DILocalVariable(name: "x", scope: !6, file: !1, line: 3, type: !4)
 !13 = !DILocalVariable(name: "a2", scope: !6, file: !1, line: 3, type: !4)
 !14 = !DILocalVariable(name: "y", scope: !6, file: !1, line: 3, type: !4)
+!15 = !DILabel(scope: !6, name: "label2", file: !1, line: 2)
diff --git a/llvm/test/Transforms/VectorCombine/RISCV/load-widening.ll b/llvm/test/Transforms/VectorCombine/RISCV/load-widening.ll
new file mode 100644
index 0000000..0a43ad2
--- /dev/null
+++ b/llvm/test/Transforms/VectorCombine/RISCV/load-widening.ll
@@ -0,0 +1,19 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -passes=vector-combine -S -mtriple=riscv32 -mattr=+v | FileCheck %s
+; RUN: opt < %s -passes=vector-combine -S -mtriple=riscv64 -mattr=+v | FileCheck %s
+
+define void @fixed_load_scalable_src(ptr %p) {
+; CHECK-LABEL: define void @fixed_load_scalable_src(
+; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    store <vscale x 4 x i16> zeroinitializer, ptr [[P]], align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i16>, ptr [[P]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    ret void
+;
+entry:
+  store <vscale x 4 x i16> zeroinitializer, ptr %p
+  %0 = load <4 x i16>, ptr %p
+  %1 = shufflevector <4 x i16> %0, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+  ret void
+}
diff --git a/llvm/test/tools/llc/new-pm/pipeline.mir b/llvm/test/tools/llc/new-pm/pipeline.mir
index c7dda4b..fcc7d4f 100644
--- a/llvm/test/tools/llc/new-pm/pipeline.mir
+++ b/llvm/test/tools/llc/new-pm/pipeline.mir
@@ -1,7 +1,6 @@
 # RUN: llc -mtriple=x86_64-pc-linux-gnu -x mir -passes=no-op-machine-function --print-pipeline-passes -filetype=null < %s | FileCheck %s --match-full-lines
 
-# CHECK: IR pipeline: PrintMIRPreparePass
-# CHECK: MIR pipeline: no-op-machine-function,print,FreeMachineFunctionPass
+# CHECK: machine-function(no-op-machine-function),PrintMIRPreparePass,machine-function(print,FreeMachineFunctionPass)
 
 ---
 name: f
diff --git a/llvm/test/tools/llc/new-pm/start-stop.ll b/llvm/test/tools/llc/new-pm/start-stop.ll
index c25e45d..8c795a7 100644
--- a/llvm/test/tools/llc/new-pm/start-stop.ll
+++ b/llvm/test/tools/llc/new-pm/start-stop.ll
@@ -1,4 +1,5 @@
-; RUN: llc -mtriple=x86_64-pc-linux-gnu -enable-new-pm -print-pipeline-passes -start-before=mergeicmps -stop-after=gc-lowering -filetype=null %s | FileCheck --match-full-lines %s
-
-; CHECK: IR pipeline: function(mergeicmps,expand-memcmp,gc-lowering)
+; RUN: llc -mtriple=x86_64-pc-linux-gnu -enable-new-pm -print-pipeline-passes -start-before=mergeicmps -stop-after=gc-lowering -filetype=null %s | FileCheck --match-full-lines %s --check-prefix=NULL
+; RUN: llc -mtriple=x86_64-pc-linux-gnu -enable-new-pm -print-pipeline-passes -start-before=mergeicmps -stop-after=gc-lowering -o /dev/null %s | FileCheck --match-full-lines %s --check-prefix=OBJ
 
+; NULL: function(mergeicmps,expand-memcmp,gc-lowering)
+; OBJ: function(mergeicmps,expand-memcmp,gc-lowering),PrintMIRPreparePass,machine-function(print)
diff --git a/llvm/test/tools/llvm-mca/AArch64/HiSilicon/tsv110-forwarding.s b/llvm/test/tools/llvm-mca/AArch64/HiSilicon/tsv110-forwarding.s
new file mode 100644
index 0000000..207822b
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/AArch64/HiSilicon/tsv110-forwarding.s
@@ -0,0 +1,83 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=aarch64 -mcpu=tsv110 --instruction-info=0 --resource-pressure=0 --timeline --iterations=1 < %s | FileCheck %s
+
+# LLVM-MCA-BEGIN madd nobypass
+mul  x0, x1, x2
+add  x0, x0, x1
+add  x0, x0, x1
+add  x0, x0, x1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN madd bypass
+mul  x0, x1, x2
+madd x0, x1, x2, x0
+madd x0, x1, x2, x0
+madd x0, x0, x0, x0
+# LLVM-MCA-END
+
+# CHECK:      [0] Code Region - madd nobypass
+
+# CHECK:      Iterations:        1
+# CHECK-NEXT: Instructions:      4
+# CHECK-NEXT: Total Cycles:      10
+# CHECK-NEXT: Total uOps:        4
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.40
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 1.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeER  .   mul	x0, x1, x2
+# CHECK-NEXT: [0,1]     D====eER .   add	x0, x0, x1
+# CHECK-NEXT: [0,2]     D=====eER.   add	x0, x0, x1
+# CHECK-NEXT: [0,3]     D======eER   add	x0, x0, x1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       mul	x0, x1, x2
+# CHECK-NEXT: 1.     1     5.0    0.0    0.0       add	x0, x0, x1
+# CHECK-NEXT: 2.     1     6.0    0.0    0.0       add	x0, x0, x1
+# CHECK-NEXT: 3.     1     7.0    0.0    0.0       add	x0, x0, x1
+# CHECK-NEXT:        1     4.8    0.3    0.0       <total>
+
+# CHECK:      [1] Code Region - madd bypass
+
+# CHECK:      Iterations:        1
+# CHECK-NEXT: Instructions:      4
+# CHECK-NEXT: Total Cycles:      13
+# CHECK-NEXT: Total uOps:        4
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.31
+# CHECK-NEXT: IPC:               0.31
+# CHECK-NEXT: Block RThroughput: 4.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     012
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeER   . .   mul	x0, x1, x2
+# CHECK-NEXT: [0,1]     D=eeeeER  . .   madd	x0, x1, x2, x0
+# CHECK-NEXT: [0,2]     D==eeeeER . .   madd	x0, x1, x2, x0
+# CHECK-NEXT: [0,3]     D======eeeeER   madd	x0, x0, x0, x0
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       mul	x0, x1, x2
+# CHECK-NEXT: 1.     1     2.0    0.0    0.0       madd	x0, x1, x2, x0
+# CHECK-NEXT: 2.     1     3.0    0.0    0.0       madd	x0, x1, x2, x0
+# CHECK-NEXT: 3.     1     7.0    0.0    0.0       madd	x0, x0, x0, x0
+# CHECK-NEXT:        1     3.3    0.3    0.0       <total>
diff --git a/llvm/test/tools/llvm-readobj/ELF/decompress-zlib-unsupported.test b/llvm/test/tools/llvm-readobj/ELF/decompress-zlib-unsupported.test
new file mode 100644
index 0000000..f4c73de
--- /dev/null
+++ b/llvm/test/tools/llvm-readobj/ELF/decompress-zlib-unsupported.test
@@ -0,0 +1,32 @@
+# UNSUPPORTED: zlib
+# RUN: yaml2obj %s -o %t
+# RUN: llvm-readobj -z -p .a -x .b %t 2>&1 | FileCheck %s -DFILE=%t
+
+# CHECK:      String dump of section '.a':
+# CHECK-NEXT: warning: '[[FILE]]': LLVM was not built with LLVM_ENABLE_ZLIB or did not find zlib at build time
+# CHECK-NEXT: [     0] .
+# CHECK-NEXT: [     8] .
+# CHECK-NEXT: [    10] .
+# CHECK-NEXT: [    18] x.c.
+# CHECK-NEXT: [    1e] .
+# CHECK-NEXT: [    20] .
+# CHECK-NEXT: Hex dump of section '.b':
+# CHECK-NEXT: warning: '[[FILE]]': LLVM was not built with LLVM_ENABLE_ZLIB or did not find zlib at build time
+# CHECK-NEXT: 0x00000000 01000000 00000000 01000000 00000000 ................
+# CHECK-NEXT: 0x00000010 01000000 00000000 789c6304 00000200 ........x.c.....
+# CHECK-NEXT: 0x00000020 02                                  .
+
+--- !ELF
+FileHeader:
+  Class: ELFCLASS64
+  Data:  ELFDATA2LSB
+  Type:  ET_REL
+Sections:
+  - Name: .a
+    Type: SHT_PROGBITS
+    Flags: [SHF_COMPRESSED]
+    Content: 010000000000000001000000000000000100000000000000789c63040000020002
+  - Name: .b
+    Type: SHT_PROGBITS
+    Flags: [SHF_COMPRESSED]
+    Content: 010000000000000001000000000000000100000000000000789c63040000020002
diff --git a/llvm/test/tools/llvm-readobj/ELF/decompress-zlib.test b/llvm/test/tools/llvm-readobj/ELF/decompress-zlib.test
new file mode 100644
index 0000000..ea7a885
--- /dev/null
+++ b/llvm/test/tools/llvm-readobj/ELF/decompress-zlib.test
@@ -0,0 +1,76 @@
+# REQUIRES: zlib
+## Test --decompress/-z.
+
+# RUN: yaml2obj %s -o %t
+
+# RUN: llvm-readelf -z -x .strings -x .not_null_terminated %t | FileCheck %s --check-prefix=HEX
+# RUN: llvm-readobj --decompress -p .strings -p .not_null_terminated %t | FileCheck %s --check-prefix=STR
+
+# HEX:      Hex dump of section '.strings':
+# HEX-NEXT: 0x00000000 68657265 00617265 00736f6d 65007374 here.are.some.st
+# HEX-NEXT: 0x00000010 72696e67 7300                       rings.
+# HEX:      Hex dump of section '.not_null_terminated':
+# HEX-NEXT: 0x00000000 6e6f006e 756c6c                     no.null
+
+# STR:      String dump of section '.strings':
+# STR-NEXT: [ 0] here
+# STR-NEXT: [ 5] are
+# STR-NEXT: [ 9] some
+# STR-NEXT: [ e] strings
+# STR-EMPTY:
+# STR-NEXT: String dump of section '.not_null_terminated':
+# STR-NEXT: [ 0] no
+# STR-NEXT: [ 3] null{{$}}
+# STR-NOT:  {{.}}
+
+# RUN: llvm-readobj -x .strings -p .not_null_terminated %t | FileCheck %s --check-prefix=COMPRESSED
+
+# COMPRESSED:      String dump of section '.not_null_terminated':
+# COMPRESSED-NEXT: [     0] no
+# COMPRESSED-NEXT: [     3] null
+# COMPRESSED-NEXT: Hex dump of section '.strings':
+# COMPRESSED-NEXT: 0x00000000 01000000 00000000 16000000 00000000 ................
+# COMPRESSED-NEXT: 0x00000010 00000000 00000000 789ccb48 2d4a6548 ........x..H-JeH
+# COMPRESSED-NEXT: 0x00000020 04e2e2fc 5c205152 9499975e cc000058 ....\ QR...^...X
+# COMPRESSED-NEXT: 0x00000030 2e079b                              ...
+
+# RUN: llvm-readelf -z -p .invalid1 -x .invalid2 -x .invalid3 %t 2>&1 | FileCheck %s -DFILE=%t --check-prefix=INVALID
+
+# INVALID:      String dump of section '.invalid1':
+# INVALID-NEXT: warning: '[[FILE]]': corrupted compressed section header
+# INVALID-NEXT: [     0] .
+# INVALID-NEXT: Hex dump of section '.invalid2':
+# INVALID-NEXT: warning: '[[FILE]]': zlib error: Z_DATA_ERROR
+# INVALID-NEXT: 0x00000000 01000000 00000000 16000000 00000000 ................
+# INVALID-NEXT: 0x00000010 00000000 00000000 78                ........x
+# INVALID-EMPTY:
+# INVALID-NEXT: Hex dump of section '.invalid3':
+# INVALID-NEXT: warning: '[[FILE]]': unsupported compression type (3)
+# INVALID-NEXT: 0x00000000 03000000 00000000 04000000 00000000 ................
+# INVALID-NEXT: 0x00000010 00000000 00000000 789c6360          ........x.c`
+
+--- !ELF
+FileHeader:
+  Class: ELFCLASS64
+  Data:  ELFDATA2LSB
+  Type:  ET_REL
+Sections:
+  - Name: .strings
+    Type: SHT_PROGBITS
+    Flags: [SHF_COMPRESSED]
+    Content: 010000000000000016000000000000000000000000000000789ccb482d4a654804e2e2fc5c2051529499975ecc0000582e079b
+  - Name: .not_null_terminated
+    Type: SHT_PROGBITS
+    Content: 6e6f006e756c6c
+  - Name: .invalid1
+    Type: SHT_PROGBITS
+    Flags: [SHF_COMPRESSED]
+    Content: 01
+  - Name: .invalid2
+    Type: SHT_PROGBITS
+    Flags: [SHF_COMPRESSED]
+    Content: 01000000000000001600000000000000000000000000000078
+  - Name: .invalid3
+    Type: SHT_PROGBITS
+    Flags: [SHF_COMPRESSED]
+    Content: 030000000000000004000000000000000000000000000000789c6360
diff --git a/llvm/test/tools/llvm-readobj/ELF/decompress-zstd-unsupported.test b/llvm/test/tools/llvm-readobj/ELF/decompress-zstd-unsupported.test
new file mode 100644
index 0000000..65da952
--- /dev/null
+++ b/llvm/test/tools/llvm-readobj/ELF/decompress-zstd-unsupported.test
@@ -0,0 +1,31 @@
+# UNSUPPORTED: zstd
+# RUN: yaml2obj %s -o %t
+# RUN: llvm-readobj -z -p .a -x .b %t 2>&1 | FileCheck %s -DFILE=%t
+
+# CHECK:      String dump of section '.a':
+# CHECK-NEXT: warning: '[[FILE]]': LLVM was not built with LLVM_ENABLE_ZSTD or did not find zstd at build time
+# CHECK-NEXT: [     0] .
+# CHECK-NEXT: [     8] .
+# CHECK-NEXT: [    10] .
+# CHECK-NEXT: [    18] (./. ..
+# CHECK-NEXT: [    21] .
+# CHECK-NEXT: Hex dump of section '.b':
+# CHECK-NEXT: warning: '[[FILE]]': LLVM was not built with LLVM_ENABLE_ZSTD or did not find zstd at build time
+# CHECK-NEXT: 0x00000000 02000000 00000000 01000000 00000000 ................
+# CHECK-NEXT: 0x00000010 01000000 00000000 28b52ffd 20010900 ........(./. ...
+# CHECK-NEXT: 0x00000020 0001                                ..
+
+--- !ELF
+FileHeader:
+  Class: ELFCLASS64
+  Data:  ELFDATA2LSB
+  Type:  ET_REL
+Sections:
+  - Name: .a
+    Type: SHT_PROGBITS
+    Flags: [SHF_COMPRESSED]
+    Content: 02000000000000000100000000000000010000000000000028b52ffd200109000001
+  - Name: .b
+    Type: SHT_PROGBITS
+    Flags: [SHF_COMPRESSED]
+    Content: 02000000000000000100000000000000010000000000000028b52ffd200109000001
diff --git a/llvm/test/tools/llvm-readobj/ELF/decompress-zstd.test b/llvm/test/tools/llvm-readobj/ELF/decompress-zstd.test
new file mode 100644
index 0000000..519db87
--- /dev/null
+++ b/llvm/test/tools/llvm-readobj/ELF/decompress-zstd.test
@@ -0,0 +1,28 @@
+# REQUIRES: zstd
+## Test --decompress/-z for zstd.
+
+# RUN: yaml2obj %s -o %t
+
+# RUN: llvm-readelf -z -x .strings %t | FileCheck %s --check-prefix=HEX
+# RUN: llvm-readobj --decompress -p .strings %t | FileCheck %s --check-prefix=STR
+
+# HEX:      Hex dump of section '.strings':
+# HEX-NEXT: 0x00000000 68657265 00617265 00736f6d 65007374 here.are.some.st
+# HEX-NEXT: 0x00000010 72696e67 7300                       rings.
+
+# STR:      String dump of section '.strings':
+# STR-NEXT: [ 0] here
+# STR-NEXT: [ 5] are
+# STR-NEXT: [ 9] some
+# STR-NEXT: [ e] strings
+
+--- !ELF
+FileHeader:
+  Class: ELFCLASS64
+  Data:  ELFDATA2LSB
+  Type:  ET_REL
+Sections:
+  - Name: .strings
+    Type: SHT_PROGBITS
+    Flags: [SHF_COMPRESSED]
+    Content: 02000000000000001600000000000000000000000000000028b52ffd2016b10000686572650061726500736f6d6500737472696e677300
diff --git a/llvm/test/tools/llvm-symbolizer/wasm-basic.s b/llvm/test/tools/llvm-symbolizer/wasm-basic.s
index cc189ab..1f425e5 100644
--- a/llvm/test/tools/llvm-symbolizer/wasm-basic.s
+++ b/llvm/test/tools/llvm-symbolizer/wasm-basic.s
@@ -1,24 +1,59 @@
 # REQUIRES: webassembly-registered-target
 # RUN: llvm-mc -triple=wasm32-unknown-unknown -filetype=obj %s -o %t.o -g
+# RUN: llvm-symbolizer --basenames --output-style=GNU -e %t.o 1 2 3 4 5 6 7 8 9 10 11 12 13 | FileCheck %s
 
 foo:
     .functype foo () -> ()
     nop
+    return
     end_function
 
 bar:
     .functype bar (i32) -> (i32)
     local.get 0
+    nop
     return
     end_function
 
-# RUN: llvm-symbolizer -e %t.o 3 4 7 8 | FileCheck %s
-## Byte 1 is the function length and 2 is the locals declaration.
-## Currently no line corresponds to them.
-## TODO: create a loc for .functype?
 
-## Test 2 functions to ensure wasm's function-sections system works.
-# CHECK: wasm-basic.s:6:0
-# CHECK: wasm-basic.s:7:0
-# CHECK: wasm-basic.s:11:0
-# CHECK: wasm-basic.s:11:0
+## Symbols start from (including) the function length and should cover all the
+## way to the next symbol start.
+## TODO: create a loc for .functype? It could go with the local declarations.
+
+## Byte 1 is the function length, has no loc but the symbol table considers it
+## the start of the function
+# CHECK: foo
+# CHECK-NEXT: ??:0
+## Byte 2 is the local declaration, but for some reason DWARF is marking it as line 7.
+## TODO: figure out why.
+# CHECK-NEXT: foo
+# CHECK-NEXT: wasm-basic.s:7
+## Byte 3 is actually the nop, line 7
+# CHECK-NEXT: foo
+# CHECK-NEXT: wasm-basic.s:7
+## Byte 4 is the return, line 8
+# CHECK-NEXT: foo
+# CHECK-NEXT: wasm-basic.s:8
+## Byte 5 is the end_function, line 9
+# CHECK-NEXT: foo
+# CHECK-NEXT: wasm-basic.s:9
+## Byte 6 is bar's function length, symbol table considers it part of bar
+# CHECK-NEXT: bar
+# CHECK-NEXT: ??:0
+## Byte 7 bar's local declaration, but DWARF marks it as line 13, like above
+# CHECK-NEXT: bar
+# CHECK-NEXT: wasm-basic.s:13
+## Byte 8 and 9 are actually the local.get on line 13
+# CHECK-NEXT: bar
+# CHECK-NEXT: wasm-basic.s:13
+# CHECK-NEXT: bar
+# CHECK-NEXT: wasm-basic.s:13
+## Byte 10 is the nop
+# CHECK-NEXT: bar
+# CHECK-NEXT: wasm-basic.s:14
+## Byte b is the return
+# CHECK-NEXT: bar
+# CHECK-NEXT: wasm-basic.s:15
+## Byte c is end_function
+# CHECK-NEXT: bar
+# CHECK-NEXT: wasm-basic.s:16
diff --git a/llvm/tools/llc/NewPMDriver.cpp b/llvm/tools/llc/NewPMDriver.cpp
index c3288ef..6ae1b8d 100644
--- a/llvm/tools/llc/NewPMDriver.cpp
+++ b/llvm/tools/llc/NewPMDriver.cpp
@@ -89,30 +89,6 @@ bool LLCDiagnosticHandler::handleDiagnostics(const DiagnosticInfo &DI) {
 
 static llvm::ExitOnError ExitOnErr;
 
-static void RunPasses(bool BOS, ToolOutputFile *Out, Module *M,
-                      LLVMContext &Context, SmallString<0> &Buffer,
-                      ModulePassManager *MPM, ModuleAnalysisManager *MAM,
-                      MachineFunctionPassManager &MFPM,
-                      MachineFunctionAnalysisManager &MFAM) {
-  assert(M && "invalid input module!");
-
-  // Before executing passes, print the final values of the LLVM options.
-  cl::PrintOptionValues();
-
-  if (MPM) {
-    assert(MAM && "expect a ModuleAnalysisManager!");
-    MPM->run(*M, *MAM);
-  }
-
-  ExitOnErr(MFPM.run(*M, MFAM));
-
-  if (Context.getDiagHandlerPtr()->HasErrors)
-    exit(1);
-
-  if (BOS)
-    Out->os() << Buffer;
-}
-
 int llvm::compileModuleWithNewPM(
     StringRef Arg0, std::unique_ptr<Module> M, std::unique_ptr<MIRParser> MIR,
     std::unique_ptr<TargetMachine> Target, std::unique_ptr<ToolOutputFile> Out,
@@ -131,16 +107,6 @@ int llvm::compileModuleWithNewPM(
 
   raw_pwrite_stream *OS = &Out->os();
 
-  // Manually do the buffering rather than using buffer_ostream,
-  // so we can memcmp the contents in CompileTwice mode in future.
-  SmallString<0> Buffer;
-  std::unique_ptr<raw_svector_ostream> BOS;
-  if ((codegen::getFileType() != CodeGenFileType::AssemblyFile &&
-       !Out->os().supportsSeeking())) {
-    BOS = std::make_unique<raw_svector_ostream>(Buffer);
-    OS = BOS.get();
-  }
-
   // Fetch options from TargetPassConfig
   CGPassBuilderOption Opt = getCGPassBuilderOption();
   Opt.DisableVerify = NoVerify;
@@ -158,20 +124,19 @@ int llvm::compileModuleWithNewPM(
   FunctionAnalysisManager FAM;
   CGSCCAnalysisManager CGAM;
   ModuleAnalysisManager MAM;
+  MachineFunctionAnalysisManager MFAM;
   PassBuilder PB(Target.get(), PipelineTuningOptions(), std::nullopt, &PIC);
   PB.registerModuleAnalyses(MAM);
   PB.registerCGSCCAnalyses(CGAM);
   PB.registerFunctionAnalyses(FAM);
   PB.registerLoopAnalyses(LAM);
-  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
+  PB.registerMachineFunctionAnalyses(MFAM);
+  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM, &MFAM);
 
   FAM.registerPass([&] { return TargetLibraryAnalysis(TLII); });
   MAM.registerPass([&] { return MachineModuleAnalysis(MMI); });
 
-  MachineFunctionAnalysisManager MFAM(FAM, MAM);
-
   ModulePassManager MPM;
-  MachineFunctionPassManager MFPM;
 
   if (!PassPipeline.empty()) {
     // Construct a custom pass pipeline that starts after instruction
@@ -182,49 +147,39 @@ int llvm::compileModuleWithNewPM(
       return 1;
     }
 
-    ExitOnErr(PB.parsePassPipeline(MFPM, PassPipeline));
+    // FIXME: verify that there are no IR passes.
+    ExitOnErr(PB.parsePassPipeline(MPM, PassPipeline));
     MPM.addPass(PrintMIRPreparePass(*OS));
+    MachineFunctionPassManager MFPM;
     MFPM.addPass(PrintMIRPass(*OS));
     MFPM.addPass(FreeMachineFunctionPass());
+    MPM.addPass(createModuleToMachineFunctionPassAdaptor(std::move(MFPM)));
 
-    auto &MMI = MFAM.getResult<MachineModuleAnalysis>(*M).getMMI();
     if (MIR->parseMachineFunctions(*M, MMI))
       return 1;
   } else {
-    ExitOnErr(LLVMTM.buildCodeGenPipeline(MPM, MFPM, MFAM, *OS,
-                                          DwoOut ? &DwoOut->os() : nullptr,
-                                          FileType, Opt, &PIC));
-
-    auto StartStopInfo = TargetPassConfig::getStartStopInfo(PIC);
-    assert(StartStopInfo && "Expect StartStopInfo!");
-
-    if (auto StopPassName = StartStopInfo->StopPass; !StopPassName.empty()) {
-      MFPM.addPass(PrintMIRPass(*OS));
-      MFPM.addPass(FreeMachineFunctionPass());
-    }
+    ExitOnErr(LLVMTM.buildCodeGenPipeline(
+        MPM, *OS, DwoOut ? &DwoOut->os() : nullptr, FileType, Opt, &PIC));
   }
 
   if (PrintPipelinePasses) {
-    std::string IRPipeline;
-    raw_string_ostream IRSOS(IRPipeline);
-    MPM.printPipeline(IRSOS, [&PIC](StringRef ClassName) {
-      auto PassName = PIC.getPassNameForClassName(ClassName);
-      return PassName.empty() ? ClassName : PassName;
-    });
-    outs() << "IR pipeline: " << IRPipeline << '\n';
-
-    std::string MIRPipeline;
-    raw_string_ostream MIRSOS(MIRPipeline);
-    MFPM.printPipeline(MIRSOS, [&PIC](StringRef ClassName) {
+    std::string PipelineStr;
+    raw_string_ostream OS(PipelineStr);
+    MPM.printPipeline(OS, [&PIC](StringRef ClassName) {
       auto PassName = PIC.getPassNameForClassName(ClassName);
       return PassName.empty() ? ClassName : PassName;
     });
-    outs() << "MIR pipeline: " << MIRPipeline << '\n';
+    outs() << PipelineStr << '\n';
     return 0;
   }
 
-  RunPasses(BOS.get(), Out.get(), M.get(), Context, Buffer, &MPM, &MAM, MFPM,
-            MFAM);
+  // Before executing passes, print the final values of the LLVM options.
+  cl::PrintOptionValues();
+
+  MPM.run(*M, MAM);
+
+  if (Context.getDiagHandlerPtr()->HasErrors)
+    exit(1);
 
   // Declare success.
   Out->keep();
diff --git a/llvm/tools/llvm-gsymutil/Opts.td b/llvm/tools/llvm-gsymutil/Opts.td
index 7402914..3aabc80 100644
--- a/llvm/tools/llvm-gsymutil/Opts.td
+++ b/llvm/tools/llvm-gsymutil/Opts.td
@@ -35,3 +35,6 @@ defm address : Eq<"address", "Lookup an address in a GSYM file">;
 def addresses_from_stdin :
   FF<"addresses-from-stdin",
      "Lookup addresses in a GSYM file that are read from stdin\nEach input line is expected to be of the following format: <addr> <gsym-path>">;
+defm json_summary_file :
+  Eq<"json-summary-file",
+     "Output a categorized summary of errors into the JSON file specified.">;
diff --git a/llvm/tools/llvm-gsymutil/llvm-gsymutil.cpp b/llvm/tools/llvm-gsymutil/llvm-gsymutil.cpp
index 2de9c76..00a24cd 100644
--- a/llvm/tools/llvm-gsymutil/llvm-gsymutil.cpp
+++ b/llvm/tools/llvm-gsymutil/llvm-gsymutil.cpp
@@ -18,6 +18,7 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/Format.h"
+#include "llvm/Support/JSON.h"
 #include "llvm/Support/LLVMDriver.h"
 #include "llvm/Support/ManagedStatic.h"
 #include "llvm/Support/MemoryBuffer.h"
@@ -87,6 +88,7 @@ static std::vector<std::string> InputFilenames;
 static std::string ConvertFilename;
 static std::vector<std::string> ArchFilters;
 static std::string OutputFilename;
+static std::string JsonSummaryFile;
 static bool Verify;
 static unsigned NumThreads;
 static uint64_t SegmentSize;
@@ -138,6 +140,9 @@ static void parseArgs(int argc, char **argv) {
   if (const llvm::opt::Arg *A = Args.getLastArg(OPT_out_file_EQ))
     OutputFilename = A->getValue();
 
+  if (const llvm::opt::Arg *A = Args.getLastArg(OPT_json_summary_file_EQ))
+    JsonSummaryFile = A->getValue();
+
   Verify = Args.hasArg(OPT_verify);
 
   if (const llvm::opt::Arg *A = Args.getLastArg(OPT_num_threads_EQ)) {
@@ -515,10 +520,34 @@ int llvm_gsymutil_main(int argc, char **argv, const llvm::ToolContext &) {
     // Call error() if we have an error and it will exit with a status of 1
     if (auto Err = convertFileToGSYM(Aggregation))
       error("DWARF conversion failed: ", std::move(Err));
+
     // Report the errors from aggregator:
     Aggregation.EnumerateResults([&](StringRef category, unsigned count) {
       OS << category << " occurred " << count << " time(s)\n";
     });
+    if (!JsonSummaryFile.empty()) {
+      std::error_code EC;
+      raw_fd_ostream JsonStream(JsonSummaryFile, EC, sys::fs::OF_Text);
+      if (EC) {
+        OS << "error opening aggregate error json file '" << JsonSummaryFile
+           << "' for writing: " << EC.message() << '\n';
+        return 1;
+      }
+
+      llvm::json::Object Categories;
+      uint64_t ErrorCount = 0;
+      Aggregation.EnumerateResults([&](StringRef Category, unsigned Count) {
+        llvm::json::Object Val;
+        Val.try_emplace("count", Count);
+        Categories.try_emplace(Category, std::move(Val));
+        ErrorCount += Count;
+      });
+      llvm::json::Object RootNode;
+      RootNode.try_emplace("error-categories", std::move(Categories));
+      RootNode.try_emplace("error-count", ErrorCount);
+
+      JsonStream << llvm::json::Value(std::move(RootNode));
+    }
     return 0;
   }
 
diff --git a/llvm/tools/llvm-link/llvm-link.cpp b/llvm/tools/llvm-link/llvm-link.cpp
index e6c219a..9e7f2c3 100644
--- a/llvm/tools/llvm-link/llvm-link.cpp
+++ b/llvm/tools/llvm-link/llvm-link.cpp
@@ -393,8 +393,16 @@ static bool linkFiles(const char *argv0, LLVMContext &Context, Linker &L,
   // Similar to some flags, internalization doesn't apply to the first file.
   bool InternalizeLinkedSymbols = false;
   for (const auto &File : Files) {
+    auto BufferOrErr = MemoryBuffer::getFileOrSTDIN(File);
+
+    // When we encounter a missing file, make sure we expose its name.
+    if (auto EC = BufferOrErr.getError())
+      if (EC == std::errc::no_such_file_or_directory)
+        ExitOnErr(createStringError(EC, "No such file or directory: '%s'",
+                                    File.c_str()));
+
     std::unique_ptr<MemoryBuffer> Buffer =
-        ExitOnErr(errorOrToExpected(MemoryBuffer::getFileOrSTDIN(File)));
+        ExitOnErr(errorOrToExpected(std::move(BufferOrErr)));
 
     std::unique_ptr<Module> M =
         identify_magic(Buffer->getBuffer()) == file_magic::archive
diff --git a/llvm/tools/llvm-readobj/ObjDumper.cpp b/llvm/tools/llvm-readobj/ObjDumper.cpp
index 59060ac..0d3fea7 100644
--- a/llvm/tools/llvm-readobj/ObjDumper.cpp
+++ b/llvm/tools/llvm-readobj/ObjDumper.cpp
@@ -14,6 +14,7 @@
 #include "ObjDumper.h"
 #include "llvm-readobj.h"
 #include "llvm/Object/Archive.h"
+#include "llvm/Object/Decompressor.h"
 #include "llvm/Object/ObjectFile.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/FormatVariadic.h"
@@ -142,8 +143,23 @@ getSectionRefsByNameOrIndex(const object::ObjectFile &Obj,
   return Ret;
 }
 
+static void maybeDecompress(const object::ObjectFile &Obj,
+                            StringRef SectionName, StringRef &SectionContent,
+                            SmallString<0> &Out) {
+  Expected<object::Decompressor> Decompressor = object::Decompressor::create(
+      SectionName, SectionContent, Obj.isLittleEndian(), Obj.is64Bit());
+  if (!Decompressor)
+    reportWarning(Decompressor.takeError(), Obj.getFileName());
+  else if (auto Err = Decompressor->resizeAndDecompress(Out))
+    reportWarning(std::move(Err), Obj.getFileName());
+  else
+    SectionContent = Out;
+}
+
 void ObjDumper::printSectionsAsString(const object::ObjectFile &Obj,
-                                      ArrayRef<std::string> Sections) {
+                                      ArrayRef<std::string> Sections,
+                                      bool Decompress) {
+  SmallString<0> Out;
   bool First = true;
   for (object::SectionRef Section :
        getSectionRefsByNameOrIndex(Obj, Sections)) {
@@ -156,12 +172,16 @@ void ObjDumper::printSectionsAsString(const object::ObjectFile &Obj,
 
     StringRef SectionContent =
         unwrapOrError(Obj.getFileName(), Section.getContents());
+    if (Decompress && Section.isCompressed())
+      maybeDecompress(Obj, SectionName, SectionContent, Out);
     printAsStringList(SectionContent);
   }
 }
 
 void ObjDumper::printSectionsAsHex(const object::ObjectFile &Obj,
-                                   ArrayRef<std::string> Sections) {
+                                   ArrayRef<std::string> Sections,
+                                   bool Decompress) {
+  SmallString<0> Out;
   bool First = true;
   for (object::SectionRef Section :
        getSectionRefsByNameOrIndex(Obj, Sections)) {
@@ -174,6 +194,8 @@ void ObjDumper::printSectionsAsHex(const object::ObjectFile &Obj,
 
     StringRef SectionContent =
         unwrapOrError(Obj.getFileName(), Section.getContents());
+    if (Decompress && Section.isCompressed())
+      maybeDecompress(Obj, SectionName, SectionContent, Out);
     const uint8_t *SecContent = SectionContent.bytes_begin();
     const uint8_t *SecEnd = SecContent + SectionContent.size();
 
diff --git a/llvm/tools/llvm-readobj/ObjDumper.h b/llvm/tools/llvm-readobj/ObjDumper.h
index 1d67945..3958dd3 100644
--- a/llvm/tools/llvm-readobj/ObjDumper.h
+++ b/llvm/tools/llvm-readobj/ObjDumper.h
@@ -175,9 +175,9 @@ public:
   void printAsStringList(StringRef StringContent, size_t StringDataOffset = 0);
 
   void printSectionsAsString(const object::ObjectFile &Obj,
-                             ArrayRef<std::string> Sections);
+                             ArrayRef<std::string> Sections, bool Decompress);
   void printSectionsAsHex(const object::ObjectFile &Obj,
-                          ArrayRef<std::string> Sections);
+                          ArrayRef<std::string> Sections, bool Decompress);
 
   std::function<Error(const Twine &Msg)> WarningHandler;
   void reportUniqueWarning(Error Err) const;
diff --git a/llvm/tools/llvm-readobj/Opts.td b/llvm/tools/llvm-readobj/Opts.td
index e2d93c6..018facc 100644
--- a/llvm/tools/llvm-readobj/Opts.td
+++ b/llvm/tools/llvm-readobj/Opts.td
@@ -20,6 +20,7 @@ def all : FF<"all", "Equivalent to setting: --file-header, --program-headers, --
 def arch_specific : FF<"arch-specific", "Display architecture-specific information">;
 def bb_addr_map : FF<"bb-addr-map", "Display the BB address map section">;
 def cg_profile : FF<"cg-profile", "Display call graph profile section">;
+def decompress : FF<"decompress", "Dump decompressed section content when used with -x or -p">;
 defm demangle : BB<"demangle", "Demangle symbol names", "Do not demangle symbol names (default)">;
 def dependent_libraries : FF<"dependent-libraries", "Display the dependent libraries section">;
 def dyn_relocations : FF<"dyn-relocations", "Display the dynamic relocation entries in the file">;
@@ -139,3 +140,4 @@ def : F<"u", "Alias for --unwind">, Alias<unwind>;
 def : F<"X", "Alias for --extra-sym-info">, Alias<extra_sym_info>, Group<grp_elf>;
 def : F<"V", "Alias for --version-info">, Alias<version_info>, Group<grp_elf>;
 def : JoinedOrSeparate<["-"], "x">, Alias<hex_dump_EQ>, HelpText<"Alias for --hex-dump">, MetaVarName<"<name or index>">;
+def : F<"z", "Alias for --decompress">, Alias<decompress>;
diff --git a/llvm/tools/llvm-readobj/llvm-readobj.cpp b/llvm/tools/llvm-readobj/llvm-readobj.cpp
index f9d605d..979433d 100644
--- a/llvm/tools/llvm-readobj/llvm-readobj.cpp
+++ b/llvm/tools/llvm-readobj/llvm-readobj.cpp
@@ -97,6 +97,7 @@ static bool ArchSpecificInfo;
 static bool BBAddrMap;
 bool ExpandRelocs;
 static bool CGProfile;
+static bool Decompress;
 bool Demangle;
 static bool DependentLibraries;
 static bool DynRelocs;
@@ -212,6 +213,7 @@ static void parseOptions(const opt::InputArgList &Args) {
   opts::ArchSpecificInfo = Args.hasArg(OPT_arch_specific);
   opts::BBAddrMap = Args.hasArg(OPT_bb_addr_map);
   opts::CGProfile = Args.hasArg(OPT_cg_profile);
+  opts::Decompress = Args.hasArg(OPT_decompress);
   opts::Demangle = Args.hasFlag(OPT_demangle, OPT_no_demangle, false);
   opts::DependentLibraries = Args.hasArg(OPT_dependent_libraries);
   opts::DynRelocs = Args.hasArg(OPT_dyn_relocations);
@@ -439,9 +441,9 @@ static void dumpObject(ObjectFile &Obj, ScopedPrinter &Writer,
     Dumper->printSymbols(opts::Symbols, opts::DynamicSymbols,
                          opts::ExtraSymInfo, SymComp);
   if (!opts::StringDump.empty())
-    Dumper->printSectionsAsString(Obj, opts::StringDump);
+    Dumper->printSectionsAsString(Obj, opts::StringDump, opts::Decompress);
   if (!opts::HexDump.empty())
-    Dumper->printSectionsAsHex(Obj, opts::HexDump);
+    Dumper->printSectionsAsHex(Obj, opts::HexDump, opts::Decompress);
   if (opts::HashTable)
     Dumper->printHashTable();
   if (opts::GnuHashTable)
diff --git a/llvm/unittests/Analysis/ValueTrackingTest.cpp b/llvm/unittests/Analysis/ValueTrackingTest.cpp
index 8104a32..9e0abe7 100644
--- a/llvm/unittests/Analysis/ValueTrackingTest.cpp
+++ b/llvm/unittests/Analysis/ValueTrackingTest.cpp
@@ -824,42 +824,7 @@ TEST_F(ValueTrackingTest, ComputeNumSignBits_Shuffle_Pointers) {
 TEST(ValueTracking, propagatesPoison) {
   std::string AsmHead =
       "declare i32 @g(i32)\n"
-      "declare {i32, i1} @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)\n"
-      "declare {i32, i1} @llvm.ssub.with.overflow.i32(i32 %a, i32 %b)\n"
-      "declare {i32, i1} @llvm.smul.with.overflow.i32(i32 %a, i32 %b)\n"
-      "declare {i32, i1} @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)\n"
-      "declare {i32, i1} @llvm.usub.with.overflow.i32(i32 %a, i32 %b)\n"
-      "declare {i32, i1} @llvm.umul.with.overflow.i32(i32 %a, i32 %b)\n"
-      "declare float @llvm.sqrt.f32(float)\n"
-      "declare float @llvm.powi.f32.i32(float, i32)\n"
-      "declare float @llvm.sin.f32(float)\n"
-      "declare float @llvm.cos.f32(float)\n"
-      "declare float @llvm.pow.f32(float, float)\n"
-      "declare float @llvm.exp.f32(float)\n"
-      "declare float @llvm.exp2.f32(float)\n"
-      "declare float @llvm.log.f32(float)\n"
-      "declare float @llvm.log10.f32(float)\n"
-      "declare float @llvm.log2.f32(float)\n"
-      "declare float @llvm.fma.f32(float, float, float)\n"
-      "declare float @llvm.fabs.f32(float)\n"
-      "declare float @llvm.minnum.f32(float, float)\n"
-      "declare float @llvm.maxnum.f32(float, float)\n"
-      "declare float @llvm.minimum.f32(float, float)\n"
-      "declare float @llvm.maximum.f32(float, float)\n"
-      "declare float @llvm.copysign.f32(float, float)\n"
-      "declare float @llvm.floor.f32(float)\n"
-      "declare float @llvm.ceil.f32(float)\n"
-      "declare float @llvm.trunc.f32(float)\n"
-      "declare float @llvm.rint.f32(float)\n"
-      "declare float @llvm.nearbyint.f32(float)\n"
-      "declare float @llvm.round.f32(float)\n"
-      "declare float @llvm.roundeven.f32(float)\n"
-      "declare i32 @llvm.lround.f32(float)\n"
-      "declare i64 @llvm.llround.f32(float)\n"
-      "declare i32 @llvm.lrint.f32(float)\n"
-      "declare i64 @llvm.llrint.f32(float)\n"
-      "declare float @llvm.fmuladd.f32(float, float, float)\n"
-      "define void @f(i32 %x, i32 %y, float %fx, float %fy, "
+      "define void @f(i32 %x, i32 %y, i32 %shamt, float %fx, float %fy, "
       "i1 %cond, ptr %p) {\n";
   std::string AsmTail = "  ret void\n}";
   // (propagates poison?, IR instruction)
@@ -912,6 +877,28 @@ TEST(ValueTracking, propagatesPoison) {
       {true, "call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %x, i32 %y)", 0},
       {true, "call {i32, i1} @llvm.usub.with.overflow.i32(i32 %x, i32 %y)", 0},
       {true, "call {i32, i1} @llvm.umul.with.overflow.i32(i32 %x, i32 %y)", 0},
+      {true, "call i32 @llvm.sadd.sat.i32(i32 %x, i32 %y)", 0},
+      {true, "call i32 @llvm.ssub.sat.i32(i32 %x, i32 %y)", 0},
+      {true, "call i32 @llvm.sshl.sat.i32(i32 %x, i32 %y)", 0},
+      {true, "call i32 @llvm.uadd.sat.i32(i32 %x, i32 %y)", 0},
+      {true, "call i32 @llvm.usub.sat.i32(i32 %x, i32 %y)", 0},
+      {true, "call i32 @llvm.ushl.sat.i32(i32 %x, i32 %y)", 0},
+      {true, "call i32 @llvm.ctpop.i32(i32 %x)", 0},
+      {true, "call i32 @llvm.ctlz.i32(i32 %x, i1 true)", 0},
+      {true, "call i32 @llvm.cttz.i32(i32 %x, i1 true)", 0},
+      {true, "call i32 @llvm.abs.i32(i32 %x, i1 true)", 0},
+      {true, "call i32 @llvm.smax.i32(i32 %x, i32 %y)", 0},
+      {true, "call i32 @llvm.smin.i32(i32 %x, i32 %y)", 0},
+      {true, "call i32 @llvm.umax.i32(i32 %x, i32 %y)", 0},
+      {true, "call i32 @llvm.umin.i32(i32 %x, i32 %y)", 0},
+      {true, "call i32 @llvm.bitreverse.i32(i32 %x)", 0},
+      {true, "call i32 @llvm.bswap.i32(i32 %x)", 0},
+      {false, "call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %shamt)", 0},
+      {false, "call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %shamt)", 1},
+      {false, "call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %shamt)", 2},
+      {false, "call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %shamt)", 0},
+      {false, "call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %shamt)", 1},
+      {false, "call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %shamt)", 2},
       {false, "call float @llvm.sqrt.f32(float %fx)", 0},
       {false, "call float @llvm.powi.f32.i32(float %fx, i32 %x)", 0},
       {false, "call float @llvm.sin.f32(float %fx)", 0},
diff --git a/llvm/unittests/CodeGen/CMakeLists.txt b/llvm/unittests/CodeGen/CMakeLists.txt
index 6140e0d..dbbacdd 100644
--- a/llvm/unittests/CodeGen/CMakeLists.txt
+++ b/llvm/unittests/CodeGen/CMakeLists.txt
@@ -40,6 +40,7 @@ add_llvm_unittest(CodeGenTests
   ScalableVectorMVTsTest.cpp
   SchedBoundary.cpp
   SelectionDAGAddressAnalysisTest.cpp
+  SelectionDAGPatternMatchTest.cpp
   TypeTraitsTest.cpp
   TargetOptionsTest.cpp
   TestAsmPrinter.cpp
diff --git a/llvm/unittests/CodeGen/PassManagerTest.cpp b/llvm/unittests/CodeGen/PassManagerTest.cpp
index 28003c2..4283eb0 100644
--- a/llvm/unittests/CodeGen/PassManagerTest.cpp
+++ b/llvm/unittests/CodeGen/PassManagerTest.cpp
@@ -5,13 +5,18 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
+// Test that the various MachineFunction pass managers, adaptors, analyses, and
+// analysis managers work.
+//===----------------------------------------------------------------------===//
 
+#include "llvm/IR/PassManager.h"
 #include "llvm/Analysis/CGSCCPassManager.h"
 #include "llvm/Analysis/LoopAnalysisManager.h"
 #include "llvm/AsmParser/Parser.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachinePassManager.h"
+#include "llvm/IR/Analysis.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 #include "llvm/MC/TargetRegistry.h"
@@ -34,14 +39,9 @@ public:
     int InstructionCount;
   };
 
-  /// Run the analysis pass over the function and return a result.
+  /// The number of instructions in the Function.
   Result run(Function &F, FunctionAnalysisManager &AM) {
-    int Count = 0;
-    for (Function::iterator BBI = F.begin(), BBE = F.end(); BBI != BBE; ++BBI)
-      for (BasicBlock::iterator II = BBI->begin(), IE = BBI->end(); II != IE;
-           ++II)
-        ++Count;
-    return Result(Count);
+    return Result(F.getInstructionCount());
   }
 
 private:
@@ -59,13 +59,12 @@ public:
     int InstructionCount;
   };
 
-  /// Run the analysis pass over the machine function and return a result.
-  Result run(MachineFunction &MF, MachineFunctionAnalysisManager::Base &AM) {
-    auto &MFAM = static_cast<MachineFunctionAnalysisManager &>(AM);
-    // Query function analysis result.
+  Result run(MachineFunction &MF, MachineFunctionAnalysisManager &AM) {
+    FunctionAnalysisManager &FAM =
+        AM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(MF)
+            .getManager();
     TestFunctionAnalysis::Result &FAR =
-        MFAM.getResult<TestFunctionAnalysis>(MF.getFunction());
-    // + 5
+        FAM.getResult<TestFunctionAnalysis>(MF.getFunction());
     return FAR.InstructionCount;
   }
 
@@ -76,90 +75,54 @@ private:
 
 AnalysisKey TestMachineFunctionAnalysis::Key;
 
-const std::string DoInitErrMsg = "doInitialization failed";
-const std::string DoFinalErrMsg = "doFinalization failed";
-
 struct TestMachineFunctionPass : public PassInfoMixin<TestMachineFunctionPass> {
-  TestMachineFunctionPass(int &Count, std::vector<int> &BeforeInitialization,
-                          std::vector<int> &BeforeFinalization,
-                          std::vector<int> &MachineFunctionPassCount)
-      : Count(Count), BeforeInitialization(BeforeInitialization),
-        BeforeFinalization(BeforeFinalization),
-        MachineFunctionPassCount(MachineFunctionPassCount) {}
-
-  Error doInitialization(Module &M, MachineFunctionAnalysisManager &MFAM) {
-    // Force doInitialization fail by starting with big `Count`.
-    if (Count > 10000)
-      return make_error<StringError>(DoInitErrMsg, inconvertibleErrorCode());
-
-    // + 1
-    ++Count;
-    BeforeInitialization.push_back(Count);
-    return Error::success();
-  }
-  Error doFinalization(Module &M, MachineFunctionAnalysisManager &MFAM) {
-    // Force doFinalization fail by starting with big `Count`.
-    if (Count > 1000)
-      return make_error<StringError>(DoFinalErrMsg, inconvertibleErrorCode());
-
-    // + 1
-    ++Count;
-    BeforeFinalization.push_back(Count);
-    return Error::success();
-  }
+  TestMachineFunctionPass(int &Count, std::vector<int> &Counts)
+      : Count(Count), Counts(Counts) {}
 
   PreservedAnalyses run(MachineFunction &MF,
                         MachineFunctionAnalysisManager &MFAM) {
-    // Query function analysis result.
+    FunctionAnalysisManager &FAM =
+        MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(MF)
+            .getManager();
     TestFunctionAnalysis::Result &FAR =
-        MFAM.getResult<TestFunctionAnalysis>(MF.getFunction());
-    // 3 + 1 + 1 = 5
+        FAM.getResult<TestFunctionAnalysis>(MF.getFunction());
     Count += FAR.InstructionCount;
 
-    // Query module analysis result.
-    MachineModuleInfo &MMI =
-        MFAM.getResult<MachineModuleAnalysis>(*MF.getFunction().getParent())
-            .getMMI();
-    // 1 + 1 + 1 = 3
-    Count += (MMI.getModule() == MF.getFunction().getParent());
-
-    // Query machine function analysis result.
     TestMachineFunctionAnalysis::Result &MFAR =
         MFAM.getResult<TestMachineFunctionAnalysis>(MF);
-    // 3 + 1 + 1 = 5
     Count += MFAR.InstructionCount;
 
-    MachineFunctionPassCount.push_back(Count);
+    Counts.push_back(Count);
 
     return PreservedAnalyses::none();
   }
 
   int &Count;
-  std::vector<int> &BeforeInitialization;
-  std::vector<int> &BeforeFinalization;
-  std::vector<int> &MachineFunctionPassCount;
+  std::vector<int> &Counts;
 };
 
 struct TestMachineModulePass : public PassInfoMixin<TestMachineModulePass> {
-  TestMachineModulePass(int &Count, std::vector<int> &MachineModulePassCount)
-      : Count(Count), MachineModulePassCount(MachineModulePassCount) {}
-
-  Error run(Module &M, MachineFunctionAnalysisManager &MFAM) {
-    MachineModuleInfo &MMI = MFAM.getResult<MachineModuleAnalysis>(M).getMMI();
-    // + 1
-    Count += (MMI.getModule() == &M);
-    MachineModulePassCount.push_back(Count);
-    return Error::success();
-  }
-
-  PreservedAnalyses run(MachineFunction &MF,
-                        MachineFunctionAnalysisManager &AM) {
-    llvm_unreachable(
-        "This should never be reached because this is machine module pass");
+  TestMachineModulePass(int &Count, std::vector<int> &Counts)
+      : Count(Count), Counts(Counts) {}
+
+  PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM) {
+    MachineModuleInfo &MMI = MAM.getResult<MachineModuleAnalysis>(M).getMMI();
+    FunctionAnalysisManager &FAM =
+        MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+    MachineFunctionAnalysisManager &MFAM =
+        MAM.getResult<MachineFunctionAnalysisManagerModuleProxy>(M)
+            .getManager();
+    for (Function &F : M) {
+      MachineFunction &MF = MMI.getOrCreateMachineFunction(F);
+      Count += FAM.getResult<TestFunctionAnalysis>(F).InstructionCount;
+      Count += MFAM.getResult<TestMachineFunctionAnalysis>(MF).InstructionCount;
+    }
+    Counts.push_back(Count);
+    return PreservedAnalyses::all();
   }
 
   int &Count;
-  std::vector<int> &MachineModulePassCount;
+  std::vector<int> &Counts;
 };
 
 std::unique_ptr<Module> parseIR(LLVMContext &Context, const char *IR) {
@@ -211,102 +174,40 @@ TEST_F(PassManagerTest, Basic) {
   M->setDataLayout(TM->createDataLayout());
 
   MachineModuleInfo MMI(LLVMTM);
+
   LoopAnalysisManager LAM;
   FunctionAnalysisManager FAM;
   CGSCCAnalysisManager CGAM;
   ModuleAnalysisManager MAM;
+  MachineFunctionAnalysisManager MFAM;
   PassBuilder PB(TM.get());
   PB.registerModuleAnalyses(MAM);
+  PB.registerCGSCCAnalyses(CGAM);
   PB.registerFunctionAnalyses(FAM);
-  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
+  PB.registerLoopAnalyses(LAM);
+  PB.registerMachineFunctionAnalyses(MFAM);
+  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM, &MFAM);
 
   FAM.registerPass([&] { return TestFunctionAnalysis(); });
-  FAM.registerPass([&] { return PassInstrumentationAnalysis(); });
   MAM.registerPass([&] { return MachineModuleAnalysis(MMI); });
-  MAM.registerPass([&] { return PassInstrumentationAnalysis(); });
-
-  MachineFunctionAnalysisManager MFAM;
-  {
-    // Test move assignment.
-    MachineFunctionAnalysisManager NestedMFAM(FAM, MAM);
-    NestedMFAM.registerPass([&] { return PassInstrumentationAnalysis(); });
-    NestedMFAM.registerPass([&] { return TestMachineFunctionAnalysis(); });
-    MFAM = std::move(NestedMFAM);
-  }
+  MFAM.registerPass([&] { return TestMachineFunctionAnalysis(); });
 
   int Count = 0;
-  std::vector<int> BeforeInitialization[2];
-  std::vector<int> BeforeFinalization[2];
-  std::vector<int> TestMachineFunctionCount[2];
-  std::vector<int> TestMachineModuleCount[2];
+  std::vector<int> Counts;
 
+  ModulePassManager MPM;
   MachineFunctionPassManager MFPM;
-  {
-    // Test move assignment.
-    MachineFunctionPassManager NestedMFPM;
-    NestedMFPM.addPass(TestMachineModulePass(Count, TestMachineModuleCount[0]));
-    NestedMFPM.addPass(TestMachineFunctionPass(Count, BeforeInitialization[0],
-                                               BeforeFinalization[0],
-                                               TestMachineFunctionCount[0]));
-    NestedMFPM.addPass(TestMachineModulePass(Count, TestMachineModuleCount[1]));
-    NestedMFPM.addPass(TestMachineFunctionPass(Count, BeforeInitialization[1],
-                                               BeforeFinalization[1],
-                                               TestMachineFunctionCount[1]));
-    MFPM = std::move(NestedMFPM);
-  }
+  MPM.addPass(TestMachineModulePass(Count, Counts));
+  MPM.addPass(createModuleToMachineFunctionPassAdaptor(
+      TestMachineFunctionPass(Count, Counts)));
+  MPM.addPass(TestMachineModulePass(Count, Counts));
+  MFPM.addPass(TestMachineFunctionPass(Count, Counts));
+  MPM.addPass(createModuleToMachineFunctionPassAdaptor(std::move(MFPM)));
+
+  MPM.run(*M, MAM);
 
-  ASSERT_FALSE(errorToBool(MFPM.run(*M, MFAM)));
-
-  // Check first machine module pass
-  EXPECT_EQ(1u, TestMachineModuleCount[0].size());
-  EXPECT_EQ(3, TestMachineModuleCount[0][0]);
-
-  // Check first machine function pass
-  EXPECT_EQ(1u, BeforeInitialization[0].size());
-  EXPECT_EQ(1, BeforeInitialization[0][0]);
-  EXPECT_EQ(3u, TestMachineFunctionCount[0].size());
-  EXPECT_EQ(10, TestMachineFunctionCount[0][0]);
-  EXPECT_EQ(13, TestMachineFunctionCount[0][1]);
-  EXPECT_EQ(16, TestMachineFunctionCount[0][2]);
-  EXPECT_EQ(1u, BeforeFinalization[0].size());
-  EXPECT_EQ(31, BeforeFinalization[0][0]);
-
-  // Check second machine module pass
-  EXPECT_EQ(1u, TestMachineModuleCount[1].size());
-  EXPECT_EQ(17, TestMachineModuleCount[1][0]);
-
-  // Check second machine function pass
-  EXPECT_EQ(1u, BeforeInitialization[1].size());
-  EXPECT_EQ(2, BeforeInitialization[1][0]);
-  EXPECT_EQ(3u, TestMachineFunctionCount[1].size());
-  EXPECT_EQ(24, TestMachineFunctionCount[1][0]);
-  EXPECT_EQ(27, TestMachineFunctionCount[1][1]);
-  EXPECT_EQ(30, TestMachineFunctionCount[1][2]);
-  EXPECT_EQ(1u, BeforeFinalization[1].size());
-  EXPECT_EQ(32, BeforeFinalization[1][0]);
-
-  EXPECT_EQ(32, Count);
-
-  // doInitialization returns error
-  Count = 10000;
-  MFPM.addPass(TestMachineFunctionPass(Count, BeforeInitialization[1],
-                                       BeforeFinalization[1],
-                                       TestMachineFunctionCount[1]));
-  std::string Message;
-  llvm::handleAllErrors(MFPM.run(*M, MFAM), [&](llvm::StringError &Error) {
-    Message = Error.getMessage();
-  });
-  EXPECT_EQ(Message, DoInitErrMsg);
-
-  // doFinalization returns error
-  Count = 1000;
-  MFPM.addPass(TestMachineFunctionPass(Count, BeforeInitialization[1],
-                                       BeforeFinalization[1],
-                                       TestMachineFunctionCount[1]));
-  llvm::handleAllErrors(MFPM.run(*M, MFAM), [&](llvm::StringError &Error) {
-    Message = Error.getMessage();
-  });
-  EXPECT_EQ(Message, DoFinalErrMsg);
+  EXPECT_EQ((std::vector<int>{10, 16, 18, 20, 30, 36, 38, 40}), Counts);
+  EXPECT_EQ(40, Count);
 }
 
 } // namespace
diff --git a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
new file mode 100644
index 0000000..17fc3ce
--- /dev/null
+++ b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
@@ -0,0 +1,292 @@
+//===---- llvm/unittest/CodeGen/SelectionDAGPatternMatchTest.cpp  ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/AsmParser/Parser.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/SDPatternMatch.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/TargetSelect.h"
+#include "llvm/Target/TargetMachine.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+
+class SelectionDAGPatternMatchTest : public testing::Test {
+protected:
+  static void SetUpTestCase() {
+    InitializeAllTargets();
+    InitializeAllTargetMCs();
+  }
+
+  void SetUp() override {
+    StringRef Assembly = "@g = global i32 0\n"
+                         "@g_alias = alias i32, i32* @g\n"
+                         "define i32 @f() {\n"
+                         "  %1 = load i32, i32* @g\n"
+                         "  ret i32 %1\n"
+                         "}";
+
+    Triple TargetTriple("riscv64--");
+    std::string Error;
+    const Target *T = TargetRegistry::lookupTarget("", TargetTriple, Error);
+    // FIXME: These tests do not depend on RISCV specifically, but we have to
+    // initialize a target. A skeleton Target for unittests would allow us to
+    // always run these tests.
+    if (!T)
+      GTEST_SKIP();
+
+    TargetOptions Options;
+    TM = std::unique_ptr<LLVMTargetMachine>(static_cast<LLVMTargetMachine *>(
+        T->createTargetMachine("riscv64", "", "+m,+f,+d,+v", Options,
+                               std::nullopt, std::nullopt,
+                               CodeGenOptLevel::Aggressive)));
+    if (!TM)
+      GTEST_SKIP();
+
+    SMDiagnostic SMError;
+    M = parseAssemblyString(Assembly, SMError, Context);
+    if (!M)
+      report_fatal_error(SMError.getMessage());
+    M->setDataLayout(TM->createDataLayout());
+
+    F = M->getFunction("f");
+    if (!F)
+      report_fatal_error("F?");
+    G = M->getGlobalVariable("g");
+    if (!G)
+      report_fatal_error("G?");
+    AliasedG = M->getNamedAlias("g_alias");
+    if (!AliasedG)
+      report_fatal_error("AliasedG?");
+
+    MachineModuleInfo MMI(TM.get());
+
+    MF = std::make_unique<MachineFunction>(*F, *TM, *TM->getSubtargetImpl(*F),
+                                           0, MMI);
+
+    DAG = std::make_unique<SelectionDAG>(*TM, CodeGenOptLevel::None);
+    if (!DAG)
+      report_fatal_error("DAG?");
+    OptimizationRemarkEmitter ORE(F);
+    DAG->init(*MF, ORE, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr);
+  }
+
+  TargetLoweringBase::LegalizeTypeAction getTypeAction(EVT VT) {
+    return DAG->getTargetLoweringInfo().getTypeAction(Context, VT);
+  }
+
+  EVT getTypeToTransformTo(EVT VT) {
+    return DAG->getTargetLoweringInfo().getTypeToTransformTo(Context, VT);
+  }
+
+  LLVMContext Context;
+  std::unique_ptr<LLVMTargetMachine> TM;
+  std::unique_ptr<Module> M;
+  Function *F;
+  GlobalVariable *G;
+  GlobalAlias *AliasedG;
+  std::unique_ptr<MachineFunction> MF;
+  std::unique_ptr<SelectionDAG> DAG;
+};
+
+TEST_F(SelectionDAGPatternMatchTest, matchValueType) {
+  SDLoc DL;
+  auto Int32VT = EVT::getIntegerVT(Context, 32);
+  auto Float32VT = EVT::getFloatingPointVT(32);
+  auto VInt32VT = EVT::getVectorVT(Context, Int32VT, 4);
+
+  SDValue Op0 = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 1, Int32VT);
+  SDValue Op1 = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 2, Float32VT);
+  SDValue Op2 = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 2, VInt32VT);
+
+  using namespace SDPatternMatch;
+  EXPECT_TRUE(sd_match(Op0, m_SpecificVT(Int32VT)));
+  EVT BindVT;
+  EXPECT_TRUE(sd_match(Op1, m_VT(BindVT)));
+  EXPECT_EQ(BindVT, Float32VT);
+  EXPECT_TRUE(sd_match(Op0, m_IntegerVT()));
+  EXPECT_TRUE(sd_match(Op1, m_FloatingPointVT()));
+  EXPECT_TRUE(sd_match(Op2, m_VectorVT()));
+  EXPECT_FALSE(sd_match(Op2, m_ScalableVectorVT()));
+}
+
+TEST_F(SelectionDAGPatternMatchTest, matchBinaryOp) {
+  SDLoc DL;
+  auto Int32VT = EVT::getIntegerVT(Context, 32);
+  auto Float32VT = EVT::getFloatingPointVT(32);
+
+  SDValue Op0 = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 1, Int32VT);
+  SDValue Op1 = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 2, Int32VT);
+  SDValue Op2 = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 3, Float32VT);
+
+  SDValue Add = DAG->getNode(ISD::ADD, DL, Int32VT, Op0, Op1);
+  SDValue Sub = DAG->getNode(ISD::SUB, DL, Int32VT, Add, Op0);
+  SDValue Mul = DAG->getNode(ISD::MUL, DL, Int32VT, Add, Sub);
+
+  SDValue SFAdd = DAG->getNode(ISD::STRICT_FADD, DL, {Float32VT, MVT::Other},
+                               {DAG->getEntryNode(), Op2, Op2});
+
+  using namespace SDPatternMatch;
+  EXPECT_TRUE(sd_match(Sub, m_BinOp(ISD::SUB, m_Value(), m_Value())));
+  EXPECT_TRUE(sd_match(Sub, m_Sub(m_Value(), m_Value())));
+  EXPECT_TRUE(sd_match(Add, m_c_BinOp(ISD::ADD, m_Value(), m_Value())));
+  EXPECT_TRUE(sd_match(Add, m_Add(m_Value(), m_Value())));
+  EXPECT_TRUE(sd_match(
+      Mul, m_Mul(m_OneUse(m_Opc(ISD::SUB)), m_NUses<2>(m_Specific(Add)))));
+  EXPECT_TRUE(
+      sd_match(SFAdd, m_ChainedBinOp(ISD::STRICT_FADD, m_SpecificVT(Float32VT),
+                                     m_SpecificVT(Float32VT))));
+  SDValue BindVal;
+  EXPECT_TRUE(sd_match(SFAdd, m_ChainedBinOp(ISD::STRICT_FADD, m_Value(BindVal),
+                                             m_Deferred(BindVal))));
+  EXPECT_FALSE(sd_match(SFAdd, m_ChainedBinOp(ISD::STRICT_FADD, m_OtherVT(),
+                                              m_SpecificVT(Float32VT))));
+}
+
+TEST_F(SelectionDAGPatternMatchTest, matchUnaryOp) {
+  SDLoc DL;
+  auto Int32VT = EVT::getIntegerVT(Context, 32);
+  auto Int64VT = EVT::getIntegerVT(Context, 64);
+
+  SDValue Op0 = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 1, Int32VT);
+  SDValue Op1 = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 1, Int64VT);
+
+  SDValue ZExt = DAG->getNode(ISD::ZERO_EXTEND, DL, Int64VT, Op0);
+  SDValue SExt = DAG->getNode(ISD::SIGN_EXTEND, DL, Int64VT, Op0);
+  SDValue Trunc = DAG->getNode(ISD::TRUNCATE, DL, Int32VT, Op1);
+
+  using namespace SDPatternMatch;
+  EXPECT_TRUE(sd_match(ZExt, m_UnaryOp(ISD::ZERO_EXTEND, m_Value())));
+  EXPECT_TRUE(sd_match(SExt, m_SExt(m_Value())));
+  EXPECT_TRUE(sd_match(Trunc, m_Trunc(m_Specific(Op1))));
+}
+
+TEST_F(SelectionDAGPatternMatchTest, matchConstants) {
+  SDLoc DL;
+  auto Int32VT = EVT::getIntegerVT(Context, 32);
+  auto VInt32VT = EVT::getVectorVT(Context, Int32VT, 4);
+
+  SDValue Arg0 = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 1, Int32VT);
+
+  SDValue Const3 = DAG->getConstant(3, DL, Int32VT);
+  SDValue Const87 = DAG->getConstant(87, DL, Int32VT);
+  SDValue Splat = DAG->getSplat(VInt32VT, DL, Arg0);
+  SDValue ConstSplat = DAG->getSplat(VInt32VT, DL, Const3);
+  SDValue Zero = DAG->getConstant(0, DL, Int32VT);
+  SDValue One = DAG->getConstant(1, DL, Int32VT);
+  SDValue AllOnes = DAG->getConstant(APInt::getAllOnes(32), DL, Int32VT);
+
+  using namespace SDPatternMatch;
+  EXPECT_TRUE(sd_match(Const87, m_ConstInt()));
+  EXPECT_FALSE(sd_match(Arg0, m_ConstInt()));
+  APInt ConstVal;
+  EXPECT_TRUE(sd_match(ConstSplat, m_ConstInt(ConstVal)));
+  EXPECT_EQ(ConstVal, 3);
+  EXPECT_FALSE(sd_match(Splat, m_ConstInt()));
+
+  EXPECT_TRUE(sd_match(Const87, m_SpecificInt(87)));
+  EXPECT_TRUE(sd_match(Const3, m_SpecificInt(ConstVal)));
+  EXPECT_TRUE(sd_match(AllOnes, m_AllOnes()));
+
+  EXPECT_TRUE(sd_match(Zero, DAG.get(), m_False()));
+  EXPECT_TRUE(sd_match(One, DAG.get(), m_True()));
+  EXPECT_FALSE(sd_match(AllOnes, DAG.get(), m_True()));
+}
+
+TEST_F(SelectionDAGPatternMatchTest, patternCombinators) {
+  SDLoc DL;
+  auto Int32VT = EVT::getIntegerVT(Context, 32);
+
+  SDValue Op0 = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 1, Int32VT);
+  SDValue Op1 = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 2, Int32VT);
+
+  SDValue Add = DAG->getNode(ISD::ADD, DL, Int32VT, Op0, Op1);
+  SDValue Sub = DAG->getNode(ISD::SUB, DL, Int32VT, Add, Op0);
+
+  using namespace SDPatternMatch;
+  EXPECT_TRUE(sd_match(
+      Sub, m_AnyOf(m_Opc(ISD::ADD), m_Opc(ISD::SUB), m_Opc(ISD::MUL))));
+  EXPECT_TRUE(sd_match(Add, m_AllOf(m_Opc(ISD::ADD), m_OneUse())));
+}
+
+TEST_F(SelectionDAGPatternMatchTest, matchNode) {
+  SDLoc DL;
+  auto Int32VT = EVT::getIntegerVT(Context, 32);
+
+  SDValue Op0 = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 1, Int32VT);
+  SDValue Op1 = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 2, Int32VT);
+
+  SDValue Add = DAG->getNode(ISD::ADD, DL, Int32VT, Op0, Op1);
+
+  using namespace SDPatternMatch;
+  EXPECT_TRUE(sd_match(Add, m_Node(ISD::ADD, m_Value(), m_Value())));
+  EXPECT_FALSE(sd_match(Add, m_Node(ISD::SUB, m_Value(), m_Value())));
+  EXPECT_FALSE(sd_match(Add, m_Node(ISD::ADD, m_Value())));
+  EXPECT_FALSE(
+      sd_match(Add, m_Node(ISD::ADD, m_Value(), m_Value(), m_Value())));
+  EXPECT_FALSE(sd_match(Add, m_Node(ISD::ADD, m_ConstInt(), m_Value())));
+}
+
+namespace {
+struct VPMatchContext : public SDPatternMatch::BasicMatchContext {
+  using SDPatternMatch::BasicMatchContext::BasicMatchContext;
+
+  bool match(SDValue OpVal, unsigned Opc) const {
+    if (!OpVal->isVPOpcode())
+      return OpVal->getOpcode() == Opc;
+
+    auto BaseOpc = ISD::getBaseOpcodeForVP(OpVal->getOpcode(), false);
+    return BaseOpc.has_value() && *BaseOpc == Opc;
+  }
+};
+} // anonymous namespace
+TEST_F(SelectionDAGPatternMatchTest, matchContext) {
+  SDLoc DL;
+  auto BoolVT = EVT::getIntegerVT(Context, 1);
+  auto Int32VT = EVT::getIntegerVT(Context, 32);
+  auto VInt32VT = EVT::getVectorVT(Context, Int32VT, 4);
+  auto MaskVT = EVT::getVectorVT(Context, BoolVT, 4);
+
+  SDValue Scalar0 = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 1, Int32VT);
+  SDValue Vector0 = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 2, VInt32VT);
+  SDValue Mask0 = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 3, MaskVT);
+
+  SDValue VPAdd = DAG->getNode(ISD::VP_ADD, DL, VInt32VT,
+                               {Vector0, Vector0, Mask0, Scalar0});
+  SDValue VPReduceAdd = DAG->getNode(ISD::VP_REDUCE_ADD, DL, Int32VT,
+                                     {Scalar0, VPAdd, Mask0, Scalar0});
+
+  using namespace SDPatternMatch;
+  VPMatchContext VPCtx(DAG.get());
+  EXPECT_TRUE(sd_context_match(VPAdd, VPCtx, m_Opc(ISD::ADD)));
+  // VP_REDUCE_ADD doesn't have a based opcode, so we use a normal
+  // sd_match before switching to VPMatchContext when checking VPAdd.
+  EXPECT_TRUE(sd_match(VPReduceAdd, m_Node(ISD::VP_REDUCE_ADD, m_Value(),
+                                           m_Context(VPCtx, m_Opc(ISD::ADD)),
+                                           m_Value(), m_Value())));
+}
+
+TEST_F(SelectionDAGPatternMatchTest, matchAdvancedProperties) {
+  SDLoc DL;
+  auto Int16VT = EVT::getIntegerVT(Context, 16);
+  auto Int64VT = EVT::getIntegerVT(Context, 64);
+
+  SDValue Op0 = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 1, Int64VT);
+  SDValue Op1 = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 2, Int16VT);
+
+  SDValue Add = DAG->getNode(ISD::ADD, DL, Int64VT, Op0, Op0);
+
+  using namespace SDPatternMatch;
+  EXPECT_TRUE(sd_match(Op0, DAG.get(), m_LegalType(m_Value())));
+  EXPECT_FALSE(sd_match(Op1, DAG.get(), m_LegalType(m_Value())));
+  EXPECT_TRUE(sd_match(Add, DAG.get(),
+                       m_LegalOp(m_IntegerVT(m_Add(m_Value(), m_Value())))));
+}
diff --git a/llvm/unittests/DebugInfo/DWARF/DWARFDebugLineTest.cpp b/llvm/unittests/DebugInfo/DWARF/DWARFDebugLineTest.cpp
index d42a626..980b627 100644
--- a/llvm/unittests/DebugInfo/DWARF/DWARFDebugLineTest.cpp
+++ b/llvm/unittests/DebugInfo/DWARF/DWARFDebugLineTest.cpp
@@ -823,7 +823,9 @@ TEST_F(DebugLineBasicFixture, ErrorForUnsupportedAddressSizeDefinedInHeader) {
                                                     nullptr, RecordRecoverable);
   EXPECT_THAT_ERROR(
       std::move(Recoverable),
-      FailedWithMessage("address size 0x09 of DW_LNE_set_address opcode at "
+      FailedWithMessage("parsing line table prologue at offset 0x00000000: "
+                        "invalid address size 9",
+                        "address size 0x09 of DW_LNE_set_address opcode at "
                         "offset 0x00000038 is unsupported"));
   ASSERT_THAT_EXPECTED(ExpectedLineTable, Succeeded());
   ASSERT_EQ((*ExpectedLineTable)->Rows.size(), 3u);
diff --git a/llvm/unittests/IR/VPIntrinsicTest.cpp b/llvm/unittests/IR/VPIntrinsicTest.cpp
index 7a9d91c..e3462f0 100644
--- a/llvm/unittests/IR/VPIntrinsicTest.cpp
+++ b/llvm/unittests/IR/VPIntrinsicTest.cpp
@@ -163,6 +163,14 @@ protected:
         << "(<8 x i16>, i1 immarg, <8 x i1>, i32) ";
     Str << " declare <8 x i16> @llvm.vp.cttz.v8i16"
         << "(<8 x i16>, i1 immarg, <8 x i1>, i32) ";
+    Str << " declare <8 x i16> @llvm.vp.sadd.sat.v8i16"
+        << "(<8 x i16>, <8 x i16>, <8 x i1>, i32) ";
+    Str << " declare <8 x i16> @llvm.vp.uadd.sat.v8i16"
+        << "(<8 x i16>, <8 x i16>, <8 x i1>, i32) ";
+    Str << " declare <8 x i16> @llvm.vp.ssub.sat.v8i16"
+        << "(<8 x i16>, <8 x i16>, <8 x i1>, i32) ";
+    Str << " declare <8 x i16> @llvm.vp.usub.sat.v8i16"
+        << "(<8 x i16>, <8 x i16>, <8 x i1>, i32) ";
     Str << " declare <8 x i16> @llvm.vp.fshl.v8i16"
         << "(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i1>, i32) ";
     Str << " declare <8 x i16> @llvm.vp.fshr.v8i16"
diff --git a/llvm/unittests/MIR/CMakeLists.txt b/llvm/unittests/MIR/CMakeLists.txt
index f485dcb..0ad5213 100644
--- a/llvm/unittests/MIR/CMakeLists.txt
+++ b/llvm/unittests/MIR/CMakeLists.txt
@@ -1,5 +1,6 @@
 set(LLVM_LINK_COMPONENTS
   ${LLVM_TARGETS_TO_BUILD}
+  Analysis
   CodeGen
   CodeGenTypes
   Core
diff --git a/llvm/unittests/MIR/PassBuilderCallbacksTest.cpp b/llvm/unittests/MIR/PassBuilderCallbacksTest.cpp
index 8ecde22..8e3738d 100644
--- a/llvm/unittests/MIR/PassBuilderCallbacksTest.cpp
+++ b/llvm/unittests/MIR/PassBuilderCallbacksTest.cpp
@@ -6,6 +6,9 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/Analysis/CGSCCPassManager.h"
+#include "llvm/Analysis/LoopAnalysisManager.h"
+#include "llvm/CodeGen/FreeMachineFunction.h"
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Testing/Support/Error.h"
@@ -96,8 +99,6 @@ MATCHER_P(HasNameRegex, Name, "") {
 }
 
 struct MockPassInstrumentationCallbacks {
-  PassInstrumentationCallbacks Callbacks;
-
   MockPassInstrumentationCallbacks() {
     ON_CALL(*this, runBeforePass(_, _)).WillByDefault(Return(true));
   }
@@ -111,7 +112,7 @@ struct MockPassInstrumentationCallbacks {
   MOCK_METHOD2(runBeforeAnalysis, void(StringRef PassID, llvm::Any));
   MOCK_METHOD2(runAfterAnalysis, void(StringRef PassID, llvm::Any));
 
-  void registerPassInstrumentation() {
+  void registerPassInstrumentation(PassInstrumentationCallbacks &Callbacks) {
     Callbacks.registerShouldRunOptionalPassCallback(
         [this](StringRef P, llvm::Any IR) {
           return this->runBeforePass(P, IR);
@@ -147,7 +148,8 @@ struct MockPassInstrumentationCallbacks {
     // to check these explicitly.
     EXPECT_CALL(*this,
                 runBeforePass(Not(HasNameRegex("Mock")), HasName(IRName)))
-        .Times(AnyNumber());
+        .Times(AnyNumber())
+        .WillRepeatedly(Return(false));
     EXPECT_CALL(
         *this, runBeforeSkippedPass(Not(HasNameRegex("Mock")), HasName(IRName)))
         .Times(AnyNumber());
@@ -157,15 +159,9 @@ struct MockPassInstrumentationCallbacks {
     EXPECT_CALL(*this,
                 runAfterPass(Not(HasNameRegex("Mock")), HasName(IRName), _))
         .Times(AnyNumber());
-    EXPECT_CALL(*this, runBeforeAnalysis(HasNameRegex("MachineModuleAnalysis"),
-                                         HasName(IRName)))
-        .Times(AnyNumber());
     EXPECT_CALL(*this,
                 runBeforeAnalysis(Not(HasNameRegex("Mock")), HasName(IRName)))
         .Times(AnyNumber());
-    EXPECT_CALL(*this, runAfterAnalysis(HasNameRegex("MachineModuleAnalysis"),
-                                        HasName(IRName)))
-        .Times(AnyNumber());
     EXPECT_CALL(*this,
                 runAfterAnalysis(Not(HasNameRegex("Mock")), HasName(IRName)))
         .Times(AnyNumber());
@@ -202,7 +198,7 @@ public:
       }
     };
 
-    Result run(MachineFunction &IR, MachineFunctionAnalysisManager::Base &AM) {
+    Result run(MachineFunction &IR, MachineFunctionAnalysisManager &AM) {
       return Handle->run(IR, AM);
     }
   };
@@ -249,7 +245,7 @@ public:
 
   public:
     PreservedAnalyses run(MachineFunction &IR,
-                          MachineFunctionAnalysisManager::Base &AM) {
+                          MachineFunctionAnalysisManager &AM) {
       return Handle->run(IR, AM);
     }
   };
@@ -270,7 +266,7 @@ protected:
 
 struct MockAnalysisHandle : public MockAnalysisHandleBase<MockAnalysisHandle> {
   MOCK_METHOD2(run, Analysis::Result(MachineFunction &,
-                                     MachineFunctionAnalysisManager::Base &));
+                                     MachineFunctionAnalysisManager &));
 
   MOCK_METHOD3(invalidate, bool(MachineFunction &, const PreservedAnalyses &,
                                 MachineFunctionAnalysisManager::Invalidator &));
@@ -284,7 +280,7 @@ AnalysisKey MockAnalysisHandleBase<DerivedT>::Analysis::Key;
 class MockPassHandle : public MockPassHandleBase<MockPassHandle> {
 public:
   MOCK_METHOD2(run, PreservedAnalyses(MachineFunction &,
-                                      MachineFunctionAnalysisManager::Base &));
+                                      MachineFunctionAnalysisManager &));
 
   MockPassHandle() { setDefaults(); }
 };
@@ -297,50 +293,51 @@ protected:
     InitializeAllTargetMCs();
   }
 
+  LLVMContext Context;
+
   std::unique_ptr<LLVMTargetMachine> TM;
   std::unique_ptr<MachineModuleInfo> MMI;
 
-  LLVMContext Context;
   std::unique_ptr<Module> M;
-  std::unique_ptr<MIRParser> MIR;
-
-  MockPassInstrumentationCallbacks CallbacksHandle;
 
-  PassBuilder PB;
-  ModulePassManager PM;
-  MachineFunctionPassManager MFPM;
-  FunctionAnalysisManager FAM;
-  ModuleAnalysisManager AM;
+  PassInstrumentationCallbacks PIC;
+  std::unique_ptr<PassBuilder> PB;
+  ModulePassManager MPM;
   MachineFunctionAnalysisManager MFAM;
+  LoopAnalysisManager LAM;
+  FunctionAnalysisManager FAM;
+  CGSCCAnalysisManager CGAM;
+  ModuleAnalysisManager MAM;
 
+  MockPassInstrumentationCallbacks CallbacksHandle;
   MockPassHandle PassHandle;
   MockAnalysisHandle AnalysisHandle;
 
-  std::unique_ptr<Module> parseMIR(const TargetMachine &TM, StringRef MIRCode,
-                                   MachineModuleInfo &MMI) {
+  static std::unique_ptr<Module> parseMIR(StringRef MIRCode,
+                                          LLVMContext &Context,
+                                          TargetMachine &TM,
+                                          MachineModuleInfo &MMI) {
     SMDiagnostic Diagnostic;
     std::unique_ptr<MemoryBuffer> MBuffer = MemoryBuffer::getMemBuffer(MIRCode);
-    MIR = createMIRParser(std::move(MBuffer), Context);
-    if (!MIR)
-      return nullptr;
+    std::unique_ptr<MIRParser> MIR =
+        createMIRParser(std::move(MBuffer), Context);
+    assert(MIR);
 
     std::unique_ptr<Module> Mod = MIR->parseIRModule();
-    if (!Mod)
-      return nullptr;
+    assert(Mod);
 
+    // Module identifier is used in tests below.
+    Mod->setModuleIdentifier("module");
     Mod->setDataLayout(TM.createDataLayout());
 
-    if (MIR->parseMachineFunctions(*Mod, MMI)) {
-      M.reset();
-      return nullptr;
-    }
+    [[maybe_unused]] bool Ret = MIR->parseMachineFunctions(*Mod, MMI);
+    assert(!Ret);
+
     return Mod;
   }
 
   static PreservedAnalyses
-  getAnalysisResult(MachineFunction &U,
-                    MachineFunctionAnalysisManager::Base &AM) {
-    auto &MFAM = static_cast<MachineFunctionAnalysisManager &>(AM);
+  getAnalysisResult(MachineFunction &U, MachineFunctionAnalysisManager &MFAM) {
     MFAM.getResult<MockAnalysisHandle::Analysis>(U);
     return PreservedAnalyses::all();
   }
@@ -356,25 +353,18 @@ protected:
             TripleName, "", "", TargetOptions(), std::nullopt)));
     if (!TM)
       GTEST_SKIP();
-    MMI = std::make_unique<MachineModuleInfo>(TM.get());
-    M = parseMIR(*TM, MIRString, *MMI);
-    AM.registerPass([&] { return MachineModuleAnalysis(*MMI); });
-  }
 
-  MachineFunctionCallbacksTest()
-      : CallbacksHandle(), PB(nullptr, PipelineTuningOptions(), std::nullopt,
-                              &CallbacksHandle.Callbacks),
-        PM(), FAM(), AM(), MFAM(FAM, AM) {
-
-    EXPECT_TRUE(&CallbacksHandle.Callbacks ==
-                PB.getPassInstrumentationCallbacks());
+    MMI = std::make_unique<MachineModuleInfo>(TM.get());
+    M = parseMIR(MIRString, Context, *TM, *MMI);
+    PB = std::make_unique<PassBuilder>(TM.get(), PipelineTuningOptions(),
+                                       std::nullopt, &PIC);
 
     /// Register a callback for analysis registration.
     ///
     /// The callback is a function taking a reference to an AnalyisManager
     /// object. When called, the callee gets to register its own analyses with
     /// this PassBuilder instance.
-    PB.registerAnalysisRegistrationCallback(
+    PB->registerAnalysisRegistrationCallback(
         [this](MachineFunctionAnalysisManager &AM) {
           // Register our mock analysis
           AM.registerPass([this] { return AnalysisHandle.getAnalysis(); });
@@ -386,24 +376,29 @@ protected:
     /// callbacks for each encountered pass name that it does not know. This
     /// includes both simple pass names as well as names of sub-pipelines. In
     /// the latter case, the InnerPipeline is not empty.
-    PB.registerPipelineParsingCallback(
-        [this](StringRef Name, MachineFunctionPassManager &PM) {
+    PB->registerPipelineParsingCallback(
+        [this](StringRef Name, MachineFunctionPassManager &PM,
+               ArrayRef<PassBuilder::PipelineElement> InnerPipeline) {
           if (parseAnalysisUtilityPasses<MockAnalysisHandle::Analysis>(
                   "test-analysis", Name, PM))
             return true;
 
           /// Parse the name of our pass mock handle
           if (Name == "test-transform") {
-            MFPM.addPass(PassHandle.getPass());
+            PM.addPass(PassHandle.getPass());
             return true;
           }
           return false;
         });
 
     /// Register builtin analyses and cross-register the analysis proxies
-    PB.registerModuleAnalyses(AM);
-    PB.registerFunctionAnalyses(FAM);
-    PB.registerMachineFunctionAnalyses(MFAM);
+    PB->registerModuleAnalyses(MAM);
+    PB->registerCGSCCAnalyses(CGAM);
+    PB->registerFunctionAnalyses(FAM);
+    PB->registerLoopAnalyses(LAM);
+    PB->registerMachineFunctionAnalyses(MFAM);
+    PB->crossRegisterProxies(LAM, FAM, CGAM, MAM, &MFAM);
+    MAM.registerPass([&] { return MachineModuleAnalysis(*MMI); });
   }
 };
 
@@ -412,53 +407,58 @@ TEST_F(MachineFunctionCallbacksTest, Passes) {
   EXPECT_CALL(PassHandle, run(HasName("test"), _)).WillOnce(&getAnalysisResult);
 
   StringRef PipelineText = "test-transform";
-  ASSERT_THAT_ERROR(PB.parsePassPipeline(MFPM, PipelineText), Succeeded())
+  ASSERT_THAT_ERROR(PB->parsePassPipeline(MPM, PipelineText), Succeeded())
       << "Pipeline was: " << PipelineText;
-  ASSERT_THAT_ERROR(MFPM.run(*M, MFAM), Succeeded());
+  MPM.run(*M, MAM);
 }
 
 TEST_F(MachineFunctionCallbacksTest, InstrumentedPasses) {
-  CallbacksHandle.registerPassInstrumentation();
+  CallbacksHandle.registerPassInstrumentation(PIC);
   // Non-mock instrumentation not specifically mentioned below can be ignored.
-  CallbacksHandle.ignoreNonMockPassInstrumentation("<string>");
   CallbacksHandle.ignoreNonMockPassInstrumentation("test");
-  CallbacksHandle.ignoreNonMockPassInstrumentation("");
+  CallbacksHandle.ignoreNonMockPassInstrumentation("module");
 
   // PassInstrumentation calls should happen in-sequence, in the same order
   // as passes/analyses are scheduled.
   ::testing::Sequence PISequence;
   EXPECT_CALL(CallbacksHandle,
               runBeforePass(HasNameRegex("MockPassHandle"), HasName("test")))
-      .InSequence(PISequence);
+      .InSequence(PISequence)
+      .WillOnce(Return(true));
   EXPECT_CALL(
       CallbacksHandle,
       runBeforeNonSkippedPass(HasNameRegex("MockPassHandle"), HasName("test")))
       .InSequence(PISequence);
-  EXPECT_CALL(CallbacksHandle,
-              runBeforeAnalysis(HasNameRegex("MockAnalysisHandle"), _))
+  EXPECT_CALL(
+      CallbacksHandle,
+      runBeforeAnalysis(HasNameRegex("MockAnalysisHandle"), HasName("test")))
       .InSequence(PISequence);
-  EXPECT_CALL(CallbacksHandle,
-              runAfterAnalysis(HasNameRegex("MockAnalysisHandle"), _))
+  EXPECT_CALL(
+      CallbacksHandle,
+      runAfterAnalysis(HasNameRegex("MockAnalysisHandle"), HasName("test")))
       .InSequence(PISequence);
   EXPECT_CALL(CallbacksHandle,
               runAfterPass(HasNameRegex("MockPassHandle"), HasName("test"), _))
       .InSequence(PISequence);
+  EXPECT_CALL(
+      CallbacksHandle,
+      runBeforeSkippedPass(HasNameRegex("MockPassHandle"), HasName("test")))
+      .Times(0);
 
   EXPECT_CALL(AnalysisHandle, run(HasName("test"), _));
   EXPECT_CALL(PassHandle, run(HasName("test"), _)).WillOnce(&getAnalysisResult);
 
   StringRef PipelineText = "test-transform";
-  ASSERT_THAT_ERROR(PB.parsePassPipeline(MFPM, PipelineText), Succeeded())
+  ASSERT_THAT_ERROR(PB->parsePassPipeline(MPM, PipelineText), Succeeded())
       << "Pipeline was: " << PipelineText;
-  ASSERT_THAT_ERROR(MFPM.run(*M, MFAM), Succeeded());
+  MPM.run(*M, MAM);
 }
 
 TEST_F(MachineFunctionCallbacksTest, InstrumentedSkippedPasses) {
-  CallbacksHandle.registerPassInstrumentation();
+  CallbacksHandle.registerPassInstrumentation(PIC);
   // Non-mock instrumentation run here can safely be ignored.
-  CallbacksHandle.ignoreNonMockPassInstrumentation("<string>");
   CallbacksHandle.ignoreNonMockPassInstrumentation("test");
-  CallbacksHandle.ignoreNonMockPassInstrumentation("");
+  CallbacksHandle.ignoreNonMockPassInstrumentation("module");
 
   // Skip the pass by returning false.
   EXPECT_CALL(CallbacksHandle,
@@ -495,9 +495,81 @@ TEST_F(MachineFunctionCallbacksTest, InstrumentedSkippedPasses) {
       .Times(0);
 
   StringRef PipelineText = "test-transform";
-  ASSERT_THAT_ERROR(PB.parsePassPipeline(MFPM, PipelineText), Succeeded())
+  ASSERT_THAT_ERROR(PB->parsePassPipeline(MPM, PipelineText), Succeeded())
       << "Pipeline was: " << PipelineText;
-  ASSERT_THAT_ERROR(MFPM.run(*M, MFAM), Succeeded());
+  MPM.run(*M, MAM);
+}
+
+// Check that the Module -> MachineFunction adaptor properly calls
+// runAfterPassInvalidated.
+TEST_F(MachineFunctionCallbacksTest, InstrumentedFreeMFPass) {
+  CallbacksHandle.registerPassInstrumentation(PIC);
+  // Non-mock instrumentation run here can safely be ignored.
+  CallbacksHandle.ignoreNonMockPassInstrumentation("test");
+  CallbacksHandle.ignoreNonMockPassInstrumentation("module");
+
+  ::testing::Sequence PISequence;
+  EXPECT_CALL(
+      CallbacksHandle,
+      runBeforePass(HasNameRegex("FreeMachineFunctionPass"), HasName("test")))
+      .InSequence(PISequence)
+      .WillOnce(Return(true));
+  EXPECT_CALL(CallbacksHandle,
+              runBeforeNonSkippedPass(HasNameRegex("FreeMachineFunctionPass"),
+                                      HasName("test")))
+      .InSequence(PISequence);
+  EXPECT_CALL(CallbacksHandle, runAfterPassInvalidated(
+                                   HasNameRegex("FreeMachineFunctionPass"), _))
+      .InSequence(PISequence);
+
+  // runAfterPass should not be called since the MachineFunction is no longer
+  // valid after FreeMachineFunctionPass.
+  EXPECT_CALL(CallbacksHandle,
+              runAfterPass(HasNameRegex("FreeMachineFunctionPass"), _, _))
+      .Times(0);
+
+  MPM.addPass(
+      createModuleToMachineFunctionPassAdaptor(FreeMachineFunctionPass()));
+  MPM.run(*M, MAM);
+}
+
+// Check that the Module -> MachineFunction adaptor and MachineFunction pass
+// manager properly call runAfterPassInvalidated.
+TEST_F(MachineFunctionCallbacksTest, InstrumentedFreeMFPass2) {
+  CallbacksHandle.registerPassInstrumentation(PIC);
+  // Non-mock instrumentation run here can safely be ignored.
+  CallbacksHandle.ignoreNonMockPassInstrumentation("test");
+  CallbacksHandle.ignoreNonMockPassInstrumentation("module");
+
+  ::testing::Sequence PISequence;
+  EXPECT_CALL(
+      CallbacksHandle,
+      runBeforePass(HasNameRegex("FreeMachineFunctionPass"), HasName("test")))
+      .InSequence(PISequence)
+      .WillOnce(Return(true));
+  EXPECT_CALL(CallbacksHandle,
+              runBeforeNonSkippedPass(HasNameRegex("FreeMachineFunctionPass"),
+                                      HasName("test")))
+      .InSequence(PISequence);
+  EXPECT_CALL(CallbacksHandle, runAfterPassInvalidated(
+                                   HasNameRegex("FreeMachineFunctionPass"), _))
+      .InSequence(PISequence);
+  EXPECT_CALL(CallbacksHandle,
+              runAfterPassInvalidated(HasNameRegex("PassManager"), _))
+      .InSequence(PISequence);
+
+  // runAfterPass should not be called since the MachineFunction is no longer
+  // valid after FreeMachineFunctionPass.
+  EXPECT_CALL(CallbacksHandle,
+              runAfterPass(HasNameRegex("FreeMachineFunctionPass"), _, _))
+      .Times(0);
+  EXPECT_CALL(CallbacksHandle, runAfterPass(HasNameRegex("PassManager"), _, _))
+      .Times(0);
+
+  MachineFunctionPassManager MFPM;
+  MFPM.addPass(FreeMachineFunctionPass());
+  MPM.addPass(createModuleToMachineFunctionPassAdaptor(std::move(MFPM)));
+  MPM.run(*M, MAM);
 }
 
 } // end anonymous namespace
diff --git a/llvm/utils/TableGen/Attributes.cpp b/llvm/utils/TableGen/Attributes.cpp
index 474042a..db3c4de 100644
--- a/llvm/utils/TableGen/Attributes.cpp
+++ b/llvm/utils/TableGen/Attributes.cpp
@@ -87,7 +87,11 @@ void Attributes::emitFnAttrCompatCheck(raw_ostream &OS, bool IsStringAttr) {
 
   for (auto *Rule : CompatRules) {
     StringRef FuncName = Rule->getValueAsString("CompatFunc");
-    OS << "  Ret &= " << FuncName << "(Caller, Callee);\n";
+    OS << "  Ret &= " << FuncName << "(Caller, Callee";
+    StringRef AttrName = Rule->getValueAsString("AttrName");
+    if (!AttrName.empty())
+      OS << ", \"" << AttrName << "\"";
+    OS << ");\n";
   }
 
   OS << "\n";
diff --git a/llvm/utils/TableGen/DXILEmitter.cpp b/llvm/utils/TableGen/DXILEmitter.cpp
index 768e805..d47df59 100644
--- a/llvm/utils/TableGen/DXILEmitter.cpp
+++ b/llvm/utils/TableGen/DXILEmitter.cpp
@@ -74,44 +74,32 @@ struct DXILOperationDesc {
 };
 } // end anonymous namespace
 
-// Convert DXIL type name string to dxil::ParameterKind
-//
-// @param typeNameStr Type name string
-// @return ParameterKind as defined in llvm/Support/DXILABI.h
-static ParameterKind getDXILTypeNameToKind(StringRef typeNameStr) {
-  return StringSwitch<ParameterKind>(typeNameStr)
-      .Case("voidTy", ParameterKind::VOID)
-      .Case("f16Ty", ParameterKind::HALF)
-      .Case("f32Ty", ParameterKind::FLOAT)
-      .Case("f64Ty", ParameterKind::DOUBLE)
-      .Case("i1Ty", ParameterKind::I1)
-      .Case("i8Ty", ParameterKind::I8)
-      .Case("i16Ty", ParameterKind::I16)
-      .Case("i32Ty", ParameterKind::I32)
-      .Case("i64Ty", ParameterKind::I64)
-      .Case("overloadTy", ParameterKind::OVERLOAD)
-      .Case("handleTy", ParameterKind::DXIL_HANDLE)
-      .Case("cbufferRetTy", ParameterKind::CBUFFER_RET)
-      .Case("resourceRetTy", ParameterKind::RESOURCE_RET)
-      .Default(ParameterKind::INVALID);
-}
-
-static ParameterKind parameterTypeNameToKind(StringRef Name) {
-  return StringSwitch<ParameterKind>(Name)
-      .Case("void", ParameterKind::VOID)
-      .Case("half", ParameterKind::HALF)
-      .Case("float", ParameterKind::FLOAT)
-      .Case("double", ParameterKind::DOUBLE)
-      .Case("i1", ParameterKind::I1)
-      .Case("i8", ParameterKind::I8)
-      .Case("i16", ParameterKind::I16)
-      .Case("i32", ParameterKind::I32)
-      .Case("i64", ParameterKind::I64)
-      .Case("$o", ParameterKind::OVERLOAD)
-      .Case("dx.types.Handle", ParameterKind::DXIL_HANDLE)
-      .Case("dx.types.CBufRet", ParameterKind::CBUFFER_RET)
-      .Case("dx.types.ResRet", ParameterKind::RESOURCE_RET)
-      .Default(ParameterKind::INVALID);
+/*!
+ Convert DXIL type name string to dxil::ParameterKind
+
+ @param typeNameStr Type name string
+ @return ParameterKind As defined in llvm/Support/DXILABI.h
+*/
+static ParameterKind lookupParameterKind(StringRef typeNameStr) {
+  auto paramKind = StringSwitch<ParameterKind>(typeNameStr)
+                       .Case("llvm_void_ty", ParameterKind::VOID)
+                       .Case("llvm_half_ty", ParameterKind::HALF)
+                       .Case("llvm_float_ty", ParameterKind::FLOAT)
+                       .Case("llvm_double_ty", ParameterKind::DOUBLE)
+                       .Case("llvm_i1_ty", ParameterKind::I1)
+                       .Case("llvm_i8_ty", ParameterKind::I8)
+                       .Case("llvm_i16_ty", ParameterKind::I16)
+                       .Case("llvm_i32_ty", ParameterKind::I32)
+                       .Case("llvm_i64_ty", ParameterKind::I64)
+                       .Case("llvm_anyfloat_ty", ParameterKind::OVERLOAD)
+                       .Case("llvm_anyint_ty", ParameterKind::OVERLOAD)
+                       .Case("dxil_handle_ty", ParameterKind::DXIL_HANDLE)
+                       .Case("dxil_cbuffer_ty", ParameterKind::CBUFFER_RET)
+                       .Case("dxil_resource_ty", ParameterKind::RESOURCE_RET)
+                       .Default(ParameterKind::INVALID);
+  assert(paramKind != ParameterKind::INVALID &&
+         "Unsupported DXIL Type specified");
+  return paramKind;
 }
 
 DXILOperationDesc::DXILOperationDesc(const Record *R) {
@@ -143,7 +131,7 @@ DXILOperationDesc::DXILOperationDesc(const Record *R) {
 
   for (unsigned I = 0; I < OverloadTypeList->size(); ++I) {
     Record *R = OverloadTypeList->getElementAsRecord(I);
-    OverloadTypes.emplace_back(getDXILTypeNameToKind(R->getNameInitAsString()));
+    OverloadTypes.emplace_back(lookupParameterKind(R->getNameInitAsString()));
   }
   Attr = StringRef(R->getValue("Attribute")->getNameInitAsString());
 }
@@ -151,7 +139,8 @@ DXILOperationDesc::DXILOperationDesc(const Record *R) {
 DXILParameter::DXILParameter(const Record *R) {
   Name = R->getValueAsString("Name");
   Pos = R->getValueAsInt("Pos");
-  Kind = parameterTypeNameToKind(R->getValueAsString("Type"));
+  Kind =
+      lookupParameterKind(R->getValue("ParamType")->getValue()->getAsString());
   if (R->getValue("Doc"))
     Doc = R->getValueAsString("Doc");
   IsConst = R->getValueAsBit("IsConstant");
@@ -296,10 +285,12 @@ static void emitDXILIntrinsicMap(std::vector<DXILOperationDesc> &Ops,
   OS << "\n";
 }
 
-// Convert operation attribute string to Attribute enum
-//
-// @param Attr string reference
-// @return std::string Attribute enum string
+/*!
+ Convert operation attribute string to Attribute enum
+
+ @param Attr string reference
+ @return std::string Attribute enum string
+ */
 static std::string emitDXILOperationAttr(StringRef Attr) {
   return StringSwitch<std::string>(Attr)
       .Case("ReadNone", "Attribute::ReadNone")
diff --git a/llvm/utils/TableGen/DecoderEmitter.cpp b/llvm/utils/TableGen/DecoderEmitter.cpp
index 22a710651..36f437f0 100644
--- a/llvm/utils/TableGen/DecoderEmitter.cpp
+++ b/llvm/utils/TableGen/DecoderEmitter.cpp
@@ -28,6 +28,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/MC/MCDecoderOps.h"
 #include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FormattedStream.h"
@@ -50,6 +51,13 @@ using namespace llvm;
 
 #define DEBUG_TYPE "decoder-emitter"
 
+extern cl::OptionCategory DisassemblerEmitterCat;
+
+cl::opt<bool> DecoderEmitterSuppressDuplicates(
+    "suppress-per-hwmode-duplicates",
+    cl::desc("Suppress duplication of instrs into per-HwMode decoder tables"),
+    cl::init(false), cl::cat(DisassemblerEmitterCat));
+
 namespace {
 
 STATISTIC(NumEncodings, "Number of encodings considered");
@@ -2496,10 +2504,15 @@ void DecoderEmitter::run(raw_ostream &o) {
       }
     }
     // This instruction is encoded the same on all HwModes. Emit it for all
-    // HwModes.
-    for (StringRef HwModeName : HwModeNames)
+    // HwModes by default, otherwise leave it in a single common table.
+    if (DecoderEmitterSuppressDuplicates) {
       NumberedEncodings.emplace_back(NumberedInstruction->TheDef,
-                                     NumberedInstruction, HwModeName);
+                                     NumberedInstruction, "AllModes");
+    } else {
+      for (StringRef HwModeName : HwModeNames)
+        NumberedEncodings.emplace_back(NumberedInstruction->TheDef,
+                                       NumberedInstruction, HwModeName);
+    }
   }
   for (const auto &NumberedAlias :
        RK.getAllDerivedDefinitions("AdditionalEncoding"))
diff --git a/llvm/utils/TableGen/DisassemblerEmitter.cpp b/llvm/utils/TableGen/DisassemblerEmitter.cpp
index ae6a8ef..2d653af 100644
--- a/llvm/utils/TableGen/DisassemblerEmitter.cpp
+++ b/llvm/utils/TableGen/DisassemblerEmitter.cpp
@@ -131,5 +131,7 @@ static void EmitDisassembler(RecordKeeper &Records, raw_ostream &OS) {
   EmitDecoder(Records, OS, PredicateNamespace);
 }
 
+cl::OptionCategory DisassemblerEmitterCat("Options for -gen-disassembler");
+
 static TableGen::Emitter::Opt X("gen-disassembler", EmitDisassembler,
                                 "Generate disassembler");
diff --git a/llvm/utils/gn/secondary/lldb/tools/lldb-dap/BUILD.gn b/llvm/utils/gn/secondary/lldb/tools/lldb-dap/BUILD.gn
index 98c2068..8cb60fd 100644
--- a/llvm/utils/gn/secondary/lldb/tools/lldb-dap/BUILD.gn
+++ b/llvm/utils/gn/secondary/lldb/tools/lldb-dap/BUILD.gn
@@ -51,6 +51,7 @@ executable("lldb-dap") {
     "ProgressEvent.cpp",
     "RunInTerminal.cpp",
     "SourceBreakpoint.cpp",
+    "Watchpoint.cpp",
     "lldb-dap.cpp",
   ]
 }
diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/Hexagon/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/Hexagon/BUILD.gn
index c3cafe5..747ca8f 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Target/Hexagon/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Target/Hexagon/BUILD.gn
@@ -57,6 +57,7 @@ static_library("LLVMHexagonCodeGen") {
     "HexagonFrameLowering.cpp",
     "HexagonGenExtract.cpp",
     "HexagonGenInsert.cpp",
+    "HexagonGenMemAbsolute.cpp",
     "HexagonGenMux.cpp",
     "HexagonGenPredicate.cpp",
     "HexagonHardwareLoops.cpp",
@@ -74,6 +75,7 @@ static_library("LLVMHexagonCodeGen") {
     "HexagonOptAddrMode.cpp",
     "HexagonOptimizeSZextends.cpp",
     "HexagonPeephole.cpp",
+    "HexagonPostIncOpt.cpp",
     "HexagonRDFOpt.cpp",
     "HexagonRegisterInfo.cpp",
     "HexagonSelectionDAGInfo.cpp",
@@ -84,6 +86,7 @@ static_library("LLVMHexagonCodeGen") {
     "HexagonTargetMachine.cpp",
     "HexagonTargetObjectFile.cpp",
     "HexagonTargetTransformInfo.cpp",
+    "HexagonTfrCleanup.cpp",
     "HexagonVExtract.cpp",
     "HexagonVLIWPacketizer.cpp",
     "HexagonVectorCombine.cpp",
diff --git a/llvm/utils/gn/secondary/llvm/unittests/CodeGen/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/CodeGen/BUILD.gn
index 8aff3e4..df71d08 100644
--- a/llvm/utils/gn/secondary/llvm/unittests/CodeGen/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/unittests/CodeGen/BUILD.gn
@@ -40,6 +40,7 @@ unittest("CodeGenTests") {
     "ScalableVectorMVTsTest.cpp",
     "SchedBoundary.cpp",
     "SelectionDAGAddressAnalysisTest.cpp",
+    "SelectionDAGPatternMatchTest.cpp",
     "TargetOptionsTest.cpp",
     "TestAsmPrinter.cpp",
     "TypeTraitsTest.cpp",
diff --git a/llvm/utils/release/build_llvm_release.bat b/llvm/utils/release/build_llvm_release.bat
index 67bb22d..dc63fda 100755
--- a/llvm/utils/release/build_llvm_release.bat
+++ b/llvm/utils/release/build_llvm_release.bat
@@ -287,7 +287,16 @@ ninja check-sanitizer || ninja check-sanitizer || ninja check-sanitizer || exit
 ninja check-clang-tools || ninja check-clang-tools || ninja check-clang-tools || exit /b 1
 ninja check-clangd || ninja check-clangd || ninja check-clangd || exit /b 1
 ninja package || exit /b 1
+
+:: generate tarball with install toolchain only off
+set filename=clang+llvm-%version%-x86_64-pc-windows-msvc
+cmake -GNinja %cmake_flags% %cmake_profile_flags% -DLLVM_INSTALL_TOOLCHAIN_ONLY=OFF ^
+  -DCMAKE_INSTALL_PREFIX=%build_dir%/%filename% ..\llvm-project\llvm || exit /b 1
+ninja install || exit /b 1
+:: check llvm_config is present & returns something
+%build_dir%/%filename%/bin/llvm-config.exe --bindir || exit /b 1
 cd ..
+7z a -ttar -so %filename%.tar %filename% | 7z a -txz -si %filename%.tar.xz
 
 exit /b 0
 ::==============================================================================
author	Florian Mayer <fmayer@google.com>	2024-02-23 11:31:14 -0800
committer	Florian Mayer <fmayer@google.com>	2024-02-23 11:31:14 -0800
commit	886b4bc97b0ed5a5e041a0117a584182fc7989c1 (patch)
tree	43cdc0e15e12c298c09251dda38e834e7e778049 /llvm
parent	af8afe08ee20a04b2ccb363cac66aa02cfaecd02 (diff)
parent	8d536f83545f071948888983e2db25ce23a8302d (diff)
download	llvm-886b4bc97b0ed5a5e041a0117a584182fc7989c1.zip llvm-886b4bc97b0ed5a5e041a0117a584182fc7989c1.tar.gz llvm-886b4bc97b0ed5a5e041a0117a584182fc7989c1.tar.bz2